#ifndef VC_VECTOR_H_ #define VC_VECTOR_H_ #ifndef VC_SCALAR_VECTOR_H_ #define VC_SCALAR_VECTOR_H_ #include #include #include #ifdef _MSC_VER #include #endif #ifndef VC_COMMON_TYPES_H_ #define VC_COMMON_TYPES_H_ #ifdef Vc_CHECK_ALIGNMENT #include #include #endif #include #ifndef VC_GLOBAL_H_ #define VC_GLOBAL_H_ #include #ifndef VC_FWDDECL_H_ #define VC_FWDDECL_H_ #include #define Vc_VERSIONED_NAMESPACE Vc_1 namespace Vc_VERSIONED_NAMESPACE { namespace VectorAbi { struct Scalar {}; struct Sse {}; struct Avx {}; struct Mic {}; template struct DeduceCompatible; template struct DeduceBest; } namespace Common { template struct select_best_vector_type; } template class Mask; template class Vector; template ::type, std::size_t Wt = V::Size> class SimdArray; template ::type, std::size_t Wt = V::Size> class SimdMaskArray; namespace simd_abi { using scalar = VectorAbi::Scalar; template struct fixed_size; template using compatible = typename VectorAbi::DeduceCompatible::type; template using native = typename VectorAbi::DeduceBest::type; using __sse = VectorAbi::Sse; using __avx = VectorAbi::Avx; struct __avx512; struct __neon; } template > using simd = Vector; template > using simd_mask = Mask; template using native_simd = simd>; template using native_simd_mask = simd_mask>; template using fixed_size_simd = simd>; template using fixed_size_simd_mask = simd_mask>; } #ifndef DOXYGEN namespace Vc = Vc_VERSIONED_NAMESPACE; #endif #endif #ifdef DOXYGEN #define Vc_ICC __INTEL_COMPILER_BUILD_DATE #undef Vc_ICC #define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__) #undef Vc_CLANG #define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__) #undef Vc_APPLECLANG #define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__) #define Vc_MSVC _MSC_FULL_VER #undef Vc_MSVC #else #ifdef __INTEL_COMPILER #define Vc_ICC __INTEL_COMPILER_BUILD_DATE #elif defined(__clang__) && defined(__apple_build_version__) #define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__) #elif defined(__clang__) #define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__) #elif defined(__GNUC__) #define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__) #elif defined(_MSC_VER) #define Vc_MSVC _MSC_FULL_VER #else #define Vc_UNSUPPORTED_COMPILER 1 #endif #if defined Vc_GCC && Vc_GCC >= 0x60000 #define Vc_RESET_DIAGNOSTICS _Pragma("GCC diagnostic pop") #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-attributes" #else #define Vc_RESET_DIAGNOSTICS #endif #if defined Vc_ICC #pragma warning disable 2922 #endif #if __cplusplus < 201103 && (!defined Vc_MSVC || _MSC_VER < 1900) # error "Vc requires support for C++11." 
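/* The forward declarations above introduce the public alias templates (simd,
 * simd_mask, native_simd, fixed_size_simd) on top of Vc::Vector and Vc::Mask.
 * A minimal usage sketch, assuming the usual <Vc/Vc> umbrella include; the
 * function name is illustrative only, and it uses only constructors that are
 * declared further below (broadcast and generate):
 * \code
 * #include <Vc/Vc>
 *
 * void example()
 * {
 *     // widest vector type available for the instruction set this TU targets
 *     Vc::native_simd<float> a = 1.f;                // broadcast constructor
 *     // element count fixed independently of the target ISA
 *     Vc::fixed_size_simd<float, 8> b = 2.f;
 *     // the class template the aliases refer to, with its default ABI
 *     auto c = Vc::Vector<float>::generate([](int i) { return float(i); });
 *     (void)a; (void)b; (void)c;
 * }
 * \endcode
 */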
#elif __cplusplus >= 201402L
#define Vc_CXX14 1
# if __cplusplus > 201700L
#define Vc_CXX17 1
# endif
#endif
#if defined(__GNUC__) && !defined(Vc_NO_INLINE_ASM)
#define Vc_GNU_ASM 1
#endif
#ifdef Vc_GCC
# if Vc_GCC >= 0x70000 && defined __i386__
# ifdef __GLIBC_PREREQ
# if __GLIBC_PREREQ(2,26)
#define Vc_HAVE_STD_MAX_ALIGN_T 1
# endif
# endif
# elif Vc_GCC >= 0x40900
#define Vc_HAVE_STD_MAX_ALIGN_T 1
# else
#define Vc_HAVE_MAX_ALIGN_T 1
# endif
#elif !defined(Vc_CLANG) && !defined(Vc_ICC)
#define Vc_HAVE_STD_MAX_ALIGN_T 1
#endif
#if defined(Vc_GCC) || defined(Vc_CLANG) || defined Vc_APPLECLANG
#define Vc_USE_BUILTIN_VECTOR_TYPES 1
#endif
#ifdef Vc_MSVC
#define Vc_CDECL __cdecl
#define Vc_VDECL __vectorcall
#else
#define Vc_CDECL
#define Vc_VDECL
#endif
#define Scalar 0x00100000
#define SSE 0x00200000
#define SSE2 0x00300000
#define SSE3 0x00400000
#define SSSE3 0x00500000
#define SSE4_1 0x00600000
#define SSE4_2 0x00700000
#define AVX 0x00800000
#define AVX2 0x00900000
#define XOP 0x00000001
#define FMA4 0x00000002
#define F16C 0x00000004
#define POPCNT 0x00000008
#define SSE4a 0x00000010
#define FMA 0x00000020
#define BMI2 0x00000040
#define IMPL_MASK 0xFFF00000
#define EXT_MASK 0x000FFFFF
#ifdef Vc_MSVC
# ifdef _M_IX86_FP
# if _M_IX86_FP >= 1
# ifndef __SSE__
#define __SSE__ 1
# endif
# endif
# if _M_IX86_FP >= 2
# ifndef __SSE2__
#define __SSE2__ 1
# endif
# endif
# elif defined(_M_AMD64)
# ifndef __SSE__
#define __SSE__ 1
# endif
# ifndef __SSE2__
#define __SSE2__ 1
# endif
# endif
#endif
#if defined Vc_ICC && !defined __POPCNT__
# if defined __SSE4_2__ || defined __SSE4A__
#define __POPCNT__ 1
# endif
#endif
#ifdef VC_IMPL
#error "You are using the old VC_IMPL macro. Since Vc 1.0 all Vc macros start with Vc_, i.e. a lower-case 'c'"
#endif
#ifndef Vc_IMPL
# if defined(__AVX2__)
#define Vc_IMPL_AVX2 1
#define Vc_IMPL_AVX 1
# elif defined(__AVX__)
#define Vc_IMPL_AVX 1
# else
# if defined(__SSE4_2__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE4_2 1
# endif
# if defined(__SSE4_1__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE4_1 1
# endif
# if defined(__SSE3__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE3 1
# endif
# if defined(__SSSE3__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSSE3 1
# endif
# if defined(__SSE2__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE2 1
# endif
# if defined(Vc_IMPL_SSE)
# else
#define Vc_IMPL_Scalar 1
# endif
# endif
# if !defined(Vc_IMPL_Scalar)
# ifdef __FMA4__
#define Vc_IMPL_FMA4 1
# endif
# ifdef __XOP__
#define Vc_IMPL_XOP 1
# endif
# ifdef __F16C__
#define Vc_IMPL_F16C 1
# endif
# ifdef __POPCNT__
#define Vc_IMPL_POPCNT 1
# endif
# ifdef __SSE4A__
#define Vc_IMPL_SSE4a 1
# endif
# ifdef __FMA__
#define Vc_IMPL_FMA 1
# endif
# ifdef __BMI2__
#define Vc_IMPL_BMI2 1
# endif
# endif
#else
# if (Vc_IMPL & IMPL_MASK) == AVX2
#define Vc_IMPL_AVX2 1
#define Vc_IMPL_AVX 1
# elif (Vc_IMPL & IMPL_MASK) == AVX
#define Vc_IMPL_AVX 1
# elif (Vc_IMPL & IMPL_MASK) == Scalar
#define Vc_IMPL_Scalar 1
# elif (Vc_IMPL & IMPL_MASK) == SSE4_2
#define Vc_IMPL_SSE4_2 1
#define Vc_IMPL_SSE4_1 1
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE4_1
#define Vc_IMPL_SSE4_1 1
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSSE3
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE3
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE2
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE
#define Vc_IMPL_SSE 1
# if defined(__SSE4_2__)
#define Vc_IMPL_SSE4_2 1
# endif
# if defined(__SSE4_1__)
#define Vc_IMPL_SSE4_1 1
# endif
# if defined(__SSE3__)
#define Vc_IMPL_SSE3 1
# endif
# if defined(__SSSE3__)
#define Vc_IMPL_SSSE3 1
# endif
# if defined(__SSE2__)
#define Vc_IMPL_SSE2 1
# endif
# elif (Vc_IMPL & IMPL_MASK) == 0 && (Vc_IMPL & SSE4a)
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# endif
# if (Vc_IMPL & XOP)
#define Vc_IMPL_XOP 1
# endif
# if (Vc_IMPL & FMA4)
#define Vc_IMPL_FMA4 1
# endif
# if (Vc_IMPL & F16C)
#define Vc_IMPL_F16C 1
# endif
# if (!defined(Vc_IMPL_Scalar) && defined(__POPCNT__)) || (Vc_IMPL & POPCNT)
#define Vc_IMPL_POPCNT 1
# endif
# if (Vc_IMPL & SSE4a)
#define Vc_IMPL_SSE4a 1
# endif
# if (Vc_IMPL & FMA)
#define Vc_IMPL_FMA 1
# endif
# if (Vc_IMPL & BMI2)
#define Vc_IMPL_BMI2 1
# endif
#undef Vc_IMPL
#endif
#ifdef __AVX__
#define Vc_USE_VEX_CODING 1
#endif
#ifdef Vc_IMPL_AVX
#define Vc_IMPL_SSE4_2 1
#define Vc_IMPL_SSE4_1 1
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
#endif
#if defined(Vc_CLANG) && Vc_CLANG >= 0x30600 && Vc_CLANG < 0x30700
# if defined(Vc_IMPL_AVX)
# warning "clang 3.6.x miscompiles AVX code, frequently losing 50% of the data. Vc will fall back to SSE4 instead."
#undef Vc_IMPL_AVX
# if defined(Vc_IMPL_AVX2)
#undef Vc_IMPL_AVX2
# endif
# endif
#endif
# if !defined(Vc_IMPL_Scalar) && !defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_AVX)
# error "No suitable Vc implementation was selected! Probably Vc_IMPL was set to an invalid value."
# elif defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_SSE2)
# error "SSE requested but no SSE2 support. Vc needs at least SSE2!"
# endif
#undef Scalar
#undef SSE
#undef SSE2
#undef SSE3
#undef SSSE3
#undef SSE4_1
#undef SSE4_2
#undef AVX
#undef AVX2
#undef XOP
#undef FMA4
#undef F16C
#undef POPCNT
#undef SSE4a
#undef FMA
#undef BMI2
#undef IMPL_MASK
#undef EXT_MASK
#if defined Vc_IMPL_AVX2
#define Vc_DEFAULT_IMPL_AVX2
#elif defined Vc_IMPL_AVX
#define Vc_DEFAULT_IMPL_AVX
#elif defined Vc_IMPL_SSE
#define Vc_DEFAULT_IMPL_SSE
#elif defined Vc_IMPL_Scalar
#define Vc_DEFAULT_IMPL_Scalar
#else
#error "Preprocessor logic broken. Please report a bug."
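/* At this point exactly one Vc_DEFAULT_IMPL_* macro is defined and the
 * Vc_IMPL_* macros describe the instruction set Vc will target, either deduced
 * from the compiler's predefined macros or forced via -D Vc_IMPL=... as handled
 * above. A sketch of how application code can branch on the outcome; the
 * function name is illustrative only:
 * \code
 * const char *vc_selected_impl()
 * {
 * #if defined(Vc_IMPL_AVX2)
 *     return "AVX2";
 * #elif defined(Vc_IMPL_AVX)
 *     return "AVX";
 * #elif defined(Vc_IMPL_SSE)
 *     return "SSE2 or better";
 * #else
 *     return "scalar";
 * #endif
 * }
 * \endcode
 */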
#endif #endif namespace Vc_VERSIONED_NAMESPACE { typedef signed char int8_t; typedef unsigned char uint8_t; typedef signed short int16_t; typedef unsigned short uint16_t; typedef signed int int32_t; typedef unsigned int uint32_t; typedef signed long long int64_t; typedef unsigned long long uint64_t; enum MallocAlignment { AlignOnVector, AlignOnCacheline, AlignOnPage }; enum Implementation : std::uint_least32_t { ScalarImpl, SSE2Impl, SSE3Impl, SSSE3Impl, SSE41Impl, SSE42Impl, AVXImpl, AVX2Impl, MICImpl, ImplementationMask = 0xfff }; enum ExtraInstructions : std::uint_least32_t { Float16cInstructions = 0x01000, Fma4Instructions = 0x02000, XopInstructions = 0x04000, PopcntInstructions = 0x08000, Sse4aInstructions = 0x10000, FmaInstructions = 0x20000, VexInstructions = 0x40000, Bmi2Instructions = 0x80000, ExtraInstructionsMask = 0xfffff000u }; template struct ImplementationT { static constexpr Implementation current() { return static_cast(Features & ImplementationMask); } static constexpr bool is(Implementation impl) { return static_cast(impl) == current(); } static constexpr bool is_between(Implementation low, Implementation high) { return static_cast(low) <= current() && static_cast(high) >= current(); } static constexpr bool runs_on(unsigned int extraInstructions) { return (extraInstructions & Features & ExtraInstructionsMask) == (Features & ExtraInstructionsMask); } }; using CurrentImplementation = ImplementationT< #ifdef Vc_IMPL_Scalar ScalarImpl #elif defined(Vc_IMPL_AVX2) AVX2Impl #elif defined(Vc_IMPL_AVX) AVXImpl #elif defined(Vc_IMPL_SSE4_2) SSE42Impl #elif defined(Vc_IMPL_SSE4_1) SSE41Impl #elif defined(Vc_IMPL_SSSE3) SSSE3Impl #elif defined(Vc_IMPL_SSE3) SSE3Impl #elif defined(Vc_IMPL_SSE2) SSE2Impl #endif #ifdef Vc_IMPL_SSE4a + Vc::Sse4aInstructions #ifdef Vc_IMPL_XOP + Vc::XopInstructions #ifdef Vc_IMPL_FMA4 + Vc::Fma4Instructions #endif #endif #endif #ifdef Vc_IMPL_POPCNT + Vc::PopcntInstructions #endif #ifdef Vc_IMPL_FMA + Vc::FmaInstructions #endif #ifdef Vc_IMPL_BMI2 + Vc::Bmi2Instructions #endif #ifdef Vc_USE_VEX_CODING + Vc::VexInstructions #endif >; } #ifndef VC_VERSION_H_ #define VC_VERSION_H_ #define Vc_VERSION_STRING "1.4.2-dev" #define Vc_VERSION_NUMBER 0x010405 #define Vc_VERSION_CHECK(major,minor,patch) ((major << 16) | (minor << 8) | (patch << 1)) #define Vc_LIBRARY_ABI_VERSION 5 #define Vc_IS_VERSION_2 (Vc_VERSION_NUMBER >= Vc_VERSION_CHECK(1, 70, 0)) #define Vc_IS_VERSION_1 (Vc_VERSION_NUMBER < Vc_VERSION_CHECK(1, 70, 0)) namespace Vc_VERSIONED_NAMESPACE { inline const char *versionString() { return Vc_VERSION_STRING; } constexpr unsigned int versionNumber() { return Vc_VERSION_NUMBER; } } #endif #endif #ifndef VC_TRAITS_TYPE_TRAITS_H_ #define VC_TRAITS_TYPE_TRAITS_H_ #include #ifndef VC_TRAITS_DECAY_H_ #define VC_TRAITS_DECAY_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Traits { template using decay = typename std::decay::type; } } #endif #ifndef VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_ #define VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace Traits { template struct has_no_allocated_data_impl : public std::false_type {}; template struct has_no_allocated_data : public has_no_allocated_data_impl< typename std::remove_cv::type>::type> { }; template struct has_no_allocated_data_impl> : public std::true_type {}; template struct has_no_allocated_data_impl : public std::true_type {}; template struct has_no_allocated_data_impl : public std::true_type {}; } } #endif #ifndef VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_ #define 
VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_ #include #include #ifdef _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_BEGIN_NAMESPACE_STD #else namespace std { #endif #ifdef _WIN32 template class array; #else template struct array; #endif template class vector; #ifdef _LIBCPP_END_NAMESPACE_STD _LIBCPP_END_NAMESPACE_STD #else } #endif namespace Vc_VERSIONED_NAMESPACE { namespace Traits { namespace has_contiguous_storage_detail { template std::is_base_of::iterator_category> test(int); template std::is_base_of::iterator_category> test(long); template std::false_type test(...); } template struct has_contiguous_storage_impl : public decltype(has_contiguous_storage_detail::test(int())) { }; template struct has_contiguous_storage : public has_contiguous_storage_impl< typename std::remove_cv::type>::type> { }; template struct has_contiguous_storage_impl : public std::true_type {}; template struct has_contiguous_storage_impl : public std::true_type {}; template struct has_contiguous_storage_impl> : public std::true_type {}; template struct has_contiguous_storage_impl> : public std::true_type {}; template struct has_contiguous_storage_impl : public std::true_type {}; template struct has_contiguous_storage_impl> : public std::true_type {}; template struct has_contiguous_storage_impl> : public std::true_type {}; } } #endif #ifndef VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_ #define VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Traits { namespace is_functor_argument_immutable_impl { template std::true_type test(void (F::*)(A)); template std::true_type test(void (F::*)(A) const); template std::is_const test(void (F::*)(A &)); template std::is_const test(void (F::*)(A &) const); template std::is_const test(void (F::*)(A &&)); template std::is_const test(void (F::*)(A &&) const); struct dummy {}; template < typename F, typename A, #ifdef Vc_MSVC #define Vc_TEMPLATE_ #else #define Vc_TEMPLATE_ template #endif typename MemberPtr = decltype(&F::Vc_TEMPLATE_ operator())> decltype(is_functor_argument_immutable_impl::test(std::declval())) test2(int); #undef Vc_TEMPLATE_ template decltype( is_functor_argument_immutable_impl::test(std::declval())) test2(float); template std::true_type test3(void(*)(A)); template std::is_const test3(void(*)(A &)); template std::is_const test3(void(*)(A &&)); } template ::value> struct is_functor_argument_immutable; template struct is_functor_argument_immutable : decltype(is_functor_argument_immutable_impl::test2< typename std::remove_reference::type, A>(int())) { }; template struct is_functor_argument_immutable : decltype(is_functor_argument_immutable_impl::test3(std::declval())) { }; } } #endif #ifndef VC_TRAITS_IS_OUTPUT_ITERATOR_H_ #define VC_TRAITS_IS_OUTPUT_ITERATOR_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace Traits { namespace is_output_iterator_impl { template ::value_type, typename = decltype(*std::declval() = std::declval< ValueType>()) > std::true_type test(int); template std::false_type test(...); } template struct is_output_iterator : public std::conditional< std::is_void::value_type>::value, std::true_type, decltype(is_output_iterator_impl::test(int()))>::type { }; static_assert(!std::is_void::value_type>::value, ""); static_assert(is_output_iterator::value, ""); static_assert(!is_output_iterator::value, ""); } } #endif #ifndef VC_IS_INDEX_SEQUENCE_H_ #define VC_IS_INDEX_SEQUENCE_H_ #ifndef VC_COMMON_INDEXSEQUENCE_H_ #define VC_COMMON_INDEXSEQUENCE_H_ namespace Vc_VERSIONED_NAMESPACE { template struct index_sequence { static constexpr 
std::size_t size() noexcept { return sizeof...(I); } }; template struct make_index_sequence_impl { template static index_sequence join(std::false_type, index_sequence); template static index_sequence join( std::true_type, index_sequence); using is_odd = std::integral_constant; using half = typename make_index_sequence_impl::type; using type = decltype(join<(N + 1) / 2>(is_odd(), half())); }; template <> struct make_index_sequence_impl<0> { using type = index_sequence<>; }; template <> struct make_index_sequence_impl<1> { using type = index_sequence<0>; }; template <> struct make_index_sequence_impl<2> { using type = index_sequence<0, 1>; }; template using make_index_sequence = typename make_index_sequence_impl::type; } #endif namespace Vc_VERSIONED_NAMESPACE { namespace Traits { template struct is_index_sequence : public std::false_type {}; template struct is_index_sequence> : public std::true_type {}; static_assert(!is_index_sequence::value, ""); static_assert(is_index_sequence>::value, ""); } } #endif #ifndef VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_ #define VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Traits { template ::value> struct is_implicit_cast_allowed : public std::integral_constant< bool, std::is_same::value || (std::is_integral::value && (std::is_same::type, To>::value || std::is_same::type, To>::value))> { }; template struct is_implicit_cast_allowed : public std::is_same::type { }; template struct is_implicit_cast_allowed_mask : public is_implicit_cast_allowed { }; } } #endif namespace Vc_VERSIONED_NAMESPACE { struct enable_if_default_type { constexpr enable_if_default_type() {} }; static constexpr enable_if_default_type nullarg; template using enable_if = typename std::enable_if::type; template using conditional_t = typename std::conditional::type; template using remove_cvref_t = typename std::remove_cv::type>::type; namespace Traits { #ifndef VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_ #define VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_ namespace has_subscript_operator_impl { template ()[std::declval()])> std::true_type test(int); template std::false_type test(float); } template struct has_subscript_operator : public decltype(has_subscript_operator_impl::test(1)) { }; #endif #ifndef VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_ #define VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_ namespace has_multiply_operator_impl { template () * std::declval())> std::true_type test(int); template std::false_type test(...); } template struct has_multiply_operator : public decltype(has_multiply_operator_impl::test(1)) { }; #endif #ifndef VC_TRAITS_HAS_ADDITION_OPERATOR_H_ #define VC_TRAITS_HAS_ADDITION_OPERATOR_H_ namespace has_addition_operator_impl { template () + std::declval())> std::true_type test(int); template std::false_type test(...); } template struct has_addition_operator : public decltype(has_addition_operator_impl::test(1)) { }; #endif #ifndef VC_TRAITS_HAS_EQUALITY_OPERATOR_H_ #define VC_TRAITS_HAS_EQUALITY_OPERATOR_H_ namespace has_equality_operator_impl { template () == std::declval())>::value>> std::true_type test(int); template std::false_type test(...); } template struct has_equality_operator : public decltype(has_equality_operator_impl::test(1)) { }; #endif template struct is_valid_vector_argument : public std::false_type {}; template <> struct is_valid_vector_argument : public std::true_type {}; template <> struct is_valid_vector_argument : public std::true_type {}; template <> struct is_valid_vector_argument : public std::true_type {}; template <> struct 
is_valid_vector_argument : public std::true_type {}; template <> struct is_valid_vector_argument : public std::true_type {}; template <> struct is_valid_vector_argument : public std::true_type {}; template struct is_simd_mask_internal : public std::false_type {}; template struct is_simd_vector_internal : public std::false_type {}; template struct is_simdarray_internal : public std::false_type {}; template struct is_simd_mask_array_internal : public std::false_type {}; template struct is_loadstoreflag_internal : public std::false_type {}; template ::value> struct is_integral_internal; template ::value> struct is_floating_point_internal; template ::value> struct is_signed_internal; template ::value> struct is_unsigned_internal; template struct is_integral_internal : public std::is_integral {}; template struct is_floating_point_internal : public std::is_floating_point {}; template struct is_signed_internal : public std::is_signed {}; template struct is_unsigned_internal : public std::is_unsigned {}; template struct is_integral_internal : public std::is_integral {}; template struct is_floating_point_internal : public std::is_floating_point {}; template struct is_signed_internal : public std::is_signed {}; template struct is_unsigned_internal : public std::is_unsigned {}; template struct is_arithmetic_internal : public std::integral_constant< bool, (is_floating_point_internal::value || is_integral_internal::value)> { }; template struct vector_size_internal : std::integral_constant { }; template struct vector_size_internal 0))> : std::integral_constant { }; template struct is_simd_mask : public std::integral_constant>::value || is_simd_mask_array_internal>::value)> { }; template struct is_simd_vector : public std::integral_constant>::value || is_simdarray_internal>::value)> { }; template struct isSimdArray : public is_simdarray_internal> { }; template struct isSimdMaskArray : public is_simd_mask_array_internal> { }; template struct is_load_store_flag : public is_loadstoreflag_internal> {}; template struct is_atomic_simdarray_internal : public std::false_type {}; template using isAtomicSimdArray = is_atomic_simdarray_internal>; template struct is_atomic_simd_mask_array_internal : public std::false_type {}; template using isAtomicSimdMaskArray = is_atomic_simd_mask_array_internal>; template struct simd_vector_size : public vector_size_internal> {}; template struct is_integral : public is_integral_internal> {}; template struct is_floating_point : public is_floating_point_internal> {}; template struct is_arithmetic : public is_arithmetic_internal> {}; template struct is_signed : public is_signed_internal> {}; template struct is_unsigned : public is_unsigned_internal> {}; template struct scalar_type_internal { using type = T; }; template struct scalar_type_internal { using type = typename T::EntryType; }; template using scalar_type = typename scalar_type_internal, is_simd_vector::value>::type; } } #ifndef VC_TRAITS_ENTRY_TYPE_OF_H_ #define VC_TRAITS_ENTRY_TYPE_OF_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Traits { namespace entry_type_of_internal { template ::value> struct entry_type; template struct entry_type { using type = typename decay::EntryType; }; template struct entry_type { using type = typename std::remove_cv::type>::type; }; } template using entry_type_of = typename entry_type_of_internal::entry_type::type; } } #endif #endif #ifndef VC_COMMON_PERMUTATION_H_ #define VC_COMMON_PERMUTATION_H_ #ifndef VC_COMMON_MACROS_H_ #define VC_COMMON_MACROS_H_ #ifdef Vc_MSVC #define 
Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \ typedef __declspec(align(n_)) type_ new_type_ #elif __GNUC__ #define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \ typedef type_ new_type_[[gnu::aligned(n_)]] #else #define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \ using new_type_ alignas(sizeof(n_)) = type_ #endif #ifdef WIN32 #define NOMINMAX 1 #if defined min #undef min #endif #if defined max #undef max #endif #endif #if defined Vc_GCC && Vc_GCC >= 0x60000 #define Vc_TEMPLATES_DROP_ATTRIBUTES 1 #endif #if Vc_IS_VERSION_2 || (defined Vc_GCC && Vc_GCC >= 0x60000) #define Vc_RECURSIVE_MEMORY 1 #endif #if defined Vc_CLANG || defined Vc_APPLECLANG #define Vc_UNREACHABLE __builtin_unreachable #define Vc_NEVER_INLINE [[gnu::noinline]] #define Vc_INTRINSIC_L inline #define Vc_INTRINSIC_R __attribute__((always_inline)) #define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R #define Vc_FLATTEN #define Vc_CONST __attribute__((const)) #define Vc_CONST_L #define Vc_CONST_R Vc_CONST #define Vc_PURE __attribute__((pure)) #define Vc_PURE_L #define Vc_PURE_R Vc_PURE #define Vc_MAY_ALIAS __attribute__((may_alias)) #define Vc_ALWAYS_INLINE_L inline #define Vc_ALWAYS_INLINE_R __attribute__((always_inline)) #define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R #define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0) #define Vc_IS_LIKELY(x) __builtin_expect(x, 1) #define Vc_RESTRICT __restrict__ #define Vc_DEPRECATED(msg) #define Vc_DEPRECATED_ALIAS(msg) #define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) #elif defined(__GNUC__) #define Vc_UNREACHABLE __builtin_unreachable # if defined Vc_GCC && !defined __OPTIMIZE__ #define Vc_MAY_ALIAS # else #define Vc_MAY_ALIAS __attribute__((__may_alias__)) # endif #define Vc_INTRINSIC_R __attribute__((__always_inline__, __artificial__)) #define Vc_INTRINSIC_L inline #define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R #define Vc_FLATTEN __attribute__((__flatten__)) #define Vc_ALWAYS_INLINE_L inline #define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__)) #define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R # ifdef Vc_ICC #define Vc_PURE #define Vc_CONST #define Vc_NEVER_INLINE # else #define Vc_NEVER_INLINE [[gnu::noinline]] #define Vc_PURE __attribute__((__pure__)) #define Vc_CONST __attribute__((__const__)) # endif #define Vc_CONST_L #define Vc_CONST_R Vc_CONST #define Vc_PURE_L #define Vc_PURE_R Vc_PURE #define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0) #define Vc_IS_LIKELY(x) __builtin_expect(x, 1) #define Vc_RESTRICT __restrict__ # ifdef Vc_ICC #define Vc_DEPRECATED(msg) #define Vc_DEPRECATED_ALIAS(msg) # else #define Vc_DEPRECATED(msg) __attribute__((__deprecated__(msg))) #define Vc_DEPRECATED_ALIAS(msg) __attribute__((__deprecated__(msg))) # endif #define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) #else #define Vc_NEVER_INLINE #define Vc_FLATTEN # ifdef Vc_PURE #undef Vc_PURE # endif #define Vc_MAY_ALIAS # ifdef Vc_MSVC #define Vc_ALWAYS_INLINE inline __forceinline #define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE #define Vc_ALWAYS_INLINE_R #define Vc_CONST __declspec(noalias) #define Vc_CONST_L Vc_CONST #define Vc_CONST_R #define Vc_PURE #define Vc_PURE_L Vc_PURE #define Vc_PURE_R #define Vc_INTRINSIC inline __forceinline #define Vc_INTRINSIC_L Vc_INTRINSIC #define Vc_INTRINSIC_R namespace Vc_VERSIONED_NAMESPACE { namespace detail { static Vc_INTRINSIC void unreachable() { __assume(0); } } } #define Vc_UNREACHABLE Vc::detail::unreachable # else #define Vc_ALWAYS_INLINE #define Vc_ALWAYS_INLINE_L #define Vc_ALWAYS_INLINE_R #define Vc_CONST 
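/* Whichever branch of this block is taken, the macros defined here
 * (Vc_INTRINSIC, Vc_ALWAYS_INLINE, Vc_CONST, Vc_PURE, Vc_MAY_ALIAS, ...)
 * expand to the matching compiler-specific attributes, or to nothing when the
 * compiler has no equivalent. A sketch of the intended annotation style; the
 * helper function itself is hypothetical, not part of Vc:
 * \code
 * // always inlined; the result depends only on the argument and has no side effects
 * static Vc_INTRINSIC Vc_CONST unsigned round_up_to_8(unsigned n)
 * {
 *     return (n + 7u) & ~7u;
 * }
 * \endcode
 */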
#define Vc_CONST_L #define Vc_CONST_R #define Vc_PURE #define Vc_PURE_L #define Vc_PURE_R #define Vc_INTRINSIC #define Vc_INTRINSIC_L #define Vc_INTRINSIC_R #define Vc_UNREACHABLE std::abort # endif #define Vc_IS_UNLIKELY(x) x #define Vc_IS_LIKELY(x) x #define Vc_RESTRICT __restrict #define Vc_DEPRECATED(msg) __declspec(deprecated(msg)) #define Vc_DEPRECATED_ALIAS(msg) #define Vc_WARN_UNUSED_RESULT #endif #ifdef Vc_CXX14 #undef Vc_DEPRECATED #define Vc_DEPRECATED(msg_) [[deprecated(msg_)]] #endif #define Vc_NOTHING_EXPECTING_SEMICOLON static_assert(true, "") #define Vc_FREE_STORE_OPERATORS_ALIGNED(align_) \ \ \ \ Vc_ALWAYS_INLINE void *operator new(size_t size) \ { \ return Vc::Common::aligned_malloc(size); \ } \ \ Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \ \ Vc_ALWAYS_INLINE void *operator new[](size_t size) \ { \ return Vc::Common::aligned_malloc(size); \ } \ \ Vc_ALWAYS_INLINE void *operator new[](size_t, void *p) { return p; } \ \ Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { Vc::Common::free(ptr); } \ \ Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \ \ Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) \ { \ Vc::Common::free(ptr); \ } \ \ Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} \ \ Vc_NOTHING_EXPECTING_SEMICOLON #ifdef Vc_ASSERT #define Vc_EXTERNAL_ASSERT 1 #else #ifdef NDEBUG #define Vc_ASSERT(x) #else #include #define Vc_ASSERT(x) assert(x); #endif #endif #if defined Vc_CLANG || defined Vc_APPLECLANG #define Vc_HAS_BUILTIN(x) __has_builtin(x) #else #define Vc_HAS_BUILTIN(x) 0 #endif #define Vc_CAT_HELPER_(a,b,c,d) a ##b ##c ##d #define Vc_CAT(a,b,c,d) Vc_CAT_HELPER_(a, b, c, d) #define Vc_CAT_IMPL(a,b) a ##b #define Vc_CAT2(a,b) Vc_CAT_IMPL(a, b) #define Vc_APPLY_IMPL_1_(macro,a,b,c,d,e) macro(a) #define Vc_APPLY_IMPL_2_(macro,a,b,c,d,e) macro(a, b) #define Vc_APPLY_IMPL_3_(macro,a,b,c,d,e) macro(a, b, c) #define Vc_APPLY_IMPL_4_(macro,a,b,c,d,e) macro(a, b, c, d) #define Vc_APPLY_IMPL_5_(macro,a,b,c,d,e) macro(a, b, c, d, e) #define Vc_LIST_FLOAT_VECTOR_TYPES(size,macro,a,b,c,d) \ size(macro, double_v, a, b, c, d) \ size(macro, float_v, a, b, c, d) #define Vc_LIST_INT_VECTOR_TYPES(size,macro,a,b,c,d) \ size(macro, int_v, a, b, c, d) \ size(macro, uint_v, a, b, c, d) \ size(macro, short_v, a, b, c, d) \ size(macro, ushort_v, a, b, c, d) #define Vc_LIST_VECTOR_TYPES(size,macro,a,b,c,d) \ Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \ Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) #define Vc_LIST_COMPARES(size,macro,a,b,c,d) \ size(macro, ==, a, b, c, d) \ size(macro, !=, a, b, c, d) \ size(macro, <=, a, b, c, d) \ size(macro, >=, a, b, c, d) \ size(macro, < , a, b, c, d) \ size(macro, > , a, b, c, d) #define Vc_LIST_LOGICAL(size,macro,a,b,c,d) \ size(macro, &&, a, b, c, d) \ size(macro, ||, a, b, c, d) #define Vc_LIST_BINARY(size,macro,a,b,c,d) \ size(macro, |, a, b, c, d) \ size(macro, &, a, b, c, d) \ size(macro, ^, a, b, c, d) #define Vc_LIST_SHIFTS(size,macro,a,b,c,d) \ size(macro, <<, a, b, c, d) \ size(macro, >>, a, b, c, d) #define Vc_LIST_ARITHMETICS(size,macro,a,b,c,d) \ size(macro, +, a, b, c, d) \ size(macro, -, a, b, c, d) \ size(macro, *, a, b, c, d) \ size(macro, /, a, b, c, d) \ size(macro, %, a, b, c, d) #define Vc_APPLY_0(_list,macro) _list(Vc_APPLY_IMPL_1_, macro, 0, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_APPLY_1(_list,macro,a) _list(Vc_APPLY_IMPL_2_, macro, a, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_APPLY_2(_list,macro,a,b) _list(Vc_APPLY_IMPL_3_, macro, a, 
b, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_APPLY_3(_list,macro,a,b,c) _list(Vc_APPLY_IMPL_4_, macro, a, b, c, 0) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_APPLY_4(_list,macro,a,b,c,d) _list(Vc_APPLY_IMPL_5_, macro, a, b, c, d) Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_ALL_COMPARES(macro) Vc_APPLY_0(Vc_LIST_COMPARES, macro) #define Vc_ALL_LOGICAL(macro) Vc_APPLY_0(Vc_LIST_LOGICAL, macro) #define Vc_ALL_BINARY(macro) Vc_APPLY_0(Vc_LIST_BINARY, macro) #define Vc_ALL_SHIFTS(macro) Vc_APPLY_0(Vc_LIST_SHIFTS, macro) #define Vc_ALL_ARITHMETICS(macro) Vc_APPLY_0(Vc_LIST_ARITHMETICS, macro) #define Vc_ALL_FLOAT_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_FLOAT_VECTOR_TYPES, macro) #define Vc_ALL_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_VECTOR_TYPES, macro) #define Vc_EXACT_TYPE(_test,_reference,_type) \ typename std::enable_if::value, _type>::type #define Vc_make_unique(name) Vc_CAT(Vc_,name,_,__LINE__) #if defined(Vc_ICC) || defined(Vc_CLANG) || defined Vc_APPLECLANG #define Vc_OFFSETOF(Type,member) (reinterpret_cast(&reinterpret_cast(0)->member) - reinterpret_cast(0)) #else #define Vc_OFFSETOF(Type,member) offsetof(Type, member) #endif #if defined(Vc_NO_NOEXCEPT) #define Vc_NOEXCEPT throw() #else #define Vc_NOEXCEPT noexcept #endif #ifdef Vc_NO_ALWAYS_INLINE #undef Vc_ALWAYS_INLINE #undef Vc_ALWAYS_INLINE_L #undef Vc_ALWAYS_INLINE_R #define Vc_ALWAYS_INLINE inline #define Vc_ALWAYS_INLINE_L inline #define Vc_ALWAYS_INLINE_R #undef Vc_INTRINSIC #undef Vc_INTRINSIC_L #undef Vc_INTRINSIC_R #define Vc_INTRINSIC inline #define Vc_INTRINSIC_L inline #define Vc_INTRINSIC_R #endif #endif namespace Vc_VERSIONED_NAMESPACE { namespace Permutation { struct ReversedTag {}; constexpr ReversedTag Reversed{}; } } #endif namespace Vc_VERSIONED_NAMESPACE { using std::size_t; using llong = long long; using ullong = unsigned long long; using ulong = unsigned long; using uint = unsigned int; using ushort = unsigned short; using uchar = unsigned char; using schar = signed char; struct VectorSpecialInitializerZero {}; struct VectorSpecialInitializerOne {}; struct VectorSpecialInitializerIndexesFromZero {}; constexpr VectorSpecialInitializerZero Zero = {}; constexpr VectorSpecialInitializerOne One = {}; constexpr VectorSpecialInitializerIndexesFromZero IndexesFromZero = {}; namespace Detail { template struct MayAliasImpl { #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wattributes" #endif typedef T type Vc_MAY_ALIAS; #ifdef __GNUC__ #pragma GCC diagnostic pop #endif }; } #ifdef Vc_ICC template using MayAlias [[gnu::may_alias]] = T; #else template using MayAlias = typename Detail::MayAliasImpl::type; #endif template MayAlias &aliasing_cast(From &x) { return *reinterpret_cast *>(&x); } template const MayAlias &aliasing_cast(const From &x) { return *reinterpret_cast *>(&x); } template MayAlias *aliasing_cast(From *x) { return reinterpret_cast *>(x); } template const MayAlias *aliasing_cast(const From *x) { return reinterpret_cast *>(x); } enum class Operator : char { Assign, Multiply, MultiplyAssign, Divide, DivideAssign, Remainder, RemainderAssign, Plus, PlusAssign, Minus, MinusAssign, RightShift, RightShiftAssign, LeftShift, LeftShiftAssign, And, AndAssign, Xor, XorAssign, Or, OrAssign, PreIncrement, PostIncrement, PreDecrement, PostDecrement, LogicalAnd, LogicalOr, Comma, UnaryPlus, UnaryMinus, UnaryNot, UnaryOnesComplement, CompareEqual, CompareNotEqual, CompareLess, CompareGreater, CompareLessEqual, CompareGreaterEqual }; template struct array; namespace Common { template class span; } 
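/* MayAlias and aliasing_cast above are the library's strict-aliasing-safe way
 * to reinterpret an object's bit representation. A short sketch with
 * illustrative values, using the fixed-width typedefs from this namespace:
 * \code
 * float f = 1.0f;
 * // read the bit pattern of f as a 32-bit unsigned integer
 * Vc::uint32_t bits = Vc::aliasing_cast<Vc::uint32_t>(f);
 * // a pointer through which loads and stores may alias objects of other types
 * Vc::MayAlias<Vc::int32_t> *p = Vc::aliasing_cast<Vc::int32_t>(&f);
 * \endcode
 */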
#ifndef Vc_CHECK_ALIGNMENT template static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *){} #else template static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *ptr) { const size_t s = alignof(_T); if((reinterpret_cast(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) { fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n"); abort(); } } #endif namespace Common { template struct Segment; template class SuccessiveEntries { #ifdef Vc_MSVC using size_type = unsigned; #else using size_type = size_t; #endif const size_type m_first; public: typedef SuccessiveEntries AsArg; Vc_INTRINSIC SuccessiveEntries(size_type first) : m_first(first) {} Vc_INTRINSIC Vc_PURE size_type operator[](size_type offset) const { return m_first + offset * StructSize; } Vc_INTRINSIC Vc_PURE size_type data() const { return m_first; } Vc_INTRINSIC Vc_PURE SuccessiveEntries operator+(const SuccessiveEntries &rhs) const { return SuccessiveEntries(m_first + rhs.m_first); } Vc_INTRINSIC Vc_PURE SuccessiveEntries operator*(const SuccessiveEntries &rhs) const { return SuccessiveEntries(m_first * rhs.m_first); } Vc_INTRINSIC Vc_PURE SuccessiveEntries operator<<(size_type x) const { return {m_first << x}; } friend Vc_INTRINSIC SuccessiveEntries &internal_data(SuccessiveEntries &x) { return x; } friend Vc_INTRINSIC const SuccessiveEntries &internal_data(const SuccessiveEntries &x) { return x; } }; template Vc_INTRINSIC_L void *aligned_malloc(std::size_t n) Vc_INTRINSIC_R; Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R; template using enable_if_mask_converts_implicitly = enable_if<(!std::is_same>::value && Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value && Traits::is_implicit_cast_allowed_mask< Traits::entry_type_of::Vector>, T>::value)>; template using enable_if_mask_converts_explicitly = enable_if<( Traits::isSimdMaskArray::value || (Traits::is_simd_mask::value && !Traits::is_implicit_cast_allowed_mask< Traits::entry_type_of::Vector>, T>::value))>; template using WidthT = std::integral_constant; template class MaskBool; template class SubscriptOperation; template struct GatherArguments { static_assert(std::is_same>::value && !std::is_pointer::value, "GatherArguments expects an cv unqualified non-ref/ptr type"); const IndexVector indexes; const T *const address; }; template GatherArguments make_gather(const T *m, const I &i) { return {i, m}; } template struct ScatterArguments { const IndexVector indexes; T *const address; }; template Vc_INTRINSIC enable_if<(Begin >= End), void> unrolled_loop(F &&) { } template Vc_INTRINSIC Vc_FLATTEN enable_if<(Begin < End), void> unrolled_loop(F &&f) { f(Begin); unrolled_loop(f); } template Vc_INTRINSIC void for_all_vector_entries(F &&f) { unrolled_loop(std::forward(f)); } } } #ifndef VC_COMMON_VECTOR_H_ #define VC_COMMON_VECTOR_H_ #include #ifndef VC_COMMON_ELEMENTREFERENCE_H_ #define VC_COMMON_ELEMENTREFERENCE_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template class ElementReference { friend U; friend Accessor; Vc_INTRINSIC ElementReference(U &o, int i) noexcept : index(i), obj(o) {} static constexpr bool get_noexcept = noexcept(Accessor::get(std::declval(), int())); template static constexpr bool set_noexcept() { return noexcept(Accessor::set(std::declval(), int(), std::declval())); } public: using value_type = typename U::value_type; Vc_INTRINSIC ElementReference(const ElementReference &) = delete; Vc_INTRINSIC ElementReference(ElementReference &&) = default; Vc_INTRINSIC 
operator value_type() const noexcept(get_noexcept) { return Accessor::get(obj, index); } template Vc_INTRINSIC ElementReference &operator=(T &&x) && noexcept(noexcept(Accessor::set(std::declval(), int(), std::declval()))) { Accessor::set(obj, index, std::forward(x)); return *this; } #define Vc_OP_(op_) \ template () \ op_ std::declval())> \ Vc_INTRINSIC ElementReference &operator op_##=(T &&x) && \ noexcept(get_noexcept && noexcept(Accessor::set(std::declval(), int(), \ std::declval()))) \ { \ const value_type &lhs = Accessor::get(obj, index); \ Accessor::set(obj, index, lhs op_ std::forward(x)); \ return *this; \ } Vc_ALL_ARITHMETICS(Vc_OP_); Vc_ALL_SHIFTS(Vc_OP_); Vc_ALL_BINARY(Vc_OP_); #undef Vc_OP_ template Vc_INTRINSIC ElementReference &operator++() && noexcept(noexcept(std::declval() = Accessor::get(std::declval(), int())) && set_noexcept())>()) { value_type x = Accessor::get(obj, index); Accessor::set(obj, index, ++x); return *this; } template Vc_INTRINSIC value_type operator++(int) && noexcept(noexcept(std::declval() = Accessor::get(std::declval(), int())) && set_noexcept()++)>()) { const value_type r = Accessor::get(obj, index); value_type x = r; Accessor::set(obj, index, ++x); return r; } template Vc_INTRINSIC ElementReference &operator--() && noexcept(noexcept(std::declval() = Accessor::get(std::declval(), int())) && set_noexcept())>()) { value_type x = Accessor::get(obj, index); Accessor::set(obj, index, --x); return *this; } template Vc_INTRINSIC value_type operator--(int) && noexcept(noexcept(std::declval() = Accessor::get(std::declval(), int())) && set_noexcept()--)>()) { const value_type r = Accessor::get(obj, index); value_type x = r; Accessor::set(obj, index, --x); return r; } friend void swap(ElementReference &&a, ElementReference &&b) { value_type tmp(a); static_cast(a) = static_cast(b); static_cast(b) = tmp; } friend void swap(value_type &a, ElementReference &&b) { value_type tmp(a); a = static_cast(b); static_cast(b) = tmp; } friend void swap(ElementReference &&a, value_type &b) { value_type tmp(a); static_cast(a) = b; b = tmp; } private: int index; U &obj; }; } } #endif #ifndef VC_COMMON_VECTORABI_H_ #define VC_COMMON_VECTORABI_H_ namespace Vc_VERSIONED_NAMESPACE { namespace VectorAbi { template using Avx1Abi = typename std::conditional::value, VectorAbi::Sse, VectorAbi::Avx>::type; template struct DeduceCompatible { #ifdef __x86_64__ using type = Sse; #else using type = Scalar; #endif }; template struct DeduceBest { using type = typename std::conditional< CurrentImplementation::is(ScalarImpl), Scalar, typename std::conditional< CurrentImplementation::is_between(SSE2Impl, SSE42Impl), Sse, typename std::conditional< CurrentImplementation::is(AVXImpl), Avx1Abi, typename std::conditional::type>::type>::type>::type; }; template using Best = typename DeduceBest::type; } } #ifndef VC_COMMON_SIMDARRAYFWD_H_ #define VC_COMMON_SIMDARRAYFWD_H_ #ifndef VC_SSE_TYPES_H_ #define VC_SSE_TYPES_H_ #ifdef Vc_DEFAULT_IMPL_SSE #define Vc_DOUBLE_V_SIZE 2 #define Vc_FLOAT_V_SIZE 4 #define Vc_INT_V_SIZE 4 #define Vc_UINT_V_SIZE 4 #define Vc_SHORT_V_SIZE 8 #define Vc_USHORT_V_SIZE 8 #endif namespace Vc_VERSIONED_NAMESPACE { namespace SSE { template using Vector = Vc::Vector; typedef Vector double_v; typedef Vector float_v; typedef Vector int_v; typedef Vector uint_v; typedef Vector short_v; typedef Vector ushort_v; template using Mask = Vc::Mask; typedef Mask double_m; typedef Mask float_m; typedef Mask int_m; typedef Mask uint_m; typedef Mask short_m; typedef Mask ushort_m; template struct 
Const; template struct is_vector : public std::false_type {}; template struct is_vector> : public std::true_type {}; template struct is_mask : public std::false_type {}; template struct is_mask> : public std::true_type {}; } namespace Traits { template struct is_simd_vector_internal> : public is_valid_vector_argument {}; template struct is_simd_mask_internal> : public std::true_type {}; } } #endif #ifndef VC_AVX_TYPES_H_ #define VC_AVX_TYPES_H_ #ifndef VC_AVX_MACROS_H_ #define VC_AVX_MACROS_H_ #endif #ifdef Vc_DEFAULT_IMPL_AVX2 #define Vc_DOUBLE_V_SIZE 4 #define Vc_FLOAT_V_SIZE 8 #define Vc_INT_V_SIZE 8 #define Vc_UINT_V_SIZE 8 #define Vc_SHORT_V_SIZE 16 #define Vc_USHORT_V_SIZE 16 #elif defined Vc_DEFAULT_IMPL_AVX #define Vc_DOUBLE_V_SIZE 4 #define Vc_FLOAT_V_SIZE 8 #define Vc_INT_V_SIZE 4 #define Vc_UINT_V_SIZE 4 #define Vc_SHORT_V_SIZE 8 #define Vc_USHORT_V_SIZE 8 #endif namespace Vc_VERSIONED_NAMESPACE { namespace AVX { template using Vector = Vc::Vector>; typedef Vector double_v; typedef Vector float_v; typedef Vector int_v; typedef Vector uint_v; typedef Vector short_v; typedef Vector ushort_v; template using Mask = Vc::Mask>; typedef Mask double_m; typedef Mask float_m; typedef Mask int_m; typedef Mask uint_m; typedef Mask short_m; typedef Mask ushort_m; template struct Const; template struct is_vector : public std::false_type {}; template struct is_vector> : public std::true_type {}; template struct is_mask : public std::false_type {}; template struct is_mask> : public std::true_type {}; } namespace AVX2 { template using Vector = Vc::Vector; using double_v = Vector; using float_v = Vector< float>; using int_v = Vector< int>; using uint_v = Vector< uint>; using short_v = Vector< short>; using ushort_v = Vector; template using Mask = Vc::Mask; using double_m = Mask; using float_m = Mask< float>; using llong_m = Mask< llong>; using ullong_m = Mask; using long_m = Mask< long>; using ulong_m = Mask< ulong>; using int_m = Mask< int>; using uint_m = Mask< uint>; using short_m = Mask< short>; using ushort_m = Mask; using schar_m = Mask< schar>; using uchar_m = Mask< uchar>; template struct is_vector : public std::false_type {}; template struct is_vector> : public std::true_type {}; template struct is_mask : public std::false_type {}; template struct is_mask> : public std::true_type {}; } namespace Traits { template struct is_simd_vector_internal> : public is_valid_vector_argument {}; template struct is_simd_mask_internal> : public std::true_type {}; } } #endif #ifndef VC_COMMON_UTILITY_H_ #define VC_COMMON_UTILITY_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Common { template struct NextPowerOfTwo; template struct NextPowerOfTwo : public std::integral_constant { }; template struct NextPowerOfTwo : public std::integral_constant< size_t, NextPowerOfTwo<(x | (x >> 1) | (x >> 2) | (x >> 5)) + 1>::value> { }; template struct BoundedAlignment : public std::integral_constant { }; template static constexpr std::size_t left_size() { return Common::NextPowerOfTwo<(N + 1) / 2>::value; } template static constexpr std::size_t right_size() { return N - left_size(); } } } #endif namespace Vc_VERSIONED_NAMESPACE { template class Vector> : public SimdArray { using SimdArray::SimdArray; public: Vc_INTRINSIC Vector(const Vector &x) : SimdArray(x) {} Vc_INTRINSIC Vector &operator=(const Vector &x) { SimdArray::operator=(x); return *this; } Vector() = default; using abi_type = simd_abi::fixed_size; using abi = abi_type; Vc_DEPRECATED("use Vector([](int n) { return n; }) instead of " 
"Vector::IndexesFromZero()") static Vector IndexesFromZero() { return Vector([](size_t i) -> T { return i; }); } Vc_DEPRECATED("use 0 instead of Vector::Zero()") static Vector Zero() { return 0; } Vc_DEPRECATED("use 1 instead of Vector::One()") static Vector One() { return 1; } }; template class Mask> : public SimdMaskArray { using SimdMaskArray::SimdMaskArray; public: Vc_INTRINSIC Mask(const Mask &x) : SimdMaskArray(x) {} Vc_INTRINSIC Mask &operator=(const Mask &x) { SimdMaskArray::operator=(x); return *this; } Mask() = default; using abi_type = simd_abi::fixed_size; using abi = abi_type; }; template struct SimdArrayTraits { static constexpr std::size_t N0 = Common::left_size(); static constexpr std::size_t N1 = Common::right_size(); using storage_type0 = fixed_size_simd; using storage_type1 = fixed_size_simd; }; template Vc_INTRINSIC_L typename SimdArrayTraits::storage_type0 &internal_data0( SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L typename SimdArrayTraits::storage_type1 &internal_data1( SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L const typename SimdArrayTraits::storage_type0 &internal_data0( const SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L const typename SimdArrayTraits::storage_type1 &internal_data1( const SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L V &internal_data(SimdArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L const V &internal_data(const SimdArray &x) Vc_INTRINSIC_R; namespace Traits { template struct is_fixed_size_simd : std::false_type { }; template struct is_fixed_size_simd> : std::true_type { }; template struct is_fixed_size_simd> : std::true_type { }; template struct is_simd_vector_internal> : is_valid_vector_argument {}; template struct is_simd_mask_internal> : is_valid_vector_argument {}; template struct is_atomic_simdarray_internal> : is_valid_vector_argument {}; template struct is_atomic_simdarray_internal> : is_atomic_simdarray_internal> { }; template struct is_atomic_simd_mask_array_internal> : is_valid_vector_argument { }; template struct is_atomic_simd_mask_array_internal> : is_atomic_simd_mask_array_internal> { }; template struct is_simdarray_internal> : is_valid_vector_argument { }; template struct is_simdarray_internal> : is_valid_vector_argument { }; template struct is_simd_mask_array_internal> : is_valid_vector_argument { }; template struct is_simd_mask_array_internal> : is_valid_vector_argument { }; template struct is_integral_internal, false> : std::is_integral { }; template struct is_floating_point_internal, false> : std::is_floating_point { }; template struct is_signed_internal, false> : std::is_signed { }; template struct is_unsigned_internal, false> : std::is_unsigned { }; template struct has_no_allocated_data_impl> : std::true_type { }; } } #endif namespace Vc_VERSIONED_NAMESPACE { namespace detail { template struct is_fixed_size_abi : std::false_type { }; template struct is_fixed_size_abi> : std::true_type { }; template using not_fixed_size_abi = typename std::enable_if::value, T>::type; } } #endif #ifndef VC_COMMON_VECTORTRAITS_H_ #define VC_COMMON_VECTORTRAITS_H_ namespace Vc_VERSIONED_NAMESPACE { template struct VectorTraits; } #endif #ifndef VC_COMMON_LOADSTOREFLAGS_H_ #define VC_COMMON_LOADSTOREFLAGS_H_ namespace Vc_VERSIONED_NAMESPACE { struct Exclusive {}; struct Shared {}; namespace LoadStoreFlags { struct StreamingFlag {}; struct UnalignedFlag {}; struct PrefetchFlagBase {}; template struct PrefetchFlag : public PrefetchFlagBase { typedef ExclusiveOrShared_ ExclusiveOrShared; static constexpr 
size_t L1Stride = L1; static constexpr size_t L2Stride = L2; static constexpr bool IsExclusive = std::is_same::value; static constexpr bool IsShared = std::is_same::value; }; template struct ExtractType { typedef Default type; }; template struct ExtractType { typedef typename std::conditional::value, T, typename ExtractType::type>::type type; }; #ifdef Vc_ICC #pragma warning(disable: 177) #endif template struct LoadStoreFlags { private: typedef typename ExtractType, Flags...>::type Prefetch; public: constexpr LoadStoreFlags() {} static constexpr bool IsStreaming = !std::is_same::type, void>::value; static constexpr bool IsUnaligned = !std::is_same::type, void>::value; static constexpr bool IsAligned = !IsUnaligned; static constexpr bool IsPrefetch = !std::is_same::type, void>::value; static constexpr bool IsExclusivePrefetch = Prefetch::IsExclusive; static constexpr bool IsSharedPrefetch = Prefetch::IsShared; static constexpr size_t L1Stride = Prefetch::L1Stride; static constexpr size_t L2Stride = Prefetch::L2Stride; typedef LoadStoreFlags::value, void, Flags>::type...> UnalignedRemoved; typedef typename std::conditional::type EnableIfAligned; typedef typename std::conditional::type EnableIfStreaming; typedef typename std::conditional::type EnableIfUnalignedNotStreaming; typedef typename std::conditional::type EnableIfUnalignedAndStreaming; typedef typename std::conditional::type EnableIfUnaligned; typedef typename std::conditional::type EnableIfNotUnaligned; typedef typename std::conditional::type EnableIfPrefetch; typedef typename std::conditional::type EnableIfNotPrefetch; }; template<> struct LoadStoreFlags<> { constexpr LoadStoreFlags() {} static constexpr bool IsStreaming = false; static constexpr bool IsUnaligned = false; static constexpr bool IsAligned = !IsUnaligned; static constexpr bool IsPrefetch = false; static constexpr bool IsExclusivePrefetch = false; static constexpr bool IsSharedPrefetch = false; static constexpr size_t L1Stride = 0; static constexpr size_t L2Stride = 0; typedef void* EnableIfAligned; typedef void* EnableIfNotUnaligned; typedef void* EnableIfNotPrefetch; }; template constexpr LoadStoreFlags operator|(LoadStoreFlags, LoadStoreFlags) { return LoadStoreFlags(); } } using LoadStoreFlags::PrefetchFlag; typedef LoadStoreFlags::LoadStoreFlags<> AlignedTag; typedef LoadStoreFlags::LoadStoreFlags StreamingTag; typedef LoadStoreFlags::LoadStoreFlags UnalignedTag; typedef UnalignedTag DefaultLoadTag; typedef UnalignedTag DefaultStoreTag; constexpr AlignedTag Aligned; constexpr UnalignedTag Unaligned; constexpr StreamingTag Streaming; constexpr LoadStoreFlags::LoadStoreFlags> PrefetchDefault; template ::L1Stride, size_t L2 = PrefetchFlag<>::L2Stride, typename ExclusiveOrShared = PrefetchFlag<>::ExclusiveOrShared> struct Prefetch : public LoadStoreFlags::LoadStoreFlags> { }; namespace Traits { template struct is_loadstoreflag_internal> : public std::true_type { }; template struct is_loadstoreflag_internal> : public std::true_type { }; } } #endif #ifndef VC_COMMON_WRITEMASKEDVECTOR_H_ #define VC_COMMON_WRITEMASKEDVECTOR_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace Common { template class WriteMaskedVector { static_assert( V::Size == M::Size, "incorrect use of Vc::Common::WriteMaskedVector. 
V and M must have the same «Size»."); public: typedef M Mask; static constexpr size_t Size = V::Size; Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask)); Vc_INTRINSIC WriteMaskedVector(V &v, const Mask &k) : mask(k), vec(v) { } Vc_INTRINSIC V &operator++() { V one = V::One(); one.setZeroInverted(mask); return vec += one; } Vc_INTRINSIC V &operator--() { V one = V::One(); one.setZeroInverted(mask); return vec -= one; } Vc_INTRINSIC V operator++(int) { V ret(vec); operator++(); return ret; } Vc_INTRINSIC V operator--(int) { V ret(vec); operator--(); return ret; } #define Vc_OPERATOR_(op) \ template Vc_ALWAYS_INLINE void operator op##=(U &&x) \ { \ operator=(static_cast(vec op std::forward(x))); \ } Vc_ALL_BINARY(Vc_OPERATOR_); Vc_ALL_ARITHMETICS(Vc_OPERATOR_); Vc_ALL_SHIFTS(Vc_OPERATOR_); #undef Vc_OPERATOR_ Vc_ALWAYS_INLINE void operator=(const V &x) { vec.assign(x, mask); } template Vc_ALWAYS_INLINE void operator=(SubscriptOperation &&x) { vec.gather(std::move(x).gatherArguments(), mask); } template Vc_INTRINSIC void call(const F &f) const { return vec.call(f, mask); } template Vc_INTRINSIC V apply(const F &f) const { return vec.apply(f, mask); } template Vc_INTRINSIC void call(F &&f) const { return vec.call(std::forward(f), mask); } template Vc_INTRINSIC V apply(F &&f) const { return vec.apply(std::forward(f), mask); } private: #ifdef Vc_ICC const Mask &mask; #else const Mask mask; #endif V &vec; }; } } #endif #ifndef VC_COMMON_DETAIL_H_ #define VC_COMMON_DETAIL_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace Common { template Vc_INTRINSIC enable_if<(Traits::is_simd_vector::value && sizeof(typename IV::EntryType) >= sizeof(int)), const IV &> convertIndexVector(const IV &indexVector) { return indexVector; } template Vc_INTRINSIC enable_if<(Traits::is_simd_vector::value && sizeof(typename IV::EntryType) < sizeof(int)), fixed_size_simd> convertIndexVector(const IV &indexVector) { return static_cast>(indexVector); } template using promoted_type = decltype(std::declval() + 1); template Vc_INTRINSIC enable_if::value, fixed_size_simd, N>> convertIndexVector(const std::array &indexVector) { return fixed_size_simd, N>{std::addressof(indexVector[0]), Vc::Unaligned}; } template Vc_INTRINSIC enable_if::value, fixed_size_simd, N>> convertIndexVector(const Vc::array &indexVector) { return fixed_size_simd, N>{std::addressof(indexVector[0]), Vc::Unaligned}; } template Vc_INTRINSIC enable_if::value, fixed_size_simd, N>> convertIndexVector(const T (&indexVector)[N]) { return fixed_size_simd, N>{std::addressof(indexVector[0]), Vc::Unaligned}; } #ifndef Vc_MSVC template enable_if::value, void> convertIndexVector(T indexVector) = delete; #endif template Vc_INTRINSIC std::vector> convertIndexVector( const std::initializer_list &indexVector) { return {begin(indexVector), end(indexVector)}; } template Vc_INTRINSIC enable_if<(std::is_integral::value && sizeof(T) >= sizeof(int)), std::vector> convertIndexVector(const std::vector &indexVector) { return indexVector; } template Vc_INTRINSIC enable_if<(std::is_integral::value && sizeof(T) < sizeof(int)), std::vector>> convertIndexVector(const std::vector &indexVector) { return {std::begin(indexVector), std::end(indexVector)}; } template ::value && !Traits::is_simd_vector::value && !std::is_lvalue_reference()[0])>::value)>> Vc_INTRINSIC const T &convertIndexVector(const T &i) { return i; } } } #endif namespace Vc_VERSIONED_NAMESPACE { template ::value && !detail::is_fixed_size_abi::value>> inline Vector copysign(Vector magnitude, Vector sign); template ::value && 
!detail::is_fixed_size_abi::value>> inline Vector exponent(Vector x); template Vc_INTRINSIC Vc_CONST typename Vector>::MaskType isnegative(Vector x) { return x < Vector::Zero(); } template> class Vector { public: static constexpr size_t size() { return VectorTraits::size(); } static constexpr size_t MemoryAlignment = VectorTraits::memoryAlignment(); using abi = Abi; using EntryType = typename VectorTraits::EntryType; using value_type = EntryType; using VectorEntryType = typename VectorTraits::VectorEntryType; using VectorType = typename VectorTraits::VectorType; using vector_type = VectorType; using MaskType = Vc::Mask; using mask_type = MaskType; using MaskArgument = MaskType; using VectorArgument = Vector; using IndexType = Vc::fixed_size_simd::size()>; using index_type = IndexType; using reference = Detail::ElementReference; static inline Vector Zero(); static inline Vector One(); static inline Vector IndexesFromZero(); static inline Vector Random(); template static inline Vector generate(G gen); inline Vector() = default; explicit inline Vector(VectorSpecialInitializerZero); explicit inline Vector(VectorSpecialInitializerOne); explicit inline Vector(VectorSpecialInitializerIndexesFromZero); template inline Vector(Vector x, enable_if::value> = nullarg); #if Vc_IS_VERSION_1 template Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "vector types") inline explicit Vector( Vector x, enable_if::value> = nullarg); #endif inline Vector(EntryType a); template inline Vector(U a, enable_if::value && !std::is_same::value> = nullarg); inline explicit Vector(reference a); explicit Vc_INTRINSIC Vector(const EntryType *mem) { load(mem); } template ::value>> explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags) { load(mem, flags); } template ::value || !std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value &&Traits::is_load_store_flag::value>> explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags()) { load(x, flags); } Vc_INTRINSIC void load(const EntryType *mem) { load(mem, DefaultLoadTag()); } template Vc_INTRINSIC enable_if::value, void> load(const EntryType *mem, Flags flags) { load(mem, flags); } private: template struct load_concept : public std::enable_if< (!std::is_integral::value || !std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value && Traits::is_load_store_flag::value, void> {}; public: template Vc_INTRINSIC_L typename load_concept::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R; template < typename U, typename Flags = DefaultStoreTag, typename = enable_if::value &&Traits::is_load_store_flag::value>> Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R; template < typename U, typename Flags = DefaultStoreTag, typename = enable_if::value &&Traits::is_load_store_flag::value>> Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R; Vc_INTRINSIC void store(EntryType *mem) const { store(mem, DefaultStoreTag()); } template ::value>> Vc_INTRINSIC void store(EntryType *mem, Flags flags) const { store(mem, flags); } Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const { store(mem, mask, DefaultStoreTag()); } template ::value>> Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const { store(mem, mask, flags); } inline void setZero(); inline void setZero(MaskType mask); inline void setZeroInverted(MaskType mask); inline void setQnan(); inline void 
setQnan(MaskType mask); #define Vc_CURRENT_CLASS_NAME Vector #ifndef Vc_CURRENT_CLASS_NAME #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors." #endif private: template inline void gatherImplementation(const Common::GatherArguments &); template inline void gatherImplementation(const Common::GatherArguments &, MaskArgument mask); public: #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that can be converted to the " \ "EntryType of this SIMD vector type."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_ private: template inline void scatterImplementation(MT *mem, IT &&indexes) const; template inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const; public: #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that the EntryType of this " \ "SIMD vector type can be converted to."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at 
least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes)); } template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes), mask); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args) const { scatter(args.address, args.indexes); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args, MaskArgument mask) const { scatter(args.address, args.indexes, mask); } #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_ #undef Vc_CURRENT_CLASS_NAME inline reference operator[](size_t index) noexcept; inline EntryType operator[](size_t index) const noexcept; inline MaskType operator!() const; inline Vector operator~() const; inline Vector operator-() const; inline Vector operator+() const; inline Vector &operator++(); inline Vector operator++(int); inline Vector &operator--(); inline Vector operator--(int); #define Vc_OP(symbol) \ inline Vc_PURE Vector operator symbol(const Vector &x) const; Vc_ALL_ARITHMETICS(Vc_OP); Vc_ALL_BINARY(Vc_OP); Vc_ALL_SHIFTS(Vc_OP); #undef Vc_OP #define Vc_CMP_OP(symbol) inline Vc_PURE MaskType operator symbol(const Vector &x) const; Vc_ALL_COMPARES(Vc_CMP_OP); #undef Vc_CMP_OP inline Common::WriteMaskedVector operator()(MaskType mask); inline EntryType min() const; inline EntryType max() const; inline EntryType product() const; inline EntryType sum() const; inline Vector partialSum() const; inline EntryType min(MaskType mask) const; inline EntryType max(MaskType mask) const; inline EntryType product(MaskType mask) const; inline EntryType sum(MaskType mask) const; inline Vector shifted(int amount) const; inline Vector shifted(int amount, Vector shiftIn) const; inline Vector rotated(int amount) const; inline Vector reversed() const; inline Vector sorted() const; template void callWithValuesSorted(F &&f); template inline void call(F &&f) const; template inline void call(F &&f, MaskType mask) const; template inline Vector apply(F &&f) const; template inline Vector apply(F &&f, MaskType mask) const; template inline void fill(EntryType(&f)(IndexT)); inline void fill(EntryType(&f)()); inline Vector interleaveLow(Vector x) const; inline Vector interleaveHigh(Vector x) const; inline void assign(const Vector &v, const MaskType &m); inline VectorType &data(); inline const VectorType &data() const; Vc_DEPRECATED("use exponent(x) instead") inline Vector exponent() const; Vc_DEPRECATED("use isnegative(x) instead") inline MaskType isNegative() const; static constexpr size_t Size = VectorTraits::size(); template inline V2 staticCast() const; template Vc_DEPRECATED("use reinterpret_components_cast instead") inline V2 reinterpretCast() const; Vc_DEPRECATED("use copysign(x, y) instead") inline Vector copySign(Vector reference) const; Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Vector)); private: VectorType d; }; template Vc_ALWAYS_INLINE Vc_CONST enable_if< (V::size() == Vector::size() && sizeof(typename V::VectorEntryType) == sizeof(typename Vector::VectorEntryType) && sizeof(V) == sizeof(Vector) && alignof(V) <= alignof(Vector)), V> 
reinterpret_components_cast(const Vector &x) { return reinterpret_cast(x); } #define Vc_OP(symbol) \ template \ inline Vector &operator symbol##=(Vector &, \ const Vector &x); #undef Vc_OP } #endif #ifndef VC_COMMON_MASK_H_ #define VC_COMMON_MASK_H_ namespace Vc_VERSIONED_NAMESPACE { template > class Mask { public: static constexpr size_t size() { return VectorTraits::size(); } static constexpr size_t Size = VectorTraits::size(); static constexpr size_t MemoryAlignment = VectorTraits::maskMemoryAlignment(); using abi = Abi; using EntryType = bool; using value_type = EntryType; using EntryReference = typename VectorTraits::EntryReference; using value_reference = EntryReference; using VectorEntryType = typename VectorTraits::VectorEntryType; using VectorType = typename VectorTraits::VectorType; using vector_type = VectorType; Vc_INTRINSIC static Mask Zero(); Vc_INTRINSIC static Mask One(); template static Vc_INTRINSIC Mask generate(G &&gen); Vc_INTRINSIC Mask() = default; Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero); Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne); Vc_INTRINSIC explicit Mask(bool b); template Vc_INTRINSIC Mask(U &&otherMask, Common::enable_if_mask_converts_implicitly = nullarg); #if Vc_IS_VERSION_1 template Vc_DEPRECATED( "use simd_cast instead of explicit type casting to convert between mask types") Vc_INTRINSIC_L explicit Mask(U &&otherMask, Common::enable_if_mask_converts_explicitly = nullarg) Vc_INTRINSIC_R; #endif Vc_ALWAYS_INLINE explicit Mask(const bool *mem); template Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags flags); Vc_ALWAYS_INLINE void load(const bool *mem); template Vc_ALWAYS_INLINE void load(const bool *mem, Flags flags); Vc_ALWAYS_INLINE void store(bool *mem) const; template Vc_ALWAYS_INLINE void store(bool *mem, Flags flags) const; Vc_ALWAYS_INLINE bool operator==(const Mask &mask) const; Vc_ALWAYS_INLINE bool operator!=(const Mask &mask) const; Vc_ALWAYS_INLINE Mask operator&&(const Mask &mask) const; Vc_ALWAYS_INLINE Mask operator&(const Mask &mask) const; Vc_ALWAYS_INLINE Mask operator||(const Mask &mask) const; Vc_ALWAYS_INLINE Mask operator|(const Mask &mask) const; Vc_ALWAYS_INLINE Mask operator^(const Mask &mask) const; Vc_ALWAYS_INLINE Mask operator!() const; Vc_ALWAYS_INLINE Mask &operator&=(const Mask &mask); Vc_ALWAYS_INLINE Mask &operator|=(const Mask &mask); Vc_ALWAYS_INLINE Mask &operator^=(const Mask &mask); Vc_ALWAYS_INLINE bool isFull() const; Vc_ALWAYS_INLINE bool isNotEmpty() const; Vc_ALWAYS_INLINE bool isEmpty() const; Vc_ALWAYS_INLINE bool isMix() const; Vc_ALWAYS_INLINE bool data() const; Vc_ALWAYS_INLINE bool dataI() const; Vc_ALWAYS_INLINE bool dataD() const; Vc_ALWAYS_INLINE EntryReference operator[](size_t index); Vc_ALWAYS_INLINE EntryType operator[](size_t index) const; Vc_ALWAYS_INLINE int count() const; Vc_ALWAYS_INLINE int firstOne() const; Vc_ALWAYS_INLINE int toInt() const; Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const; Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask)); private: VectorType d; }; template constexpr bool all_of(const Mask &m) { return m.isFull(); } constexpr bool all_of(bool b) { return b; } template constexpr bool any_of(const Mask &m) { return m.isNotEmpty(); } constexpr bool any_of(bool b) { return b; } template constexpr bool none_of(const Mask &m) { return m.isEmpty(); } constexpr bool none_of(bool b) { return !b; } template constexpr bool some_of(const Mask &m) { return m.isMix(); } constexpr bool some_of(bool) { return false; } } #endif #ifndef VC_COMMON_MEMORYFWD_H_ 
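// Usage sketch (illustrative only, not part of the library sources): the
// Vector<T, Abi> and Mask<T, Abi> interfaces declared above are normally used
// through comparison results and the free reductions all_of/any_of/none_of/
// some_of. Assuming the public header <Vc/Vc> and the float_v alias:
//
//   #include <Vc/Vc>
//   void clamp_to_one(Vc::float_v &x)
//   {
//       const Vc::float_v::MaskType too_big = x > Vc::float_v(1.f);
//       if (Vc::none_of(too_big)) {
//           return;                        // no lane exceeds the limit
//       }
//       x(too_big) = Vc::float_v(1.f);     // write-masked assignment via operator()(MaskType)
//   }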
#define VC_COMMON_MEMORYFWD_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Common { template class Memory; template class MemoryBase; } using Common::Memory; } #endif #endif #ifndef VC_SCALAR_TYPES_H_ #define VC_SCALAR_TYPES_H_ #ifdef Vc_DEFAULT_IMPL_Scalar #define Vc_DOUBLE_V_SIZE 1 #define Vc_FLOAT_V_SIZE 1 #define Vc_INT_V_SIZE 1 #define Vc_UINT_V_SIZE 1 #define Vc_SHORT_V_SIZE 1 #define Vc_USHORT_V_SIZE 1 #endif namespace Vc_VERSIONED_NAMESPACE { namespace Scalar { template using Vector = Vc::Vector; typedef Vector double_v; typedef Vector float_v; typedef Vector int_v; typedef Vector uint_v; typedef Vector short_v; typedef Vector ushort_v; template using Mask = Vc::Mask; typedef Mask double_m; typedef Mask float_m; typedef Mask int_m; typedef Mask uint_m; typedef Mask short_m; typedef Mask ushort_m; template struct is_vector : public std::false_type {}; template struct is_vector> : public std::true_type {}; template struct is_mask : public std::false_type {}; template struct is_mask> : public std::true_type {}; } namespace Traits { template struct is_simd_mask_internal> : public std::true_type {}; template struct is_simd_vector_internal> : public is_valid_vector_argument {}; } } #endif #ifndef VC_SCALAR_DETAIL_H_ #define VC_SCALAR_DETAIL_H_ #ifndef VC_SCALAR_MACROS_H_ #define VC_SCALAR_MACROS_H_ #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template struct InterleaveImpl; template struct InterleaveImpl { template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1) { data[i[0] + 0] = v0.data(); data[i[0] + 1] = v1.data(); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { data[i[0] + 0] = v0.data(); data[i[0] + 1] = v1.data(); data[i[0] + 2] = v2.data(); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { data[i[0] + 0] = v0.data(); data[i[0] + 1] = v1.data(); data[i[0] + 2] = v2.data(); data[i[0] + 3] = v3.data(); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); data[i[0] + 4] = v4.data(); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); 
interleave(data + 4, i, v4, v5, v6, v7); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1) { v0.data() = data[i[0] + 0]; v1.data() = data[i[0] + 1]; } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2) { v0.data() = data[i[0] + 0]; v1.data() = data[i[0] + 1]; v2.data() = data[i[0] + 2]; } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3) { v0.data() = data[i[0] + 0]; v1.data() = data[i[0] + 1]; v2.data() = data[i[0] + 2]; v3.data() = data[i[0] + 3]; } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { deinterleave(data, i, v0, v1, v2, v3); v4.data() = data[i[0] + 4]; } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5, v6); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5, v6, v7); } }; } } #endif #ifndef VC_SCALAR_MASK_H_ #define VC_SCALAR_MASK_H_ namespace Vc_VERSIONED_NAMESPACE { template class Mask { friend class Mask< double, VectorAbi::Scalar>; friend class Mask< float, VectorAbi::Scalar>; friend class Mask< int32_t, VectorAbi::Scalar>; friend class Mask; friend class Mask< int16_t, VectorAbi::Scalar>; friend class Mask; public: using abi = VectorAbi::Scalar; static constexpr size_t Size = 1; static constexpr size_t MemoryAlignment = 1; static constexpr std::size_t size() { return 1; } typedef bool EntryType; using value_type = EntryType; using EntryReference = Vc::Detail::ElementReference; using reference = EntryReference; typedef bool VectorEntryType; using VectorType = bool; using Vector = Scalar::Vector; Vc_INTRINSIC Mask() = default; Vc_INTRINSIC explicit Mask(bool b) : m(b) {} Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : m(false) {} Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : m(true) {} Vc_INTRINSIC static Mask Zero() { return Mask(false); } Vc_INTRINSIC static Mask One() { return Mask(true); } template Vc_INTRINSIC Mask(U &&rhs, Common::enable_if_mask_converts_implicitly = nullarg) : m(rhs.m) {} #if Vc_IS_VERSION_1 template Vc_DEPRECATED( "use simd_cast instead of explicit type casting to convert between mask types") Vc_INTRINSIC_L explicit Mask(U &&rhs, Common::enable_if_mask_converts_explicitly = nullarg) Vc_INTRINSIC_R; #endif Vc_ALWAYS_INLINE explicit Mask(const bool *mem) : m(mem[0]) {} template Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags) : m(mem[0]) {} Vc_ALWAYS_INLINE void load(const bool *mem) { m = mem[0]; } template Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { m = mem[0]; } Vc_ALWAYS_INLINE void store(bool *mem) const { *mem = m; } template Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { *mem = m; } Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return m == rhs.m; } Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return m != rhs.m; } Vc_ALWAYS_INLINE Mask operator&&(const Mask 
&rhs) const { return Mask(m && rhs.m); } Vc_ALWAYS_INLINE Mask operator& (const Mask &rhs) const { return Mask(m && rhs.m); } Vc_ALWAYS_INLINE Mask operator||(const Mask &rhs) const { return Mask(m || rhs.m); } Vc_ALWAYS_INLINE Mask operator| (const Mask &rhs) const { return Mask(m || rhs.m); } Vc_ALWAYS_INLINE Mask operator^ (const Mask &rhs) const { return Mask(m ^ rhs.m); } Vc_ALWAYS_INLINE Mask operator!() const { return Mask(!m); } Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { m &= rhs.m; return *this; } Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { m |= rhs.m; return *this; } Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { m ^= rhs.m; return *this; } Vc_ALWAYS_INLINE bool isFull () const { return m; } Vc_ALWAYS_INLINE bool isNotEmpty() const { return m; } Vc_ALWAYS_INLINE bool isEmpty() const { return !m; } Vc_ALWAYS_INLINE bool isMix () const { return false; } Vc_ALWAYS_INLINE bool data () const { return m; } Vc_ALWAYS_INLINE bool dataI() const { return m; } Vc_ALWAYS_INLINE bool dataD() const { return m; } private: friend reference; static Vc_INTRINSIC bool get(const Mask &o, int) noexcept { return o.m; } template static Vc_INTRINSIC void set(Mask &o, int, U &&v) noexcept( noexcept(std::declval() = std::declval())) { o.m = std::forward(v); } public: Vc_ALWAYS_INLINE reference operator[](size_t i) noexcept { Vc_ASSERT(i == 0); if (i) {} return {*this, 0}; } Vc_ALWAYS_INLINE value_type operator[](size_t i) const noexcept { Vc_ASSERT(i == 0); if (i) {} return m; } Vc_ALWAYS_INLINE int count() const { return m ? 1 : 0; } Vc_ALWAYS_INLINE int firstOne() const { return 0; } Vc_ALWAYS_INLINE int toInt() const { return m ? 1 : 0; } template static Vc_INTRINSIC Mask generate(G &&gen) { return Mask(gen(0)); } Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const { if (amount == 0) { return *this; } else { return Zero(); } } private: bool m; }; template constexpr size_t Mask::Size; template constexpr size_t Mask::MemoryAlignment; } #endif namespace Vc_VERSIONED_NAMESPACE { #define Vc_CURRENT_CLASS_NAME Vector template class Vector { static_assert(std::is_arithmetic::value, "Vector only accepts arithmetic builtin types as template parameter T."); public: using abi = VectorAbi::Scalar; using EntryType = T; using VectorEntryType = EntryType; using value_type = EntryType; using VectorType = EntryType; using vector_type = VectorType; using reference = Detail::ElementReference; protected: VectorType m_data = VectorType(); template using V = Vector; public: typedef Scalar::Mask Mask; using MaskType = Mask; using mask_type = Mask; typedef Mask MaskArgument; typedef Vector AsArg; Vc_ALWAYS_INLINE VectorType &data() { return m_data; } Vc_ALWAYS_INLINE const VectorType &data() const { return m_data; } static constexpr size_t Size = 1; static constexpr size_t MemoryAlignment = alignof(VectorType); using IndexType = fixed_size_simd; using index_type = IndexType; public: Vc_INTRINSIC Vector() = default; static constexpr std::size_t size() { return Size; } explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R; static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); } static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); } static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero() { return Vector(Vc::IndexesFromZero); } template ()(size_t())), value_type>::value>::type> explicit 
Vector(G &&g) : Vector(generate(std::forward(g))) { } static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R; template Vc_INTRINSIC Vector( V x, typename std::enable_if::value, void *>::type = nullptr) : m_data(static_cast(x.data())) { } #if Vc_IS_VERSION_1 template Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "vector types") Vc_INTRINSIC explicit Vector( V x, typename std::enable_if::value, void *>::type = nullptr) : m_data(static_cast(x.data())) { } #endif Vc_INTRINSIC explicit Vector(reference a) : Vector(static_cast(a)) {} Vc_INTRINSIC Vector(EntryType a) : m_data(a) {} template Vc_INTRINSIC Vector(U a, typename std::enable_if::value && !std::is_same::value, void *>::type = nullptr) : Vector(static_cast(a)) { } explicit Vc_INTRINSIC Vector(const EntryType *mem) { load(mem); } template ::value>> explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags) { load(mem, flags); } template ::value || !std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value &&Traits::is_load_store_flag::value>> explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags()) { load(x, flags); } Vc_INTRINSIC void load(const EntryType *mem) { load(mem, DefaultLoadTag()); } template Vc_INTRINSIC enable_if::value, void> load(const EntryType *mem, Flags flags) { load(mem, flags); } private: template struct load_concept : public std::enable_if< (!std::is_integral::value || !std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value && Traits::is_load_store_flag::value, void> {}; public: template Vc_INTRINSIC_L typename load_concept::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R; template < typename U, typename Flags = DefaultStoreTag, typename = enable_if::value &&Traits::is_load_store_flag::value>> Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R; template < typename U, typename Flags = DefaultStoreTag, typename = enable_if::value &&Traits::is_load_store_flag::value>> Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R; Vc_INTRINSIC void store(EntryType *mem) const { store(mem, DefaultStoreTag()); } template ::value>> Vc_INTRINSIC void store(EntryType *mem, Flags flags) const { store(mem, flags); } Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const { store(mem, mask, DefaultStoreTag()); } template ::value>> Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const { store(mem, mask, flags); } Vc_ALWAYS_INLINE void setZero() { m_data = 0; } Vc_ALWAYS_INLINE void setZero(Mask k) { if (k.data()) m_data = 0; } Vc_ALWAYS_INLINE void setZeroInverted(Mask k) { if (!k.data()) m_data = 0; } Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan(Mask m) Vc_INTRINSIC_R; #ifndef Vc_CURRENT_CLASS_NAME #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors." 
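// Usage sketch (hedged, illustrative only): the load/store interface declared
// above accepts an optional flag such as Vc::Aligned or Vc::Unaligned that
// describes the memory argument. Assuming <Vc/Vc> is included:
//
//   #include <Vc/Vc>
//   void add_one(const float *src, float *dst)
//   {
//       Vc::float_v v(src, Vc::Unaligned);   // load from possibly unaligned memory
//       v += Vc::float_v(1.f);
//       v.store(dst, Vc::Unaligned);         // store back with the same flag
//   }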
#endif private: template inline void gatherImplementation(const Common::GatherArguments &); template inline void gatherImplementation(const Common::GatherArguments &, MaskArgument mask); public: #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that can be converted to the " \ "EntryType of this SIMD vector type."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_ private: template inline void scatterImplementation(MT *mem, IT &&indexes) const; template inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const; public: #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that the EntryType of this " \ "SIMD vector type can be converted to."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, 
the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes)); } template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes), mask); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args) const { scatter(args.address, args.indexes); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args, MaskArgument mask) const { scatter(args.address, args.indexes, mask); } #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_ Vc_ALWAYS_INLINE Vector &operator++() { ++m_data; return *this; } Vc_ALWAYS_INLINE Vector &operator--() { --m_data; return *this; } Vc_ALWAYS_INLINE Vector operator++(int) { return m_data++; } Vc_ALWAYS_INLINE Vector operator--(int) { return m_data--; } private: friend reference; Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept { Vc_ASSERT(i == 0); if (i) {} return o.m_data; } template Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept( noexcept(std::declval() = v)) { Vc_ASSERT(i == 0); if (i) {} o.m_data = v; } public: Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept { static_assert(noexcept(reference{std::declval(), int()}), ""); return {*this, int(index)}; } Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept { Vc_ASSERT(index == 0); if (index) {} return m_data; } Vc_ALWAYS_INLINE Mask operator!() const { return Mask(!m_data); } Vc_ALWAYS_INLINE Vector operator~() const { #ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS static_assert(std::is_integral::value, "bit-complement can only be used with Vectors of integral type"); #endif return Vector(~m_data); } Vc_ALWAYS_INLINE Vector operator-() const { return -m_data; } Vc_INTRINSIC Vector Vc_PURE operator+() const { return *this; } #define Vc_OP(symbol) \ Vc_ALWAYS_INLINE Vc_PURE Vector operator symbol(const Vector &x) const { return Vector(m_data symbol x.m_data); } Vc_ALL_SHIFTS(Vc_OP); #undef Vc_OP Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask isNegative() const { return Vc::isnegative(*this); } Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &m) { if (m.data()) m_data = v.m_data; } template Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2 staticCast() const { return V2(static_cast(m_data)); } Vc_ALWAYS_INLINE Common::WriteMaskedVector operator()(Mask m) { return {*this, m}; } Vc_ALWAYS_INLINE EntryType min() const { return m_data; } Vc_ALWAYS_INLINE EntryType max() const { return m_data; } Vc_ALWAYS_INLINE EntryType product() const { return m_data; } Vc_ALWAYS_INLINE EntryType sum() const { return m_data; } Vc_ALWAYS_INLINE Vector partialSum() const { return *this; } Vc_ALWAYS_INLINE EntryType min(Mask) const { return m_data; } Vc_ALWAYS_INLINE EntryType max(Mask) const { return m_data; } Vc_ALWAYS_INLINE EntryType product(Mask m) const { if (m.data()) { return m_data; } else { return EntryType(1); } } Vc_ALWAYS_INLINE EntryType sum(Mask m) const { if (m.data()) return m_data; return static_cast(0); } Vc_INTRINSIC Vector Vc_VDECL shifted(int amount, Vector shiftIn) const { Vc_ASSERT(amount >= -1 && amount <= 1); return amount == 0 ? *this : shiftIn; } Vc_INTRINSIC Vector shifted(int amount) const { return amount == 0 ? 
*this : Zero(); } Vc_INTRINSIC Vector rotated(int) const { return *this; } Vc_INTRINSIC Vector reversed() const { return *this; } Vc_INTRINSIC Vector sorted() const { return *this; } template void callWithValuesSorted(F &&f) { f(m_data); } template Vc_INTRINSIC void call(F &&f) const { f(m_data); } template Vc_INTRINSIC void call(F &&f, Mask mask) const { if (mask.data()) { f(m_data); } } template Vc_INTRINSIC Vector apply(F &&f) const { return Vector(f(m_data)); } template Vc_INTRINSIC Vector apply(F &&f, Mask mask) const { if (mask.data()) { return Vector(f(m_data)); } else { return *this; } } template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { m_data = f(0); } Vc_INTRINSIC void fill(EntryType (&f)()) { m_data = f(); } template static Vc_INTRINSIC Vector generate(G gen) { return gen(0); } Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector Vc_VDECL copySign(Vector x) const { return Vc::copysign(*this, x); } Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const { return Vc::exponent(*this); } Vc_INTRINSIC Vector Vc_VDECL interleaveLow(Vector) const { return *this; } Vc_INTRINSIC Vector Vc_VDECL interleaveHigh(Vector x) const { return x; } }; #undef Vc_CURRENT_CLASS_NAME template constexpr size_t Vector::Size; template constexpr size_t Vector::MemoryAlignment; #define Vc_OP(symbol) \ template () symbol## = std::declval())> \ Vc_INTRINSIC enable_if>::value, \ Vector> \ &operator symbol##=(Vector &lhs, U &&rhs) \ { \ lhs.data() symbol## = Vector(std::forward(rhs)).data(); \ return lhs; \ } Vc_ALL_SHIFTS(Vc_OP); #undef Vc_OP #define Vc_CONDITIONAL_ASSIGN(name_,op_) \ template \ Vc_INTRINSIC enable_if conditional_assign( \ Vector &lhs, M &&mask, U &&rhs) \ { \ if (mask.isFull()) { \ lhs op_ std::forward(rhs); \ } \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN( Assign, =); Vc_CONDITIONAL_ASSIGN( PlusAssign, +=); Vc_CONDITIONAL_ASSIGN( MinusAssign, -=); Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=); Vc_CONDITIONAL_ASSIGN( DivideAssign, /=); Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=); Vc_CONDITIONAL_ASSIGN( XorAssign, ^=); Vc_CONDITIONAL_ASSIGN( AndAssign, &=); Vc_CONDITIONAL_ASSIGN( OrAssign, |=); Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=); Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=); #undef Vc_CONDITIONAL_ASSIGN #define Vc_CONDITIONAL_ASSIGN(name_,expr_) \ template \ Vc_INTRINSIC enable_if> \ conditional_assign(Vector &lhs, M &&mask) \ { \ return mask.isFull() ? 
(expr_) : lhs; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs++); Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs); Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs--); Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs); #undef Vc_CONDITIONAL_ASSIGN } #include #ifndef VC_COMMON_CONST_DATA_H_ #define VC_COMMON_CONST_DATA_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Common { alignas(64) extern unsigned int RandomState[]; alignas(32) extern const unsigned int AllBitsSet[8]; } } #endif #ifndef VC_COMMON_WHERE_H_ #define VC_COMMON_WHERE_H_ namespace Vc_VERSIONED_NAMESPACE { namespace WhereImpl { template struct MaskedLValue { typedef _Mask Mask; typedef _LValue LValue; const Mask &mask; LValue &lhs; constexpr MaskedLValue(const Mask &m, LValue &l) : mask(m), lhs(l) {} MaskedLValue(const MaskedLValue &) = delete; #ifndef __cpp_guaranteed_copy_elision constexpr MaskedLValue(MaskedLValue &&) = default; #endif template Vc_ALWAYS_INLINE void operator =(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator +=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator -=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator *=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator /=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator %=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator ^=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator &=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator |=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator<<=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } template Vc_ALWAYS_INLINE void operator>>=(T &&rhs) { conditional_assign(lhs, mask, std::forward(rhs)); } Vc_ALWAYS_INLINE void operator++() { conditional_assign(lhs, mask); } Vc_ALWAYS_INLINE void operator++(int) { conditional_assign(lhs, mask); } Vc_ALWAYS_INLINE void operator--() { conditional_assign(lhs, mask); } Vc_ALWAYS_INLINE void operator--(int) { conditional_assign(lhs, mask); } template Vc_INTRINSIC void operator=(Common::SubscriptOperation &&rhs) { lhs.gather(std::move(rhs).gatherArguments(), mask); } template void operator+=(Common::SubscriptOperation &&rhs) = delete; template void operator-=(Common::SubscriptOperation &&rhs) = delete; template void operator*=(Common::SubscriptOperation &&rhs) = delete; template void operator/=(Common::SubscriptOperation &&rhs) = delete; template void operator%=(Common::SubscriptOperation &&rhs) = delete; template void operator^=(Common::SubscriptOperation &&rhs) = delete; template void operator&=(Common::SubscriptOperation &&rhs) = delete; template void operator|=(Common::SubscriptOperation &&rhs) = delete; template void operator<<=(Common::SubscriptOperation &&rhs) = delete; template void operator>>=(Common::SubscriptOperation &&rhs) = delete; }; template struct MaskedLValue<_Mask, Common::SubscriptOperation> { typedef _Mask Mask; typedef Common::SubscriptOperation SO; const Mask &mask; SO &lhs; template using Decay = typename std::decay::type; constexpr MaskedLValue(const Mask &m, SO &&l) : mask(m), lhs(l) {} MaskedLValue(const MaskedLValue &) = delete; #ifndef __cpp_guaranteed_copy_elision 
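// Usage sketch (hedged, illustrative only): the MaskedLValue proxies in this
// section implement the Vc::where interface declared further below; masked
// compound assignments are forwarded to conditional_assign. Assuming <Vc/Vc>:
//
//   #include <Vc/Vc>
//   void saturate(Vc::float_v &x)
//   {
//       Vc::where(x > Vc::float_v(1.f)) | x = Vc::float_v(1.f);  // assign only where the mask is true
//       (Vc::where(x < Vc::float_v(0.f)) | x)++;                 // masked increment through the proxy
//   }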
constexpr MaskedLValue(MaskedLValue &&) = default; #endif template Vc_ALWAYS_INLINE void operator=(T &&rhs) && { std::forward(rhs).scatter(std::move(lhs).scatterArguments(), mask); } }; template struct MaskedLValue { typedef bool Mask; typedef _LValue LValue; const Mask &mask; LValue &lhs; constexpr MaskedLValue(const Mask &m, LValue &l) : mask(m), lhs(l) {} MaskedLValue(const MaskedLValue &) = delete; constexpr MaskedLValue(MaskedLValue &&) = default; template Vc_ALWAYS_INLINE void operator =(T &&rhs) { if (mask) lhs = std::forward(rhs); } template Vc_ALWAYS_INLINE void operator +=(T &&rhs) { if (mask) lhs += std::forward(rhs); } template Vc_ALWAYS_INLINE void operator -=(T &&rhs) { if (mask) lhs -= std::forward(rhs); } template Vc_ALWAYS_INLINE void operator *=(T &&rhs) { if (mask) lhs *= std::forward(rhs); } template Vc_ALWAYS_INLINE void operator /=(T &&rhs) { if (mask) lhs /= std::forward(rhs); } template Vc_ALWAYS_INLINE void operator %=(T &&rhs) { if (mask) lhs %= std::forward(rhs); } template Vc_ALWAYS_INLINE void operator ^=(T &&rhs) { if (mask) lhs ^= std::forward(rhs); } template Vc_ALWAYS_INLINE void operator &=(T &&rhs) { if (mask) lhs &= std::forward(rhs); } template Vc_ALWAYS_INLINE void operator |=(T &&rhs) { if (mask) lhs |= std::forward(rhs); } template Vc_ALWAYS_INLINE void operator<<=(T &&rhs) { if (mask) lhs <<= std::forward(rhs); } template Vc_ALWAYS_INLINE void operator>>=(T &&rhs) { if (mask) lhs >>= std::forward(rhs); } Vc_ALWAYS_INLINE void operator++() { if (mask) ++lhs; } Vc_ALWAYS_INLINE void operator++(int) { if (mask) lhs++; } Vc_ALWAYS_INLINE void operator--() { if (mask) --lhs; } Vc_ALWAYS_INLINE void operator--(int) { if (mask) lhs--; } }; template struct WhereMask { typedef _Mask Mask; const Mask &mask; constexpr WhereMask(const Mask &m) : mask(m) {} WhereMask(const WhereMask &) = delete; template constexpr Vc_WARN_UNUSED_RESULT MaskedLValue> operator|(Common::SubscriptOperation &&lhs) const { static_assert(!std::is_const::value, "masked scatter to constant memory not possible."); return {mask, std::move(lhs)}; } template constexpr Vc_WARN_UNUSED_RESULT MaskedLValue operator|(T &&lhs) const { static_assert(std::is_lvalue_reference::value, "Syntax error: Incorrect use of Vc::where. Maybe operator precedence got you by surprise. Examples of correct usage:\n" " Vc::where(x < 2) | x += 1;\n" " (Vc::where(x < 2) | x)++;\n" " Vc::where(x < 2)(x) += 1;\n" " Vc::where(x < 2)(x)++;\n" ); return { mask, lhs }; } template () = std::declval())> constexpr Vc_WARN_UNUSED_RESULT MaskedLValue operator()(T &&lhs) const { return operator|(std::forward(lhs)); } }; } template constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask where(const M &mask) { return { mask }; } template constexpr Vc_WARN_UNUSED_RESULT WhereImpl::MaskedLValue where(const M &mask, V &value) { return {mask, value}; } template constexpr Vc_WARN_UNUSED_RESULT WhereImpl::MaskedLValue> where(const M &mask, Common::SubscriptOperation &&value) { return {mask, std::move(value)}; } template constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask _if(const M &m) { return { m }; } } #endif #ifndef VC_COMMON_TRANSPOSE_H_ #define VC_COMMON_TRANSPOSE_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace Common { template struct TransposeProxy { TransposeProxy(const Inputs &... inputs) : in{inputs...} {} std::tuple in; }; template struct TransposeTag { }; } template Common::TransposeProxy transpose(Vs... 
vs) { return {vs...}; } } #endif #ifndef VC_SCALAR_OPERATORS_H_ #define VC_SCALAR_OPERATORS_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Detail { #define Vc_OP(op_) \ template \ Vc_INTRINSIC Scalar::Mask operator op_(Scalar::Vector a, Scalar::Vector b) \ { \ return Scalar::Mask(a.data() op_ b.data()); \ } Vc_ALL_COMPARES(Vc_OP); #undef Vc_OP #define Vc_OP(symbol) \ template \ Vc_INTRINSIC enable_if::value, Scalar::Vector> \ operator symbol(Scalar::Vector a, Scalar::Vector b) \ { \ return a.data() symbol b.data(); \ } \ template \ Vc_INTRINSIC enable_if::value, Scalar::Vector> \ operator symbol(Scalar::Vector &lhs, Scalar::Vector rhs) \ { \ using uinta = \ MayAlias::type>; \ uinta *left = reinterpret_cast(&lhs.data()); \ const uinta *right = reinterpret_cast(&rhs.data()); \ *left symbol## = *right; \ return lhs; \ } Vc_ALL_BINARY(Vc_OP); #undef Vc_OP template Vc_INTRINSIC Scalar::Vector operator+(Scalar::Vector a, Scalar::Vector b) { return a.data() + b.data(); } template Vc_INTRINSIC Scalar::Vector operator-(Scalar::Vector a, Scalar::Vector b) { return a.data() - b.data(); } template Vc_INTRINSIC Scalar::Vector operator*(Scalar::Vector a, Scalar::Vector b) { return a.data() * b.data(); } template Vc_INTRINSIC Scalar::Vector operator/(Scalar::Vector a, Scalar::Vector b) { return a.data() / b.data(); } template Vc_INTRINSIC Scalar::Vector operator%(Scalar::Vector a, Scalar::Vector b) { return a.data() % b.data(); } } } #endif namespace Vc_VERSIONED_NAMESPACE { template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerZero) : m_data(0) { } template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne) : m_data(1) { } template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerIndexesFromZero) : m_data(0) { } template template Vc_INTRINSIC typename Vector:: #ifndef Vc_MSVC template #endif load_concept::type Vector::load(const U *mem, Flags) { m_data = mem[0]; } template template Vc_INTRINSIC void Vector::store(U *mem, Flags) const { mem[0] = m_data; } template template Vc_INTRINSIC void Vector::store(U *mem, Mask mask, Flags) const { if (mask.data()) mem[0] = m_data; } template template Vc_ALWAYS_INLINE void Vector::gatherImplementation( const Common::GatherArguments &args) { m_data = args.address[Scale * args.indexes[0]]; } template template Vc_ALWAYS_INLINE void Vector::gatherImplementation( const Common::GatherArguments &args, MaskArgument mask) { if (mask.data()) { m_data = args.address[Scale * args.indexes[0]]; } } template template Vc_ALWAYS_INLINE void Vector::scatterImplementation(MT *mem, IT &&indexes) const { mem[indexes[0]] = m_data; } template template Vc_ALWAYS_INLINE void Vector::scatterImplementation( MT *mem, IT &&indexes, MaskArgument mask) const { if (mask.data()) { mem[indexes[0]] = m_data; } } Vc_INTRINSIC Vc_CONST Scalar::float_v exponent(Scalar::float_v x) { Vc_ASSERT(x.data() >= 0.f); union { float f; int i; } value; value.f = x.data(); return Scalar::float_v(static_cast((value.i >> 23) - 0x7f)); } Vc_INTRINSIC Vc_CONST Scalar::double_v Vc_VDECL exponent(Scalar::double_v x) { Vc_ASSERT(x.data() >= 0.); union { double f; long long i; } value; value.f = x.data(); return Scalar::double_v(static_cast((value.i >> 52) - 0x3ff)); } static Vc_ALWAYS_INLINE void _doRandomStep(Scalar::uint_v &state0, Scalar::uint_v &state1) { using Scalar::uint_v; state0.load(&Common::RandomState[0]); state1.load(&Common::RandomState[uint_v::Size]); Detail::operator+(Detail::operator*(state1, uint_v(0xdeece66du)), uint_v(11)) .store(&Common::RandomState[uint_v::Size]); 
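// Note: the store() above and the one that follows implement one step of a
// pair of 32-bit linear congruential generators: state1 is advanced as
// state1 * 0xdeece66d + 11, and state0 is advanced with the same recurrence
// and additionally XORed with the high bits of the previous state1
// (state1 >> 16) before both are written back to Common::RandomState.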
uint_v(Detail::operator+(Detail::operator*(state0, uint_v(0xdeece66du)), uint_v(11)) .data() ^ (state1.data() >> 16)) .store(&Common::RandomState[0]); } template Vc_INTRINSIC Vector Vector::Random() { Scalar::uint_v state0, state1; _doRandomStep(state0, state1); return Vector(static_cast(state0.data())); } template<> Vc_INTRINSIC Scalar::float_v Scalar::float_v::Random() { Scalar::uint_v state0, state1; _doRandomStep(state0, state1); union { unsigned int i; float f; } x; x.i = (state0.data() & 0x0fffffffu) | 0x3f800000u; return Scalar::float_v(x.f - 1.f); } template<> Vc_INTRINSIC Scalar::double_v Scalar::double_v::Random() { typedef unsigned long long uint64 Vc_MAY_ALIAS; uint64 state0 = *reinterpret_cast(&Common::RandomState[8]); state0 = (state0 * 0x5deece66dull + 11) & 0x000fffffffffffffull; *reinterpret_cast(&Common::RandomState[8]) = state0; union { unsigned long long i; double f; } x; x.i = state0 | 0x3ff0000000000000ull; return Scalar::double_v(x.f - 1.); } Vc_INTRINSIC Vc_CONST Scalar::float_m isnegative(Scalar::float_v x) { static_assert(sizeof(float) == sizeof(unsigned int), "This code assumes float and unsigned int have the same number of " "Bytes. Please file a bug report if this is a problem."); union { float f; unsigned int i; } u; u.f = x.data(); return Scalar::float_m(0u != (u.i & 0x80000000u)); } Vc_INTRINSIC Vc_CONST Scalar::double_m Vc_VDECL isnegative(Scalar::double_v x) { static_assert(sizeof(double) == sizeof(unsigned long long), "This code assumes double and unsigned long long have the same number " "of Bytes. Please file a bug report if this is a problem."); union { double d; unsigned long long l; } u; u.d = x.data(); return Scalar::double_m(0ull != (u.l & 0x8000000000000000ull)); } template Vc_INTRINSIC void Vector::setQnan() { union { float f; unsigned int i; } u; u.i = 0xffffffffu; m_data = u.f; } template<> Vc_INTRINSIC void Scalar::double_v::setQnan() { union { double d; unsigned long long l; } u; u.l = 0xffffffffffffffffull; m_data = u.d; } template Vc_INTRINSIC void Vector::setQnan(Mask m) { if (m.data()) { setQnan(); } } template<> Vc_INTRINSIC void Scalar::double_v::setQnan(Scalar::double_v::Mask m) { if (m.data()) { setQnan(); } } namespace Common { Vc_ALWAYS_INLINE void transpose_impl(TransposeTag<1, 1>, Scalar::float_v *Vc_RESTRICT r[], const TransposeProxy &proxy) { *r[0] = std::get<0>(proxy.in).data(); } } } #ifndef VC_SCALAR_SIMD_CAST_H_ #define VC_SCALAR_SIMD_CAST_H_ #ifndef VC_COMMON_SIMD_CAST_H_ #define VC_COMMON_SIMD_CAST_H_ #include template void simd_cast(); namespace Vc_VERSIONED_NAMESPACE { template Vc_INTRINSIC Vc_CONST To simd_cast(From &&x, enable_if>::value> = nullarg) { return std::forward(x); } template Vc_INTRINSIC Vc_CONST To simd_cast() { return To(); } } #endif #ifndef VC_SCALAR_TYPE_TRAITS_H_ #define VC_SCALAR_TYPE_TRAITS_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Scalar { namespace Traits { template struct is_vector : public std::false_type {}; template struct is_vector> : public std::true_type {}; template struct is_mask : public std::false_type {}; template struct is_mask> : public std::true_type {}; } } } #endif namespace Vc_VERSIONED_NAMESPACE { template Vc_INTRINSIC Vc_CONST To simd_cast(Scalar::Vector x, enable_if::value> = nullarg) { return static_cast(x.data()); } template Vc_INTRINSIC Vc_CONST To simd_cast(Scalar::Mask x, enable_if::value> = nullarg) { return static_cast(x.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast( T &&x, enable_if::value && Scalar::is_vector::value> = nullarg) { return 
Return(x[offset]); } template Vc_INTRINSIC Vc_CONST enable_if::value && !Scalar::is_vector::value, Return> simd_cast(Scalar::Vector x) { Return r{}; r[0] = static_cast(x.data()); return r; } template Vc_INTRINSIC Vc_CONST Return simd_cast( T &&x, enable_if::value && Scalar::is_mask::value> = nullarg) { return Return(bool(x[offset])); } template Vc_INTRINSIC Vc_CONST enable_if< offset == 0 && Traits::is_simd_mask::value && !Scalar::is_mask::value, Return> simd_cast(Scalar::Mask x) { Return r(false); r[0] = x[0]; return r; } } #endif #endif #if defined(Vc_IMPL_SSE) #ifndef VC_SSE_VECTOR_H_ #define VC_SSE_VECTOR_H_ #ifndef VC_SSE_INTRINSICS_H_ #define VC_SSE_INTRINSICS_H_ #ifdef Vc_MSVC #include #else #include #endif #ifndef VC_COMMON_STORAGE_H_ #define VC_COMMON_STORAGE_H_ #ifndef VC_COMMON_ALIASINGENTRYHELPER_H_ #define VC_COMMON_ALIASINGENTRYHELPER_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Common { template class AliasingEntryHelper { private: typedef typename StorageType::EntryType T; #ifdef Vc_ICC StorageType *const m_storage; const int m_index; public: Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {} Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default; Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default; Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) { m_storage->assign(m_index, rhs); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_storage->assign(m_index, x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; } #define m_data m_storage->read(m_index) #else typedef T A Vc_MAY_ALIAS; A &m_data; public: template Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast(d)) {} Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {} Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) { m_data = rhs.m_data; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data = x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; } Vc_ALWAYS_INLINE 
AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; } Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; } #endif Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; } Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast(m_data) == x; } Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast(m_data) != x; } Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast(m_data) <= x; } Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast(m_data) >= x; } Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast(m_data) < x; } Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast(m_data) > x; } Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast(m_data); } Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast(m_data); } Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast(m_data) + x; } Vc_ALWAYS_INLINE Vc_PURE T operator-(T x) const { return static_cast(m_data) - x; } Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast(m_data) / x; } Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast(m_data) * x; } Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast(m_data) | x; } Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast(m_data) & x; } Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast(m_data) ^ x; } Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast(m_data) % x; } #ifdef m_data #undef m_data #endif }; } } #endif #ifndef VC_COMMON_MASKENTRY_H_ #define VC_COMMON_MASKENTRY_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Common { namespace { template struct MaskBoolStorage; template<> struct MaskBoolStorage<1> { typedef std::int8_t type; }; template<> struct MaskBoolStorage<2> { typedef std::int16_t type; }; template<> struct MaskBoolStorage<4> { typedef std::int32_t type; }; template<> struct MaskBoolStorage<8> { typedef std::int64_t type; }; } template class MaskBool { typedef typename MaskBoolStorage::type storage_type Vc_MAY_ALIAS; storage_type data; public: constexpr MaskBool(bool x) noexcept : data(x ? -1 : 0) {} Vc_ALWAYS_INLINE MaskBool &operator=(bool x) noexcept { data = x ? -1 : 0; return *this; } template ::value && std::is_fundamental::value)>> Vc_ALWAYS_INLINE MaskBool &operator=(T x) noexcept { data = reinterpret_cast(x); return *this; } Vc_ALWAYS_INLINE MaskBool(const MaskBool &) noexcept = default; Vc_ALWAYS_INLINE MaskBool &operator=(const MaskBool &) noexcept = default; template ::value || (std::is_fundamental::value && sizeof(storage_type) == sizeof(T)))>> constexpr operator T() const noexcept { return std::is_same::value ? 
T((data & 1) != 0) : aliasing_cast(data); } } Vc_MAY_ALIAS; template ::value &&std::is_convertible::value, int>::type = 0> constexpr bool operator==(A &&a, B &&b) { return static_cast(a) == static_cast(b); } template ::value &&std::is_convertible::value, int>::type = 0> constexpr bool operator!=(A &&a, B &&b) { return static_cast(a) != static_cast(b); } } } #endif #ifdef Vc_IMPL_AVX #ifndef VC_AVX_INTRINSICS_H_ #define VC_AVX_INTRINSICS_H_ extern "C" { #include #if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC) #include #endif } #ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H_ #define VC_COMMON_FIX_CLANG_EMMINTRIN_H_ #if (defined Vc_CLANG && Vc_CLANG < 0x30700) || (defined Vc_APPLECLANG && Vc_APPLECLANG < 0x70000) #ifdef _mm_slli_si128 #undef _mm_slli_si128 #define _mm_slli_si128(a,count) __extension__ ({ \ (__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); }) #endif #ifdef _mm_srli_si128 #undef _mm_srli_si128 #define _mm_srli_si128(a,count) __extension__ ({ \ (__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); }) #endif #ifdef _mm_shuffle_epi32 #undef _mm_shuffle_epi32 #define _mm_shuffle_epi32(a,imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \ (imm) & 0x3, ((imm) & 0xc) >> 2, \ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) #endif #ifdef _mm_shufflelo_epi16 #undef _mm_shufflelo_epi16 #define _mm_shufflelo_epi16(a,imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \ (imm) & 0x3, ((imm) & 0xc) >> 2, \ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 4, 5, 6, 7); }) #endif #ifdef _mm_shufflehi_epi16 #undef _mm_shufflehi_epi16 #define _mm_shufflehi_epi16(a,imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \ 0, 1, 2, 3, \ 4 + (((imm) & 0x03) >> 0), \ 4 + (((imm) & 0x0c) >> 2), \ 4 + (((imm) & 0x30) >> 4), \ 4 + (((imm) & 0xc0) >> 6)); }) #endif #ifdef _mm_shuffle_pd #undef _mm_shuffle_pd #define _mm_shuffle_pd(a,b,i) __extension__ ({ \ __builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); }) #endif #endif #endif #ifndef VC_AVX_CONST_DATA_H_ #define VC_AVX_CONST_DATA_H_ namespace Vc_VERSIONED_NAMESPACE { namespace AVX { alignas(64) extern const unsigned int _IndexesFromZero32[ 8]; alignas(16) extern const unsigned short _IndexesFromZero16[16]; alignas(16) extern const unsigned char _IndexesFromZero8 [32]; struct alignas(64) c_general { static const float oneFloat; static const unsigned int absMaskFloat[2]; static const unsigned int signMaskFloat[2]; static const unsigned int highMaskFloat; static const unsigned short minShort[2]; static const unsigned short one16[2]; static const float _2power31; static const double oneDouble; static const unsigned long long frexpMask; static const unsigned long long highMaskDouble; }; template struct c_trig { alignas(64) static const T data[]; }; #ifndef Vc_MSVC template <> alignas(64) const float c_trig::data[]; template <> alignas(64) const double c_trig::data[]; #endif template struct c_log { typedef float floatAlias Vc_MAY_ALIAS; static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast(&data[i]); } alignas(64) static const unsigned int data[21]; }; #ifndef Vc_MSVC template<> alignas(64) const unsigned int c_log::data[21]; #endif template<> struct c_log { enum VectorSize { Size = 16 / sizeof(double) }; typedef double doubleAlias Vc_MAY_ALIAS; static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast(&data[i]); } 
alignas(64) static const unsigned long long data[21]; }; } } namespace Vc_VERSIONED_NAMESPACE { namespace AVX2 { using AVX::_IndexesFromZero8; using AVX::_IndexesFromZero16; using AVX::_IndexesFromZero32; using AVX::c_general; using AVX::c_trig; using AVX::c_log; } } #endif #include #if (defined Vc_CLANG && Vc_CLANG >= 0x30900 && Vc_CLANG < 0x70000) #ifdef _mm256_permute2f128_si256 #undef _mm256_permute2f128_si256 #define _mm256_permute2f128_si256(V1,V2,M) __extension__ ({ \ (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ (__v8si)(__m256i)(V2), (char)(M)); }) #endif #ifdef _mm256_permute2f128_ps #undef _mm256_permute2f128_ps #define _mm256_permute2f128_ps(V1,V2,M) __extension__ ({ \ (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ (__v8sf)(__m256)(V2), (char)(M)); }) #endif #ifdef _mm256_permute2x128_si256 #undef _mm256_permute2x128_si256 #define _mm256_permute2x128_si256(V1,V2,M) __extension__ ({ \ (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); }) #endif #endif namespace Vc_VERSIONED_NAMESPACE { namespace AvxIntrinsics { using AVX::c_general; using AVX::_IndexesFromZero32; using AVX::_IndexesFromZero16; using AVX::_IndexesFromZero8; typedef __m128 m128 ; typedef __m128d m128d; typedef __m128i m128i; typedef __m256 m256 ; typedef __m256d m256d; typedef __m256i m256i; #ifdef Vc_GCC static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) * static_cast<__v4df>(b)); } static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) + static_cast<__v4df>(b)); } static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast(static_cast<__v4df>(a) - static_cast<__v4df>(b)); } static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); } static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); } static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); } #endif static Vc_INTRINSIC m256d Vc_CONST set1_pd (double a) { return _mm256_set1_pd (a); } static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a) { return _mm256_set1_epi32(a); } static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast(Common::AllBitsSet))); } static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST m256 setallone_ps() { return _mm256_load_ps(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC m256i Vc_CONST setone_epi8 () { return _mm256_set1_epi8(1); } static Vc_INTRINSIC m256i Vc_CONST setone_epu8 () { return setone_epi8(); } static Vc_INTRINSIC m256i Vc_CONST setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast(c_general::one16))); } static Vc_INTRINSIC m256i Vc_CONST setone_epu16() { return setone_epi16(); } static Vc_INTRINSIC m256i Vc_CONST setone_epi32() { return 
_mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu32() { return setone_epi32(); }
static Vc_INTRINSIC m256 Vc_CONST setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); }
static Vc_INTRINSIC m256d Vc_CONST setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); }
static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::absMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::absMaskFloat[1])); }
static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::signMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1])); }
static Vc_INTRINSIC m256 Vc_CONST set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m128 Vc_CONST _mm_set2power31_ps() { return _mm_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
template <int i> static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x) { return _mm_extract_epi32(x, i); }
template <int offset> Vc_INTRINSIC __m256 insert128(__m256 a, __m128 b) { return _mm256_insertf128_ps(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) {
#ifdef Vc_IMPL_AVX2
    return _mm256_inserti128_si256(a, b, offset);
#else
    return _mm256_insertf128_si256(a, b, offset);
#endif
}
template <int offset> Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); }
template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
#ifdef Vc_IMPL_AVX2
    return _mm256_extracti128_si256(a, offset);
#else
    return _mm256_extractf128_si256(a, offset);
#endif
}
#ifdef Vc_GCC
Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a == b); }
Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a != b); }
Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a < b); }
Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a >= b); }
Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a <= b); }
Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return
reinterpret_cast<__m256d>(a > b); } Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a == b); } Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a != b); } Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a < b); } Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a >= b); } Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a <= b); } Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a > b); } #else Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } #endif Vc_INTRINSIC __m256d cmpnlt_pd (__m256d a, __m256d b) { return cmpge_pd(a, b); } Vc_INTRINSIC __m256d cmpnle_pd (__m256d a, __m256d b) { return cmpgt_pd(a, b); } Vc_INTRINSIC __m256 cmpnlt_ps (__m256 a, __m256 b) { return cmpge_ps(a, b); } Vc_INTRINSIC __m256 cmpnle_ps (__m256 a, __m256 b) { return cmpgt_ps(a, b); } Vc_INTRINSIC __m256d cmpord_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); } Vc_INTRINSIC __m256d cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); } Vc_INTRINSIC __m256 cmpord_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); } Vc_INTRINSIC __m256 cmpunord_ps(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); } #if defined(Vc_IMPL_XOP) static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); } static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); } #else static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16())); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2) { return _mm256_alignr_epi8(s1, s2, shift); } #else template Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2) { return insert128<1>( _mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1), _mm256_castsi256_si128(s2), shift)), _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift)); } #endif #ifdef Vc_IMPL_AVX2 #define Vc_AVX_TO_SSE_2_NEW(name) \ Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, 
__m256i b0) \ { \ return _mm256_##name(a0, b0); \ } #define Vc_AVX_TO_SSE_256_128(name) \ Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \ { \ return _mm256_##name(a0, b0); \ } #define Vc_AVX_TO_SSE_1i(name) \ template Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \ { \ return _mm256_##name(a0, i); \ } #define Vc_AVX_TO_SSE_1(name) \ Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); } #define Vc_AVX_TO_SSE_1_128(name,shift__) \ Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); } #else #define Vc_AVX_TO_SSE_1(name) \ Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \ { \ __m128i a1 = extract128<1>(a0); \ __m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \ __m128i r1 = _mm_##name(a1); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #define Vc_AVX_TO_SSE_1_128(name,shift__) \ Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) \ { \ __m128i r0 = _mm_##name(a0); \ __m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__)); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #define Vc_AVX_TO_SSE_2_NEW(name) \ Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \ { \ m128i a1 = extract128<1>(a0); \ m128i b1 = extract128<1>(b0); \ m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ m128i r1 = _mm_##name(a1, b1); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #define Vc_AVX_TO_SSE_256_128(name) \ Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \ { \ m128i a1 = extract128<1>(a0); \ m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \ m128i r1 = _mm_##name(a1, b0); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #define Vc_AVX_TO_SSE_1i(name) \ template Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \ { \ m128i a1 = extract128<1>(a0); \ m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \ m128i r1 = _mm_##name(a1, i); \ return insert128<1>(_mm256_castsi128_si256(r0), r1); \ } #endif Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); } Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); } Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); } Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); } Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); } Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); } Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); } Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); } Vc_AVX_TO_SSE_1i(slli_epi16) Vc_AVX_TO_SSE_1i(slli_epi32) Vc_AVX_TO_SSE_1i(slli_epi64) Vc_AVX_TO_SSE_1i(srai_epi16) Vc_AVX_TO_SSE_1i(srai_epi32) Vc_AVX_TO_SSE_1i(srli_epi16) Vc_AVX_TO_SSE_1i(srli_epi32) Vc_AVX_TO_SSE_1i(srli_epi64) Vc_AVX_TO_SSE_256_128(sll_epi16) Vc_AVX_TO_SSE_256_128(sll_epi32) Vc_AVX_TO_SSE_256_128(sll_epi64) Vc_AVX_TO_SSE_256_128(srl_epi16) Vc_AVX_TO_SSE_256_128(srl_epi32) Vc_AVX_TO_SSE_256_128(srl_epi64) Vc_AVX_TO_SSE_256_128(sra_epi16) Vc_AVX_TO_SSE_256_128(sra_epi32) Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8) Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16) Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32) Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64) Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8) Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16) Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32) Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64) Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16) Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16) 
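// The Vc_AVX_TO_SSE_* wrappers above generate one function per listed
// intrinsic. With AVX2 they forward directly to the _mm256_* form; without
// AVX2 each 256-bit integer argument is split with extract128, the _mm_*
// intrinsic runs on both 128-bit halves, and insert128<1> recombines the
// results. E.g. on an AVX-only build, cmpeq_epi32(a, b) performs two
// _mm_cmpeq_epi32 calls, one per half.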
Vc_AVX_TO_SSE_2_NEW(add_epi16) Vc_AVX_TO_SSE_2_NEW(add_epi32) Vc_AVX_TO_SSE_2_NEW(add_epi64) Vc_AVX_TO_SSE_2_NEW(sub_epi16) Vc_AVX_TO_SSE_2_NEW(sub_epi32) Vc_AVX_TO_SSE_2_NEW(mullo_epi16) Vc_AVX_TO_SSE_2_NEW(sign_epi16) Vc_AVX_TO_SSE_2_NEW(sign_epi32) Vc_AVX_TO_SSE_2_NEW(min_epi8) Vc_AVX_TO_SSE_2_NEW(max_epi8) Vc_AVX_TO_SSE_2_NEW(min_epu16) Vc_AVX_TO_SSE_2_NEW(max_epu16) Vc_AVX_TO_SSE_2_NEW(min_epi32) Vc_AVX_TO_SSE_2_NEW(max_epi32) Vc_AVX_TO_SSE_2_NEW(min_epu32) Vc_AVX_TO_SSE_2_NEW(max_epu32) Vc_AVX_TO_SSE_2_NEW(mullo_epi32) Vc_AVX_TO_SSE_1(abs_epi8) Vc_AVX_TO_SSE_1(abs_epi16) Vc_AVX_TO_SSE_1(abs_epi32) Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8) Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4) Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2) Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8) Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4) Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8) Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8) Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4) Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2) Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8) Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4) Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8) #ifndef Vc_IMPL_AVX2 static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0) { m128i a1 = extract128<1>(a0); return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0)); } template Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0) { m128i a1 = extract128<1>(a0); m128i b1 = extract128<1>(b0); m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff); m128i r1 = _mm_blend_epi16(a1, b1, m >> 8); return insert128<1>(_mm256_castsi128_si256(r0), r1); } Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) { m128i a1 = extract128<1>(a0); m128i b1 = extract128<1>(b0); m128i m1 = extract128<1>(m0); m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0)); m128i r1 = _mm_blendv_epi8(a1, b1, m1); return insert128<1>(_mm256_castsi128_si256(r0), r1); } #else static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); } static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); } static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); } static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); } Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) { return _mm256_blendv_epi8(a0, b0, m0); } Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0) { return _mm256_movemask_epi8(a0); } #endif static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) { return cmpgt_epi64(b, a); } static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) { return cmpgt_epi32(b, a); } static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) { return cmpgt_epi16(b, a); } static Vc_INTRINSIC m256i 
cmplt_epi8(__m256i a, __m256i b) { return cmpgt_epi8(b, a); }
static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) { return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8())); }
#if defined(Vc_IMPL_XOP)
Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
Vc_AVX_TO_SSE_2_NEW(comlt_epu16)
Vc_AVX_TO_SSE_2_NEW(comgt_epu16)
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); }
#else
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32()))); return cmplt_epi32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32()))); return cmpgt_epi32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16()))); return cmplt_epi16(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) { m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16()))); m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16()))); return cmpgt_epi16(a, b); }
#endif
static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) { _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v); }
static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) { _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v); }
static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) {
#ifdef Vc_IMPL_AVX2
    _mm256_maskstore_epi32(mem, mask, v);
#else
    _mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
#endif
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) { _mm256_maskstore(reinterpret_cast<int *>(mem), mask, v); }
static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) { using namespace AVX; _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast<char *>(&mem[0])); _mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast<char *>(&mem[8])); }
static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) { _mm256_maskstore(reinterpret_cast<short *>(mem), mask, v); }
#undef Vc_AVX_TO_SSE_1
#undef Vc_AVX_TO_SSE_1_128
#undef Vc_AVX_TO_SSE_2_NEW
#undef Vc_AVX_TO_SSE_256_128
#undef Vc_AVX_TO_SSE_1i
template <typename R> Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128 stream_load<m128>(const float *mem) { return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem)))); }
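// The stream_load / stream_store family implements non-temporal
// (cache-bypassing) transfers: the 128-bit loads go through
// _mm_stream_load_si128 (MOVNTDQA) and are cast to the requested vector type,
// the 256-bit overloads compose two 128-bit halves via insert128/extract128,
// and the stores go through the byte-masked _mm_maskmoveu_si128, which writes
// only the bytes whose mask MSB is set. Illustrative use:
//   m256 v = stream_load<m256>(src);  // src: const float *
//   stream_store(dst, v, mask);       // dst: float *, mask: full-lane __m256 mask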
template<> Vc_INTRINSIC m256 stream_load<m256>(const float *mem) { return insert128<1>(_mm256_castps128_ps256(stream_load<m128>(mem)), stream_load<m128>(mem + 4)); }
template <typename R> Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128d stream_load<m128d>(const double *mem) { return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem)))); }
template<> Vc_INTRINSIC m256d stream_load<m256d>(const double *mem) { return insert128<1>(_mm256_castpd128_pd256(stream_load<m128d>(mem)), stream_load<m128d>(mem + 2)); }
template <typename R> Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128i stream_load<m128i>(const void *mem) { return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<void *>(mem))); }
template<> Vc_INTRINSIC m256i stream_load<m256i>(const void *mem) { return insert128<1>(_mm256_castsi128_si256(stream_load<m128i>(mem)), stream_load<m128i>(static_cast<const __m128i *>(mem) + 1)); }
Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask) { _mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast<char *>(mem)); }
Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask) { stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask)); stream_store(mem + 4, extract128<1>(value), extract128<1>(mask)); }
Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask) { _mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast<char *>(mem)); }
Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask) { stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask)); stream_store(mem + 2, extract128<1>(value), extract128<1>(mask)); }
Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask) { _mm_maskmoveu_si128(value, mask, reinterpret_cast<char *>(mem)); }
Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask) { stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask)); stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask)); }
#ifndef __x86_64__
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) { return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x))); }
#endif
#ifdef Vc_IMPL_AVX2
template <int Scale> __m256 gather(const float *addr, __m256i idx) { return _mm256_i32gather_ps(addr, idx, Scale); }
template <int Scale> __m256d gather(const double *addr, __m128i idx) { return _mm256_i32gather_pd(addr, idx, Scale); }
template <int Scale> __m256i gather(const int *addr, __m256i idx) { return _mm256_i32gather_epi32(addr, idx, Scale); }
template <int Scale> __m256i gather(const unsigned *addr, __m256i idx) { return _mm256_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale); }
template <int Scale> __m256 gather(__m256 src, __m256 k, const float *addr, __m256i idx) { return _mm256_mask_i32gather_ps(src, addr, idx, k, Scale); }
template <int Scale> __m256d gather(__m256d src, __m256d k, const double *addr, __m128i idx) { return _mm256_mask_i32gather_pd(src, addr, idx, k, Scale); }
template <int Scale> __m256i gather(__m256i src, __m256i k, const int *addr, __m256i idx) { return _mm256_mask_i32gather_epi32(src, addr, idx, k, Scale); }
template <int Scale> __m256i gather(__m256i src, __m256i k, const unsigned *addr, __m256i idx) { return _mm256_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale); }
#endif
} }
namespace Vc_VERSIONED_NAMESPACE { namespace AVX { using namespace AvxIntrinsics; } namespace AVX2 { using namespace AvxIntrinsics; } namespace AVX { template <typename T> struct VectorTypeHelper; template<> struct VectorTypeHelper< char > { typedef __m256i Type; }; template<> struct
VectorTypeHelper< signed char > { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< short> { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< int > { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< long > { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< long long> { typedef __m256i Type; }; template<> struct VectorTypeHelper { typedef __m256i Type; }; template<> struct VectorTypeHelper< float> { typedef __m256 Type; }; template<> struct VectorTypeHelper< double> { typedef __m256d Type; }; template using IntegerVectorType = typename std::conditional::type; template using DoubleVectorType = typename std::conditional::type; template using FloatVectorType = typename std::conditional::type; template struct VectorHelper {}; template struct VectorHelperSize; } } #endif #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template inline V zero(); } namespace Common { namespace Detail { #ifdef Vc_IMPL_AVX template struct IntrinsicType { using type = typename std::conditional< std::is_integral::value, typename std::conditional::type, typename std::conditional< std::is_same::value, typename std::conditional::type, typename std::conditional::type>::type>::type; }; #elif defined Vc_IMPL_SSE template struct IntrinsicType { using type = typename std::conditional< std::is_integral::value, __m128i, typename std::conditional::value, __m128d, __m128>::type>::type; }; #else template struct IntrinsicType { static_assert(Size == 1, "IntrinsicType without SIMD target support may only have Size = 1"); using type = ValueType; }; #endif template struct BuiltinType; #ifdef Vc_USE_BUILTIN_VECTOR_TYPES #define Vc_VECBUILTIN __attribute__((__vector_size__(16))) template struct BuiltinType< double , Size, 16> { typedef double type Vc_VECBUILTIN; }; template struct BuiltinType< float , Size, 16> { typedef float type Vc_VECBUILTIN; }; template struct BuiltinType< long long, Size, 16> { typedef long long type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned long long type Vc_VECBUILTIN; }; template struct BuiltinType< long , Size, 16> { typedef long type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned long type Vc_VECBUILTIN; }; template struct BuiltinType< int , Size, 16> { typedef int type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned int type Vc_VECBUILTIN; }; template struct BuiltinType< short , Size, 16> { typedef short type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned short type Vc_VECBUILTIN; }; template struct BuiltinType< char , Size, 16> { typedef char type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned char type Vc_VECBUILTIN; }; template struct BuiltinType< signed char , Size, 16> { typedef signed char type Vc_VECBUILTIN; }; template struct BuiltinType< bool , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; }; #undef Vc_VECBUILTIN #define Vc_VECBUILTIN __attribute__((__vector_size__(32))) template struct BuiltinType< double , Size, 32> { typedef double type Vc_VECBUILTIN; }; template struct BuiltinType< float , Size, 32> { typedef float type Vc_VECBUILTIN; }; template struct BuiltinType< long long, Size, 32> { typedef long long type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned long long type 
Vc_VECBUILTIN; }; template struct BuiltinType< long , Size, 32> { typedef long type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned long type Vc_VECBUILTIN; }; template struct BuiltinType< int , Size, 32> { typedef int type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned int type Vc_VECBUILTIN; }; template struct BuiltinType< short , Size, 32> { typedef short type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned short type Vc_VECBUILTIN; }; template struct BuiltinType< char , Size, 32> { typedef char type Vc_VECBUILTIN; }; template struct BuiltinType { typedef unsigned char type Vc_VECBUILTIN; }; template struct BuiltinType< signed char , Size, 32> { typedef signed char type Vc_VECBUILTIN; }; template struct BuiltinType< bool , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; }; #undef Vc_VECBUILTIN #endif } template using IntrinsicType = typename Detail::IntrinsicType::type; template using BuiltinType = typename Detail::BuiltinType::type; namespace AliasStrategy { struct Union {}; struct MayAlias {}; struct VectorBuiltin {}; struct UnionMembers {}; } using DefaultStrategy = #if defined Vc_USE_BUILTIN_VECTOR_TYPES AliasStrategy::VectorBuiltin; #elif defined Vc_MSVC AliasStrategy::UnionMembers; #elif defined Vc_ICC AliasStrategy::Union; #elif defined __GNUC__ AliasStrategy::MayAlias; #else AliasStrategy::Union; #endif template class Storage; template class Storage { static_assert(std::is_fundamental::value && std::is_arithmetic::value, "Only works for fundamental arithmetic types."); public: using VectorType = IntrinsicType; using EntryType = ValueType; union Alias { Vc_INTRINSIC Alias(VectorType vv) : v(vv) {} VectorType v; EntryType m[Size]; }; Vc_INTRINSIC Storage() : data(Vc::Detail::zero()) {} Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); } template Vc_INTRINSIC explicit Storage(const U &x, enable_if = nullarg) : data(reinterpret_cast(x)) { assertCorrectAlignment(&data); } Vc_INTRINSIC Storage(const Storage &) = default; Vc_INTRINSIC Storage &operator=(const Storage &) = default; Vc_INTRINSIC operator const VectorType &() const { return data; } Vc_INTRINSIC Vc_PURE VectorType &v() { return data; } Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; } Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return Alias(data).m[i]; } Vc_INTRINSIC void set(size_t i, EntryType x) { Alias a(data); a.m[i] = x; data = a.v; } private: VectorType data; }; template class Storage { static_assert(std::is_fundamental::value && std::is_arithmetic::value, "Only works for fundamental arithmetic types."); public: using VectorType = IntrinsicType; using EntryType = ValueType; Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); } Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); } template Vc_INTRINSIC explicit Storage(const U &x, enable_if = nullarg) : data(reinterpret_cast(x)) { assertCorrectAlignment(&data); } Vc_INTRINSIC Storage &operator=(const VectorType &x) { data = x; return *this; } Vc_INTRINSIC Storage(const Storage &) = default; Vc_INTRINSIC Storage &operator=(const Storage &) = default; Vc_INTRINSIC operator const VectorType &() const { return v(); } Vc_INTRINSIC Vc_PURE VectorType &v() { return data; } Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; } Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return aliasing_cast(&data)[i]; } Vc_INTRINSIC void set(size_t i, EntryType x) { aliasing_cast(&data)[i] = x; } private: 
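// Raw SIMD register. With the MayAlias strategy the m()/set() accessors above
// touch individual lanes through aliasing_cast, i.e. via a pointer to a
// may-alias-qualified EntryType, which keeps the scalar accesses legal under
// strict aliasing. The Union strategy above instead round-trips the register
// through a VectorType/EntryType union for every lane access.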
VectorType data; }; template class Storage { static_assert(std::is_fundamental::value && std::is_arithmetic::value, "Only works for fundamental arithmetic types."); using Builtin = BuiltinType; public: using VectorType = #ifdef Vc_TEMPLATES_DROP_ATTRIBUTES MayAlias>; #else IntrinsicType; #endif using EntryType = ValueType; Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); } Vc_INTRINSIC Storage(const Storage &) = default; Vc_INTRINSIC Storage &operator=(const Storage &) = default; Vc_INTRINSIC Storage(const VectorType &x) : data(aliasing_cast(x)) { assertCorrectAlignment(&data); } template Vc_INTRINSIC explicit Storage(const U &x, enable_if = nullarg) : data(aliasing_cast(x)) { assertCorrectAlignment(&data); } Vc_INTRINSIC Storage &operator=(const VectorType &x) { data = aliasing_cast(x); return *this; } Vc_INTRINSIC operator const VectorType &() const { return v(); } Vc_INTRINSIC Vc_PURE VectorType &v() { return reinterpret_cast(data); } Vc_INTRINSIC Vc_PURE const VectorType &v() const { return reinterpret_cast(data); } Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return data[i]; } Vc_INTRINSIC void set(size_t i, EntryType x) { data[i] = x; } Vc_INTRINSIC Builtin &builtin() { return data; } Vc_INTRINSIC const Builtin &builtin() const { return data; } private: Builtin data; }; template class Storage { static_assert(std::is_fundamental::value && std::is_arithmetic::value, "Only works for fundamental arithmetic types."); public: using VectorType = IntrinsicType; using EntryType = ValueType; Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); } Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); } template Vc_INTRINSIC explicit Storage(const U &x, enable_if = nullarg) : data(reinterpret_cast(x)) { assertCorrectAlignment(&data); } Vc_INTRINSIC Storage &operator=(const VectorType &x) { data = x; return *this; } Vc_INTRINSIC Storage(const Storage &) = default; Vc_INTRINSIC Storage &operator=(const Storage &) = default; Vc_INTRINSIC Vc_PURE VectorType &v() { return data; } Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; } Vc_INTRINSIC_L Vc_PURE_L EntryType m(size_t i) const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC void set(size_t i, EntryType x) { ref(i) = x; } private: Vc_INTRINSIC_L Vc_PURE_L EntryType &ref(size_t i) Vc_INTRINSIC_R Vc_PURE_R; VectorType data; }; #ifdef Vc_MSVC template <> Vc_INTRINSIC Vc_PURE double Storage< double, 2, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128d_f64[i]; } template <> Vc_INTRINSIC Vc_PURE float Storage< float , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128_f32[i]; } template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i32[i]; } template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i16[i]; } template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i8[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned int Storage::m(size_t i) const { return data.m128i_u32[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned short Storage::m(size_t i) const { return data.m128i_u16[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned char Storage::m(size_t i) const { return data.m128i_u8[i]; } template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 2, AliasStrategy::UnionMembers>::ref(size_t i) { return 
data.m128d_f64[i]; } template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128_f32[i]; } template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i32[i]; } template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i16[i]; } template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast(data.m128i_i8[i]); } template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage::ref(size_t i) { return data.m128i_u32[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage::ref(size_t i) { return data.m128i_u16[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage::ref(size_t i) { return data.m128i_u8[i]; } #ifdef Vc_IMPL_AVX template <> Vc_INTRINSIC Vc_PURE double Storage< double, 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256d_f64[i]; } template <> Vc_INTRINSIC Vc_PURE float Storage< float , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256_f32[i]; } template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i32[i]; } template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i16[i]; } template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i8[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned int Storage::m(size_t i) const { return data.m256i_u32[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned short Storage::m(size_t i) const { return data.m256i_u16[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned char Storage::m(size_t i) const { return data.m256i_u8[i]; } template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256d_f64[i]; } template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256_f32[i]; } template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i32[i]; } template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i16[i]; } template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast(data.m256i_i8[i]); } template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage::ref(size_t i) { return data.m256i_u32[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage::ref(size_t i) { return data.m256i_u16[i]; } template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage::ref(size_t i) { return data.m256i_u8[i]; } #endif #endif template using VectorMemoryUnion = Storage; } } #endif #ifndef VC_SSE_CONST_DATA_H_ #define VC_SSE_CONST_DATA_H_ #ifndef VC_SSE_MACROS_H_ #define VC_SSE_MACROS_H_ #if defined(Vc_IMPL_SSE4_1) && !defined(Vc_DISABLE_PTEST) #define Vc_USE_PTEST #endif #endif namespace Vc_VERSIONED_NAMESPACE { namespace SSE { alignas(16) extern const unsigned int _IndexesFromZero4[4]; alignas(16) extern const unsigned short _IndexesFromZero8[8]; alignas(16) extern const unsigned char _IndexesFromZero16[16]; struct c_general { alignas(64) static const int 
absMaskFloat[4]; alignas(16) static const unsigned int signMaskFloat[4]; alignas(16) static const unsigned int highMaskFloat[4]; alignas(16) static const short minShort[8]; alignas(16) static const unsigned short one16[8]; alignas(16) static const unsigned int one32[4]; alignas(16) static const float oneFloat[4]; alignas(16) static const unsigned long long highMaskDouble[2]; alignas(16) static const double oneDouble[2]; alignas(16) static const long long absMaskDouble[2]; alignas(16) static const unsigned long long signMaskDouble[2]; alignas(16) static const unsigned long long frexpMask[2]; }; template struct c_trig { alignas(64) static const T data[]; }; #ifndef Vc_MSVC template <> alignas(64) const float c_trig::data[]; template <> alignas(64) const double c_trig::data[]; #endif template struct c_log { enum VectorSize { Size = 16 / sizeof(T) }; static Vc_ALWAYS_INLINE Vc_CONST const float *d(int i) { return reinterpret_cast(&data[i * Size]); } alignas(64) static const unsigned int data[21 * Size]; }; #ifndef Vc_MSVC template<> alignas(64) const unsigned int c_log::data[21 * 4]; #endif template<> struct c_log { enum VectorSize { Size = 16 / sizeof(double) }; static Vc_ALWAYS_INLINE Vc_CONST const double *d(int i) { return reinterpret_cast(&data[i * Size]); } alignas(64) static const unsigned long long data[21 * Size]; }; } } #endif #include #if defined(Vc_GCC) && !defined(__OPTIMIZE__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif namespace Vc_VERSIONED_NAMESPACE { namespace SseIntrinsics { using SSE::c_general; constexpr std::size_t VectorAlignment = 16; #if defined(Vc_GCC) && Vc_GCC < 0x40600 && !defined(Vc_DONT_FIX_SSE_SHIFT) static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; } static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; } #endif #ifdef Vc_GCC static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); } static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); } static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); } static Vc_INTRINSIC Vc_CONST __m128 _mm_mul_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); } static Vc_INTRINSIC Vc_CONST __m128 _mm_add_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); } static Vc_INTRINSIC Vc_CONST __m128 _mm_sub_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); } #endif static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128() { return 
_mm_load_si128(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC Vc_CONST __m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast(Common::AllBitsSet)); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast(c_general::one16)); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast(c_general::one32)); } static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); } static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); } static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); } static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast(c_general::absMaskDouble)); } static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast(c_general::absMaskFloat)); } static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast(c_general::signMaskDouble)); } static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast(c_general::signMaskFloat)); } static Vc_INTRINSIC __m128i Vc_CONST setmin_epi8 () { return _mm_set1_epi8(-0x80); } static Vc_INTRINSIC __m128i Vc_CONST setmin_epi16() { return _mm_load_si128(reinterpret_cast(c_general::minShort)); } static Vc_INTRINSIC __m128i Vc_CONST setmin_epi32() { return _mm_load_si128(reinterpret_cast(c_general::signMaskFloat)); } #if defined(Vc_IMPL_XOP) static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b) { return _mm_comgt_epu8(a, b); } static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); } static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); } static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b) { return _mm_comlt_epu32(a, b); } static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b) { return _mm_comgt_epu32(a, b); } static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu64(__m128i a, __m128i b) { return _mm_comlt_epu64(a, b); } #else static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b) { return _mm_cmpgt_epi8(_mm_xor_si128(a, setmin_epi8()), _mm_xor_si128(b, setmin_epi8())); } static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b) { return _mm_cmplt_epi16(_mm_xor_si128(a, setmin_epi16()), _mm_xor_si128(b, setmin_epi16())); } static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b) { return _mm_cmpgt_epi16(_mm_xor_si128(a, setmin_epi16()), _mm_xor_si128(b, setmin_epi16())); } static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b) { return _mm_cmplt_epi32(_mm_xor_si128(a, setmin_epi32()), _mm_xor_si128(b, setmin_epi32())); } static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b) { return _mm_cmpgt_epi32(_mm_xor_si128(a, setmin_epi32()), _mm_xor_si128(b, setmin_epi32())); } Vc_INTRINSIC __m128i Vc_CONST cmpgt_epi64(__m128i a, __m128i b) { #ifdef Vc_IMPL_SSE4_2 return _mm_cmpgt_epi64(a, b); #else const auto aa = _mm_xor_si128(a, _mm_srli_epi64(setmin_epi32(),32)); const auto bb = _mm_xor_si128(b, _mm_srli_epi64(setmin_epi32(),32)); const auto gt = _mm_cmpgt_epi32(aa, bb); const auto eq = _mm_cmpeq_epi32(aa, 
bb); const auto gt2 = _mm_shuffle_epi32(gt, 0xf5); const auto lo = _mm_shuffle_epi32(_mm_and_si128(_mm_srli_epi64(eq, 32), gt), 0xa0); return _mm_or_si128(gt2, lo); #endif } #endif } } #ifdef Vc_IMPL_SSSE3 namespace Vc_VERSIONED_NAMESPACE { namespace SseIntrinsics { Vc_INTRINSIC Vc_CONST __m128i abs_epi8(__m128i a) { return _mm_abs_epi8(a); } Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) { return _mm_abs_epi16(a); } Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) { return _mm_abs_epi32(a); } template Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b) { return _mm_alignr_epi8(a, b, s & 0x1fu); } } } #else namespace Vc_VERSIONED_NAMESPACE { namespace SseIntrinsics { Vc_INTRINSIC Vc_CONST __m128i abs_epi8 (__m128i a) { __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128()); return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_set1_epi8(1))); } Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) { __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128()); return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15)); } Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) { __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128()); return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31)); } template Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b) { switch (s & 0x1fu) { case 0: return b; case 1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1)); case 2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2)); case 3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3)); case 4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4)); case 5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5)); case 6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6)); case 7: return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7)); case 8: return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8)); case 9: return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9)); case 10: return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10)); case 11: return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11)); case 12: return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12)); case 13: return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13)); case 14: return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14)); case 15: return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15)); case 16: return a; case 17: return _mm_srli_si128(a, 1); case 18: return _mm_srli_si128(a, 2); case 19: return _mm_srli_si128(a, 3); case 20: return _mm_srli_si128(a, 4); case 21: return _mm_srli_si128(a, 5); case 22: return _mm_srli_si128(a, 6); case 23: return _mm_srli_si128(a, 7); case 24: return _mm_srli_si128(a, 8); case 25: return _mm_srli_si128(a, 9); case 26: return _mm_srli_si128(a, 10); case 27: return _mm_srli_si128(a, 11); case 28: return _mm_srli_si128(a, 12); case 29: return _mm_srli_si128(a, 13); case 30: return _mm_srli_si128(a, 14); case 31: return _mm_srli_si128(a, 15); } return _mm_setzero_si128(); } } } #endif #ifdef Vc_IMPL_SSE4_1 namespace Vc_VERSIONED_NAMESPACE { namespace SseIntrinsics { Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) { return _mm_cmpeq_epi64(a, b); } template Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v) { return _mm_extract_epi32(v, index); } Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) { return _mm_blendv_pd(a, b, c); } 
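// blendv_* selects, per element, from b where the sign bit of the
// corresponding element of c is set and from a otherwise. With SSE4.1 these
// wrappers map 1:1 onto the native blend intrinsics; the #else branch further
// down emulates the same selection with andnot/and/or bit operations.
// Illustrative use (the pre-SSE4.1 min_epi8 below uses the same pattern):
//   __m128i lo = blendv_epi8(a, b, _mm_cmpgt_epi8(a, b)); // per-lane signed min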
Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c) { return _mm_blendv_ps(a, b, c); } Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) { return _mm_blendv_epi8(a, b, c); } template Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b) { return _mm_blend_pd(a, b, mask); } template Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b) { return _mm_blend_ps(a, b, mask); } template Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b) { return _mm_blend_epi16(a, b, mask); } Vc_INTRINSIC Vc_CONST __m128i max_epi8(__m128i a, __m128i b) { return _mm_max_epi8(a, b); } Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) { return _mm_max_epi32(a, b); } Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) { return _mm_max_epu16(a, b); } Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) { return _mm_max_epu32(a, b); } Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) { return _mm_min_epu16(a, b); } Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) { return _mm_min_epu32(a, b); } Vc_INTRINSIC Vc_CONST __m128i min_epi8(__m128i a, __m128i b) { return _mm_min_epi8(a, b); } Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) { return _mm_min_epi32(a, b); } Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8) { return _mm_cvtepu8_epi16(epu8); } Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) { return _mm_cvtepi8_epi16(epi8); } Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) { return _mm_cvtepu16_epi32(epu16); } Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16) { return _mm_cvtepi16_epi32(epu16); } Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) { return _mm_cvtepu8_epi32(epu8); } Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) { return _mm_cvtepi8_epi32(epi8); } } } #else namespace Vc_VERSIONED_NAMESPACE { namespace SseIntrinsics { Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) { auto tmp = _mm_cmpeq_epi32(a, b); return _mm_and_si128(tmp, _mm_shuffle_epi32(tmp, 1*1 + 0*4 + 3*16 + 2*64)); } template Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v) { #ifdef Vc_USE_BUILTIN_VECTOR_TYPES typedef int int32v4 __attribute__((__vector_size__(16))); return aliasing_cast(v)[index]; #else return _mm_cvtsi128_si32(_mm_srli_si128(v, index * 4)); #endif } Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) { #ifdef Vc_GCC return reinterpret_cast<__m128d>( (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) | (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b))); #else return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b)); #endif } Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c) { #ifdef Vc_GCC return reinterpret_cast<__m128>( (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) | (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b))); #else return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b)); #endif } Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) { #ifdef Vc_GCC return (~c & a) | (c & b); #else return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b)); #endif } template Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b) { switch (mask) { case 0x0: return a; case 0x1: return _mm_shuffle_pd(b, a, 2); case 0x2: return _mm_shuffle_pd(a, b, 2); case 0x3: return b; default: abort(); return a; } } template Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b) { __m128i c; switch (mask) { case 
0x0: return a; case 0x1: c = _mm_srli_si128(_mm_setallone_si128(), 12); break; case 0x2: c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4); break; case 0x3: c = _mm_srli_si128(_mm_setallone_si128(), 8); break; case 0x4: c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8); break; case 0x5: c = _mm_set_epi32(0, -1, 0, -1); break; case 0x6: c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4); break; case 0x7: c = _mm_srli_si128(_mm_setallone_si128(), 4); break; case 0x8: c = _mm_slli_si128(_mm_setallone_si128(), 12); break; case 0x9: c = _mm_set_epi32(-1, 0, 0, -1); break; case 0xa: c = _mm_set_epi32(-1, 0, -1, 0); break; case 0xb: c = _mm_set_epi32(-1, 0, -1, -1); break; case 0xc: c = _mm_slli_si128(_mm_setallone_si128(), 8); break; case 0xd: c = _mm_set_epi32(-1, -1, 0, -1); break; case 0xe: c = _mm_slli_si128(_mm_setallone_si128(), 4); break; case 0xf: return b; default: abort(); c = _mm_setzero_si128(); break; } __m128 _c = _mm_castsi128_ps(c); return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b)); } template Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b) { __m128i c; switch (mask) { case 0x00: return a; case 0x01: c = _mm_srli_si128(_mm_setallone_si128(), 14); break; case 0x03: c = _mm_srli_si128(_mm_setallone_si128(), 12); break; case 0x07: c = _mm_srli_si128(_mm_setallone_si128(), 10); break; case 0x0f: return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a); case 0x1f: c = _mm_srli_si128(_mm_setallone_si128(), 6); break; case 0x3f: c = _mm_srli_si128(_mm_setallone_si128(), 4); break; case 0x7f: c = _mm_srli_si128(_mm_setallone_si128(), 2); break; case 0x80: c = _mm_slli_si128(_mm_setallone_si128(), 14); break; case 0xc0: c = _mm_slli_si128(_mm_setallone_si128(), 12); break; case 0xe0: c = _mm_slli_si128(_mm_setallone_si128(), 10); break; case 0xf0: c = _mm_slli_si128(_mm_setallone_si128(), 8); break; case 0xf8: c = _mm_slli_si128(_mm_setallone_si128(), 6); break; case 0xfc: c = _mm_slli_si128(_mm_setallone_si128(), 4); break; case 0xfe: c = _mm_slli_si128(_mm_setallone_si128(), 2); break; case 0xff: return b; case 0xcc: return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1))); case 0x33: return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1))); default: const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff); c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15); break; } return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b)); } Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) { return blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b)); } Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) { return blendv_epi8(b, a, _mm_cmpgt_epi32(a, b)); } Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) { return blendv_epi8(b, a, cmpgt_epu16(a, b)); } Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) { return blendv_epi8(b, a, cmpgt_epu32(a, b)); } Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) { return blendv_epi8(a, b, cmpgt_epu16(a, b)); } Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) { return blendv_epi8(a, b, cmpgt_epu32(a, b)); } Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) { return blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b)); } Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) { return blendv_epi8(a, b, _mm_cmpgt_epi32(a, b)); } Vc_INTRINSIC Vc_CONST 
__m128i cvtepu8_epi16(__m128i epu8) { return _mm_unpacklo_epi8(epu8, _mm_setzero_si128()); } Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) { return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128())); } Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) { return _mm_unpacklo_epi16(epu16, _mm_setzero_si128()); } Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16) { return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128())); } Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) { return cvtepu16_epi32(cvtepu8_epi16(epu8)); } Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) { const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128()); const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg); return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg)); } } } #endif namespace Vc_VERSIONED_NAMESPACE { namespace SseIntrinsics { static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) { #ifdef Vc_IMPL_SSE4_1 return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); #else return _mm_load_ps(mem); #endif } static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) { #ifdef Vc_IMPL_SSE4_1 return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem)))); #else return _mm_load_pd(mem); #endif } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) { #ifdef Vc_IMPL_SSE4_1 return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast(mem))); #else return _mm_load_si128(reinterpret_cast(mem)); #endif } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) { return _mm_stream_load(reinterpret_cast(mem)); } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) { return _mm_stream_load(reinterpret_cast(mem)); } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) { return _mm_stream_load(reinterpret_cast(mem)); } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) { return _mm_stream_load(reinterpret_cast(mem)); } static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) { return _mm_stream_load(reinterpret_cast(mem)); } #ifndef __x86_64__ Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) { return _mm_castpd_si128(_mm_load_sd(reinterpret_cast(&x))); } #endif #ifdef Vc_IMPL_AVX2 template __m128 gather(const float *addr, __m128i idx) { return _mm_i32gather_ps(addr, idx, Scale); } template __m128d gather(const double *addr, __m128i idx) { return _mm_i32gather_pd(addr, idx, Scale); } template __m128i gather(const int *addr, __m128i idx) { return _mm_i32gather_epi32(addr, idx, Scale); } template __m128i gather(const unsigned *addr, __m128i idx) { return _mm_i32gather_epi32(aliasing_cast(addr), idx, Scale); } template __m128 gather(__m128 src, __m128 k, const float *addr, __m128i idx) { return _mm_mask_i32gather_ps(src, addr, idx, k, Scale); } template __m128d gather(__m128d src, __m128d k, const double *addr, __m128i idx) { return _mm_mask_i32gather_pd(src, addr, idx, k, Scale); } template __m128i gather(__m128i src, __m128i k, const int *addr, __m128i idx) { return _mm_mask_i32gather_epi32(src, addr, idx, k, Scale); } template __m128i gather(__m128i src, __m128i k, const unsigned *addr, __m128i idx) { return _mm_mask_i32gather_epi32(src, aliasing_cast(addr), idx, k, Scale); } #endif } } namespace Vc_VERSIONED_NAMESPACE { namespace SSE { using namespace SseIntrinsics; template struct ParameterHelper { typedef T 
ByValue; typedef T &Reference; typedef const T &ConstRef; }; template struct VectorHelper { }; template struct VectorTypeHelper { typedef __m128i Type; }; template <> struct VectorTypeHelper { typedef __m128d Type; }; template <> struct VectorTypeHelper { typedef __m128 Type; }; template struct DetermineGatherMask { typedef T Type; }; template struct VectorTraits { typedef typename VectorTypeHelper::Type VectorType; using EntryType = T; static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType); typedef Mask MaskType; typedef typename DetermineGatherMask::Type GatherMaskType; typedef Common::VectorMemoryUnion StorageType; }; template struct VectorHelperSize; } } #if defined(Vc_GCC) && !defined(__OPTIMIZE__) #pragma GCC diagnostic pop #endif #ifndef VC_SSE_SHUFFLE_H_ #define VC_SSE_SHUFFLE_H_ namespace Vc_VERSIONED_NAMESPACE { enum VecPos { X0, X1, X2, X3, X4, X5, X6, X7, Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7, Const0 }; namespace Mem { template static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } template static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { static_assert(Dst0 >= X0 && Dst1 >= Y0, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= Y1, "Incorrect_Range"); return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2); } template Vc_INTRINSIC Vc_CONST __m128i shuffle(__m128i x, __m128i y) { return _mm_castps_si128(shuffle(_mm_castsi128_ps(x), _mm_castsi128_ps(y))); } template static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range"); static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range"); return Vc::SseIntrinsics::blend_pd<(Dst0 / Y0) + (Dst1 / Y0) * 2>(x, y); } template static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) { static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range"); static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range"); static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range"); static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range"); return Vc::SseIntrinsics::blend_ps<(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8>(x, y); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) { static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range"); static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range"); static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range"); static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range"); static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range"); static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range"); static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range"); static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range"); return Vc::SseIntrinsics::blend_epi16< (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 + (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 + (Dst7 / Y7) * 128>(x, y); } template static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE Vc_CONST __m128d permute(__m128d x) { 
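// The Mem::shuffle/permute/blend wrappers encode the VecPos selectors into the
// SSE immediates at compile time: shuffles and permutes pack two bits per
// destination lane (Dst0 + Dst1 * 4 + ...), while the blends derive one bit
// per lane from whether the selector names the first (X*) or second (Y*) source.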
static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range"); return _mm_shuffle_pd(x, x, Dst0 + Dst1 * 4); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) { static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range"); static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range"); return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); static_assert(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, "Incorrect_Range"); static_assert(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, "Incorrect_Range"); if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) { x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) { x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); } return x; } } namespace Reg { template static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { return Mem::shuffle(x, y); } template static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { return Mem::shuffle(x, y); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64)); } template static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { return Mem::blend(x, y); } template static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) { return Mem::blend(x, y); } } } #endif #endif #ifndef VC_SSE_VECTORHELPER_H_ #define VC_SSE_VECTORHELPER_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace SSE { #define Vc_OP0(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; } #define Vc_OP1(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; } #define Vc_OP2(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; } #define Vc_OP3(name,code) 
static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; } template<> struct VectorHelper<__m128> { typedef __m128 VectorType; template static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_ps(x); } template static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_ps(x); } template static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); } template static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_ps(mem, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_ps(mem, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_ps(mem, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast(mem)); } template static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast(mem)); } Vc_OP0(allone, _mm_setallone_ps()) Vc_OP0(zero, _mm_setzero_ps()) Vc_OP3(blend, blendv_ps(a, b, c)) }; template<> struct VectorHelper<__m128d> { typedef __m128d VectorType; template static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_pd(x); } template static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_pd(x); } template static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); } template static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_pd(mem, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_pd(mem, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_pd(mem, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast(mem)); } template static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast(mem)); } Vc_OP0(allone, _mm_setallone_pd()) Vc_OP0(zero, _mm_setzero_pd()) Vc_OP3(blend, blendv_pd(a, b, c)) }; template<> struct VectorHelper<__m128i> { typedef __m128i VectorType; template static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_si128(reinterpret_cast(x)); } template static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_si128(reinterpret_cast(x)); } template static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); } 
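// Load/store selection in these VectorHelper specializations happens entirely
// at compile time: a Flags type provides exactly one of the EnableIf* member
// typedefs, so overload resolution picks the aligned, unaligned, streaming or
// masked (maskmoveu) variant without any runtime branch.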
template static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_si128(reinterpret_cast(mem), x); } template static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_si128(reinterpret_cast(mem), x); } template static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_si128(reinterpret_cast(mem), x); } template static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast(mem)); } template static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(x, m, reinterpret_cast(mem)); } Vc_OP0(allone, _mm_setallone_si128()) Vc_OP0(zero, _mm_setzero_si128()) Vc_OP3(blend, blendv_epi8(a, b, c)) }; #undef Vc_OP1 #undef Vc_OP2 #undef Vc_OP3 #define Vc_OP1(op) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); } #define Vc_OP(op) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_ , Vc_SUFFIX)(a, b); } #define Vc_OP_(op) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op , Vc_SUFFIX)(a, b); } #define Vc_OPx(op,op2) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); } #define Vc_OP_CAST_(op) \ static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \ _mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \ Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \ } #define Vc_MINMAX \ static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \ static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); } template<> struct VectorHelper { typedef __m128d VectorType; typedef double EntryType; #define Vc_SUFFIX pd Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); } static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); } #ifdef Vc_IMPL_FMA4 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = _mm_macc_pd(v1, v2, v3); } #else static inline void fma(VectorType &v1, VectorType v2, VectorType v3) { VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast(&c_general::highMaskDouble))); VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast(&c_general::highMaskDouble))); #if defined(Vc_GCC) && Vc_GCC < 0x40703 asm("":"+x"(h1), "+x"(h2)); #endif const VectorType l1 = _mm_sub_pd(v1, h1); const VectorType l2 = _mm_sub_pd(v2, h2); const VectorType ll = mul(l1, l2); const VectorType lh = add(mul(l1, h2), mul(h1, l2)); const VectorType hh = mul(h1, h2); const VectorType lh_lt_v3 = 
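// Emulated double-precision FMA path (no FMA4): v1 and v2 were split into high
// and low halves via highMaskDouble so that the partial products ll, lh and hh
// stay (almost) exact; below, the smaller in magnitude of lh and v3 is summed
// with ll first, presumably to keep the rounding error of the final sum low.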
_mm_cmplt_pd(abs(lh), abs(v3)); const VectorType b = blendv_pd(v3, lh, lh_lt_v3); const VectorType c = blendv_pd(lh, v3, lh_lt_v3); v1 = add(add(ll, b), add(c, hh)); } #endif Vc_OP(add) Vc_OP(sub) Vc_OP(mul) Vc_OP1(sqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) { return _mm_div_pd(one(), sqrt(x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) { return _mm_div_pd(one(), x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) { return _mm_cmpunord_pd(x, x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) { return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) { return _mm_castsi128_pd(cmpeq_epi64(_mm_castpd_si128(abs(x)), _mm_castpd_si128(_mm_load_pd(c_log::d(1))))); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_pd()); } Vc_MINMAX static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = _mm_min_sd(a, _mm_unpackhi_pd(a, a)); return _mm_cvtsd_f64(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = _mm_max_sd(a, _mm_unpackhi_pd(a, a)); return _mm_cvtsd_f64(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(a); } #undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { #ifdef Vc_IMPL_SSE4_1 return _mm_round_pd(a, _MM_FROUND_NINT); #else return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a)); #endif } }; template<> struct VectorHelper { typedef float EntryType; typedef __m128 VectorType; #define Vc_SUFFIX ps Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); } static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST __m128 concat(__m128d a, __m128d b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); } #ifdef Vc_IMPL_FMA4 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = _mm_macc_ps(v1, v2, v3); } #else static inline void fma(VectorType &v1, VectorType v2, VectorType v3) { __m128d v1_0 = _mm_cvtps_pd(v1); __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1)); __m128d v2_0 = _mm_cvtps_pd(v2); __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2)); __m128d v3_0 = _mm_cvtps_pd(v3); __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3)); v1 = _mm_movelh_ps( _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)), _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1))); } #endif Vc_OP(add) Vc_OP(sub) Vc_OP(mul) Vc_OP1(sqrt) Vc_OP1(rsqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) { return _mm_cmpunord_ps(x, x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) { return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType 
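// Classification helpers: isNaN exploits NaN != NaN via an unordered self
// compare, and isFinite multiplies by zero first (finite * 0 == 0, while
// inf * 0 and NaN * 0 are NaN), so a single ordered compare separates finite
// from non-finite values.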
isInfinite(VectorType x) { return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log::d(1))))); } static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) { return _mm_rcp_ps(x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps()); } Vc_MINMAX static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = _mm_min_ps(a, _mm_movehl_ps(a, a)); a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtss_f32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = _mm_max_ps(a, _mm_movehl_ps(a, a)); a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtss_f32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3))); a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3))); a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(a); } #undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { #ifdef Vc_IMPL_SSE4_1 return _mm_round_ps(a, _MM_FROUND_NINT); #else return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); #endif } }; template<> struct VectorHelper { typedef int EntryType; typedef __m128i VectorType; #define Vc_SUFFIX si128 Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); } #undef Vc_SUFFIX #define Vc_SUFFIX epi32 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); } static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } #ifdef 
Vc_IMPL_SSE4_1 static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } #else static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) { const VectorType aShift = _mm_srli_si128(a, 4); const VectorType ab02 = _mm_mul_epu32(a, b); const VectorType bShift = _mm_srli_si128(b, 4); const VectorType ab13 = _mm_mul_epu32(aShift, bShift); return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8)); } #endif Vc_OP(add) Vc_OP(sub) #undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } }; template<> struct VectorHelper { typedef unsigned int EntryType; typedef __m128i VectorType; #define Vc_SUFFIX si128 Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); } #undef Vc_SUFFIX #define Vc_SUFFIX epu32 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) { return VectorHelper::mul(a, b); } #undef Vc_SUFFIX #define Vc_SUFFIX epi32 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); } Vc_OP(add) Vc_OP(sub) #undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } }; template<> struct VectorHelper { typedef __m128i VectorType; typedef signed short EntryType; #define Vc_SUFFIX si128 Vc_OP_(or_) 
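// In the signed short helper, concat packs two vectors of 32-bit values into
// one vector of shorts with signed saturation, and expand0/expand1 sign-extend
// the low/high half back to 32 bit by duplicating each lane (unpack with
// itself) and arithmetically shifting the copy right by 16.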
Vc_OP_(and_) Vc_OP_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); } static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); } static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); } static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); } #undef Vc_SUFFIX #define Vc_SUFFIX epi16 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d, const EntryType e, const EntryType f, const EntryType g, const EntryType h) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); } static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); } Vc_OPx(mul, mullo) Vc_OP(min) Vc_OP(max) static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); } Vc_OP(add) Vc_OP(sub) #undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } }; template<> struct VectorHelper { typedef __m128i VectorType; typedef unsigned short EntryType; #define Vc_SUFFIX si128 Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_) static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); } #ifdef Vc_IMPL_SSE4_1 static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); } #else static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { auto tmp0 = _mm_unpacklo_epi16(a, b); auto tmp1 = 
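// SSE2 fallback for the unsigned 16-bit concat: this unpack ladder gathers the
// low 16 bits of every 32-bit element of a and b, i.e. it truncates rather
// than saturating the way _mm_packus_epi32 would on SSE4.1.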
_mm_unpackhi_epi16(a, b); auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); } #endif static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); } static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); } #undef Vc_SUFFIX #define Vc_SUFFIX epu16 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); } #if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || Vc_IMPL_SSE4_1 static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); } static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); } #endif #undef Vc_SUFFIX #define Vc_SUFFIX epi16 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) { return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) { return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift); } static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); } Vc_OPx(mul, mullo) #if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1) Vc_OP(min) Vc_OP(max) #endif static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) { a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) { a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2))); a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtsi128_si32(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d, const EntryType e, const EntryType f, const EntryType g, const EntryType h) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); } Vc_OP(add) Vc_OP(sub) #undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; } }; #undef Vc_OP1 #undef Vc_OP #undef Vc_OP_ #undef Vc_OPx #undef Vc_OP_CAST_ #undef Vc_MINMAX } } #endif #ifndef VC_SSE_MASK_H_ #define VC_SSE_MASK_H_ #ifndef VC_SSE_DETAIL_H_ #define VC_SSE_DETAIL_H_ #ifndef VC_SSE_CASTS_H_ #define VC_SSE_CASTS_H_ namespace Vc_VERSIONED_NAMESPACE { namespace SSE { using uint = unsigned int; using ushort = unsigned short; using uchar = unsigned char; using schar = signed char; template Vc_ALWAYS_INLINE Vc_CONST To sse_cast(From v) { return v; } template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128 >(__m128 v) { return _mm_castps_si128(v); } template<> 
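// sse_cast<To, From> is a pure bit-pattern reinterpretation between the three
// 128-bit register types: every specialization maps to the matching _mm_cast*
// intrinsic and never converts values.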
Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128d>(__m128d v) { return _mm_castpd_si128(v); } template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128d>(__m128d v) { return _mm_castpd_ps(v); } template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128i>(__m128i v) { return _mm_castsi128_ps(v); } template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128i>(__m128i v) { return _mm_castsi128_pd(v); } template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128 >(__m128 v) { return _mm_castps_pd(v); } template struct ConvertTag { }; template Vc_INTRINSIC typename VectorTraits::VectorType convert( typename VectorTraits::VectorType v) { return convert(v, ConvertTag()); } Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag) { return _mm_cvttps_epi32(v); } Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag) { return _mm_cvttpd_epi32(v); } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { #ifdef Vc_IMPL_SSE4_1 return _mm_cvtepi16_epi32(v); #else return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16); #endif } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { #ifdef Vc_IMPL_SSE4_1 return _mm_cvtepu16_epi32(v); #else return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16); #endif } Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag) { return _mm_castps_si128( blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(v)), _mm_castsi128_ps(_mm_xor_si128( _mm_cvttps_epi32(_mm_sub_ps(v, _mm_set1_ps(1u << 31))), _mm_set1_epi32(1 << 31))), _mm_cmpge_ps(v, _mm_set1_ps(1u << 31)))); } Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag) { #ifdef Vc_IMPL_SSE4_1 return _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(_mm_floor_pd(v), _mm_set1_pd(0x80000000u))), _mm_cvtsi64_si128(0x8000000080000000ull)); #else return blendv_epi8(_mm_cvttpd_epi32(v), _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(v, _mm_set1_pd(0x80000000u))), _mm_cvtsi64_si128(0x8000000080000000ull)), _mm_castpd_si128(_mm_cmpge_pd(v, _mm_set1_pd(0x80000000u)))); #endif } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return convert(v, ConvertTag()); } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return convert(v, ConvertTag()); } Vc_INTRINSIC __m128 convert(__m128 v, ConvertTag) { return v; } Vc_INTRINSIC __m128 convert(__m128d v, ConvertTag) { return _mm_cvtpd_ps(v); } Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag) { return _mm_cvtepi32_ps(v); } Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag) { using namespace SSE; return blendv_ps(_mm_cvtepi32_ps(v), _mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(v, _mm_set1_epi32(0x7ffffe00))), _mm_add_ps(_mm_set1_ps(1u << 31), _mm_cvtepi32_ps(_mm_and_si128( v, _mm_set1_epi32(0x000001ff))))), _mm_castsi128_ps(_mm_cmplt_epi32(v, _mm_setzero_si128()))); } Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128d convert(__m128 v, ConvertTag) { return _mm_cvtps_pd(v); } Vc_INTRINSIC __m128d convert(__m128d v, ConvertTag) { return v; } Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag) { return _mm_cvtepi32_pd(v); } Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag) { return _mm_add_pd(_mm_cvtepi32_pd(_mm_xor_si128(v, 
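// Unsigned integer conversions have no direct SSE2 instruction, so the input
// is biased into signed range first: here the sign bit is flipped (xor with
// the setmin_epi32() constant), the result converted as signed int, and 2^31
// added back afterwards.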
setmin_epi32())), _mm_set1_pd(1u << 31)); } Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return _mm_packs_epi32(v, _mm_setzero_si128()); } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return _mm_packs_epi32(v, _mm_setzero_si128()); } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128()); auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128()); auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128()); auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128()); auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); } Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag) { return convert(_mm_cvttps_epi32(v), ConvertTag()); } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } } } #endif #ifdef Vc_IMPL_AVX #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template struct LoadTag { }; class when_aligned { public: template constexpr when_aligned(F, typename F::EnableIfAligned = nullptr) { } }; class when_unaligned { public: template constexpr when_unaligned(F, typename F::EnableIfUnaligned = nullptr) { } }; class when_streaming { public: template constexpr when_streaming(F, typename F::EnableIfStreaming = nullptr) { } }; Vc_INTRINSIC __m128 load16(const float *mem, when_aligned) { return _mm_load_ps(mem); } Vc_INTRINSIC __m128 load16(const float *mem, when_unaligned) { return _mm_loadu_ps(mem); } Vc_INTRINSIC __m128 load16(const float *mem, when_streaming) { return SseIntrinsics::_mm_stream_load(mem); } Vc_INTRINSIC __m128d load16(const double *mem, when_aligned) { return _mm_load_pd(mem); } Vc_INTRINSIC __m128d load16(const double *mem, when_unaligned) { return _mm_loadu_pd(mem); } Vc_INTRINSIC __m128d load16(const double *mem, when_streaming) { return SseIntrinsics::_mm_stream_load(mem); } template Vc_INTRINSIC __m128i load16(const T *mem, when_aligned) { static_assert(std::is_integral::value, "load16 is only intended for integral T"); return _mm_load_si128(reinterpret_cast(mem)); } template Vc_INTRINSIC __m128i load16(const T *mem, when_unaligned) { static_assert(std::is_integral::value, "load16 is only intended for integral T"); return _mm_loadu_si128(reinterpret_cast(mem)); } template Vc_INTRINSIC __m128i load16(const T *mem, when_streaming) { static_assert(std::is_integral::value, "load16 is only intended for integral T"); return SseIntrinsics::_mm_stream_load(mem); } #ifdef Vc_MSVC template Vc_INTRINSIC __m128d load(const double *mem, F f, enable_if<(std::is_same::value && std::is_same::value)> = 
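// Besides the plain 16-byte loads, the load() overloads further down also
// widen narrower memory types on the fly (e.g. uchar or ushort data loaded
// into an int or float vector) using the cvtepu8_epi16/cvtepu16_epi32 helpers
// defined earlier in this header.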
nullarg) { return load16(mem, f); } template Vc_INTRINSIC __m128 load(const float *mem, F f, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return load16(mem, f); } template Vc_INTRINSIC __m128i load(const uint *mem, F f, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return load16(mem, f); } template Vc_INTRINSIC __m128i load(const int *mem, F f, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return load16(mem, f); } template Vc_INTRINSIC __m128i load(const short *mem, F f, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return load16(mem, f); } template Vc_INTRINSIC __m128i load(const ushort *mem, F f, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return load16(mem, f); } #endif template ::value && #endif (!std::is_integral::value || !std::is_integral::value || sizeof(DstT) >= sizeof(SrcT))>> Vc_INTRINSIC V load(const SrcT *mem, Flags flags) { return load(mem, flags, LoadTag()); } template Vc_INTRINSIC V load(const T *mem, Flags, LoadTag, enable_if = nullarg) { return SSE::VectorHelper::template load(mem); } template Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, short>) { return SSE::VectorHelper<__m128i>::load(mem); } template Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, short>) { return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, short>) { return SSE::cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, ushort>) { return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m128i load(const uint *mem, Flags, LoadTag<__m128i, int>) { return SSE::VectorHelper<__m128i>::load(mem); } template Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, int>) { return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m128i load(const short *mem, Flags, LoadTag<__m128i, int>) { return SSE::cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, int>) { return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast(mem))); } template Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, int>) { return SSE::cvtepi8_epi32(_mm_cvtsi32_si128(*aliasing_cast(mem))); } template Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, uint>) { return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, uint>) { return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast(mem))); } template Vc_INTRINSIC __m128d load(const float *mem, Flags, LoadTag<__m128d, double>) { return SSE::convert( _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast(mem))); } template Vc_INTRINSIC __m128d load(const uint *mem, Flags, LoadTag<__m128d, double>) { return SSE::convert( _mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m128d load(const int *mem, Flags, LoadTag<__m128d, double>) { return SSE::convert( _mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m128d load(const ushort *mem, Flags, LoadTag<__m128d, double>) { return SSE::convert( _mm_cvtsi32_si128(*aliasing_cast(mem))); } template Vc_INTRINSIC __m128d load(const short *mem, Flags, LoadTag<__m128d, double>) { return SSE::convert( 
_mm_cvtsi32_si128(*aliasing_cast(mem))); } template Vc_INTRINSIC __m128d load(const uchar *mem, Flags, LoadTag<__m128d, double>) { return SSE::convert( _mm_set1_epi16(*aliasing_cast(mem))); } template Vc_INTRINSIC __m128d load(const schar *mem, Flags, LoadTag<__m128d, double>) { return SSE::convert( _mm_set1_epi16(*aliasing_cast(mem))); } template Vc_INTRINSIC __m128 load(const double *mem, Flags, LoadTag<__m128, float>) { #ifdef Vc_IMPL_AVX if (Flags::IsUnaligned) { return _mm256_cvtpd_ps(_mm256_loadu_pd(mem)); } else if (Flags::IsStreaming) { return _mm256_cvtpd_ps(AvxIntrinsics::stream_load<__m256d>(mem)); } else { return _mm256_cvtpd_ps(_mm256_load_pd(mem)); } #else return _mm_movelh_ps(_mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load(&mem[0])), _mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load(&mem[2]))); #endif } template Vc_INTRINSIC __m128 load(const uint *mem, Flags f, LoadTag<__m128, float>) { return SSE::convert(load<__m128i, uint>(mem, f)); } template ::value>> Vc_INTRINSIC __m128 load(const T *mem, Flags f, LoadTag<__m128, float>) { return _mm_cvtepi32_ps(load<__m128i, int>(mem, f)); } template Vc_INTRINSIC Vc_CONST enable_if shifted(T k) { return k; } template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount > 0), T> shifted(T k) { return _mm_srli_si128(k, amount); } template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount < 0), T> shifted(T k) { return _mm_slli_si128(k, -amount); } template Vc_INTRINSIC Vc_CONST const T *IndexesFromZero() { if (Size == 4) { return reinterpret_cast(SSE::_IndexesFromZero4); } else if (Size == 8) { return reinterpret_cast(SSE::_IndexesFromZero8); } else if (Size == 16) { return reinterpret_cast(SSE::_IndexesFromZero16); } return 0; } Vc_INTRINSIC Vc_CONST unsigned int popcnt4(unsigned int n) { #ifdef Vc_IMPL_POPCNT return _mm_popcnt_u32(n); #else n = (n & 0x5U) + ((n >> 1) & 0x5U); n = (n & 0x3U) + ((n >> 2) & 0x3U); return n; #endif } Vc_INTRINSIC Vc_CONST unsigned int popcnt8(unsigned int n) { #ifdef Vc_IMPL_POPCNT return _mm_popcnt_u32(n); #else n = (n & 0x55U) + ((n >> 1) & 0x55U); n = (n & 0x33U) + ((n >> 2) & 0x33U); n = (n & 0x0fU) + ((n >> 4) & 0x0fU); return n; #endif } Vc_INTRINSIC Vc_CONST unsigned int popcnt16(unsigned int n) { #ifdef Vc_IMPL_POPCNT return _mm_popcnt_u32(n); #else n = (n & 0x5555U) + ((n >> 1) & 0x5555U); n = (n & 0x3333U) + ((n >> 2) & 0x3333U); n = (n & 0x0f0fU) + ((n >> 4) & 0x0f0fU); n = (n & 0x00ffU) + ((n >> 8) & 0x00ffU); return n; #endif } Vc_INTRINSIC Vc_CONST unsigned int popcnt32(unsigned int n) { #ifdef Vc_IMPL_POPCNT return _mm_popcnt_u32(n); #else n = (n & 0x55555555U) + ((n >> 1) & 0x55555555U); n = (n & 0x33333333U) + ((n >> 2) & 0x33333333U); n = (n & 0x0f0f0f0fU) + ((n >> 4) & 0x0f0f0f0fU); n = (n & 0x00ff00ffU) + ((n >> 8) & 0x00ff00ffU); n = (n & 0x0000ffffU) + ((n >>16) & 0x0000ffffU); return n; #endif } template Vc_INTRINSIC Vc_CONST R mask_cast(__m128i k) { static_assert(From == To, "Incorrect mask cast."); static_assert(std::is_same::value, "Incorrect mask cast."); return SSE::sse_cast<__m128>(k); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 4, __m128>(__m128i k) { return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128())); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 8, __m128>(__m128i k) { return SSE::sse_cast<__m128>( _mm_packs_epi16(_mm_packs_epi16(k, _mm_setzero_si128()), _mm_setzero_si128())); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 2, __m128>(__m128i k) { return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(k, k)); } template<> 
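// mask_cast<From, To> reshapes a mask with From entries into one with To
// entries: the unpack specializations widen lanes by duplicating them, the
// packs_* specializations narrow them; both work because mask lanes only ever
// hold all-zeros or all-ones.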
Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m128i k) { return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128())); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 2, __m128>(__m128i k) { const auto tmp = _mm_unpacklo_epi16(k, k); return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp)); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m128i k) { return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(k, k)); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 8, __m128>(__m128i k) { return SSE::sse_cast<__m128>(_mm_unpacklo_epi8(k, k)); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 4, __m128>(__m128i k) { const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 8, __m128>(k)); return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(tmp, tmp)); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 2, __m128>(__m128i k) { const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 4, __m128>(k)); return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp)); } template Vc_INTRINSIC_L Vc_CONST_L V allone() Vc_INTRINSIC_R Vc_CONST_R; template<> Vc_INTRINSIC Vc_CONST __m128 allone<__m128 >() { return SSE::_mm_setallone_ps(); } template<> Vc_INTRINSIC Vc_CONST __m128i allone<__m128i>() { return SSE::_mm_setallone_si128(); } template<> Vc_INTRINSIC Vc_CONST __m128d allone<__m128d>() { return SSE::_mm_setallone_pd(); } template inline V zero(); template<> Vc_INTRINSIC Vc_CONST __m128 zero<__m128 >() { return _mm_setzero_ps(); } template<> Vc_INTRINSIC Vc_CONST __m128i zero<__m128i>() { return _mm_setzero_si128(); } template<> Vc_INTRINSIC Vc_CONST __m128d zero<__m128d>() { return _mm_setzero_pd(); } Vc_ALWAYS_INLINE Vc_CONST __m128 negate(__m128 v, std::integral_constant) { return _mm_xor_ps(v, SSE::_mm_setsignmask_ps()); } Vc_ALWAYS_INLINE Vc_CONST __m128d negate(__m128d v, std::integral_constant) { return _mm_xor_pd(v, SSE::_mm_setsignmask_pd()); } Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant) { #ifdef Vc_IMPL_SSSE3 return _mm_sign_epi32(v, allone<__m128i>()); #else return _mm_sub_epi32(_mm_setzero_si128(), v); #endif } Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant) { #ifdef Vc_IMPL_SSSE3 return _mm_sign_epi16(v, allone<__m128i>()); #else return _mm_sub_epi16(_mm_setzero_si128(), v); #endif } Vc_INTRINSIC __m128 xor_(__m128 a, __m128 b) { return _mm_xor_ps(a, b); } Vc_INTRINSIC __m128d xor_(__m128d a, __m128d b) { return _mm_xor_pd(a, b); } Vc_INTRINSIC __m128i xor_(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } Vc_INTRINSIC __m128 or_(__m128 a, __m128 b) { return _mm_or_ps(a, b); } Vc_INTRINSIC __m128d or_(__m128d a, __m128d b) { return _mm_or_pd(a, b); } Vc_INTRINSIC __m128i or_(__m128i a, __m128i b) { return _mm_or_si128(a, b); } Vc_INTRINSIC __m128 and_(__m128 a, __m128 b) { return _mm_and_ps(a, b); } Vc_INTRINSIC __m128d and_(__m128d a, __m128d b) { return _mm_and_pd(a, b); } Vc_INTRINSIC __m128i and_(__m128i a, __m128i b) { return _mm_and_si128(a, b); } Vc_INTRINSIC __m128 andnot_(__m128 a, __m128 b) { return _mm_andnot_ps(a, b); } Vc_INTRINSIC __m128d andnot_(__m128d a, __m128d b) { return _mm_andnot_pd(a, b); } Vc_INTRINSIC __m128i andnot_(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); } Vc_INTRINSIC __m128 not_(__m128 a) { return andnot_(a, allone<__m128 >()); } Vc_INTRINSIC __m128d not_(__m128d a) { return andnot_(a, allone<__m128d>()); } Vc_INTRINSIC __m128i not_(__m128i a) { return andnot_(a, allone<__m128i>()); } Vc_INTRINSIC __m128 add(__m128 a, __m128 b, float) { return 
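// The arithmetic helpers in Detail take an extra, unused tag argument whose
// type (float, double, int, uint, short, ..., uchar) only selects the element
// width of the operation, e.g. add(a, b, short()) maps to _mm_add_epi16.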
_mm_add_ps(a, b); } Vc_INTRINSIC __m128d add(__m128d a, __m128d b, double) { return _mm_add_pd(a, b); } Vc_INTRINSIC __m128i add(__m128i a, __m128i b, int) { return _mm_add_epi32(a, b); } Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uint) { return _mm_add_epi32(a, b); } Vc_INTRINSIC __m128i add(__m128i a, __m128i b, short) { return _mm_add_epi16(a, b); } Vc_INTRINSIC __m128i add(__m128i a, __m128i b, ushort) { return _mm_add_epi16(a, b); } Vc_INTRINSIC __m128i add(__m128i a, __m128i b, schar) { return _mm_add_epi8 (a, b); } Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uchar) { return _mm_add_epi8 (a, b); } Vc_INTRINSIC __m128 sub(__m128 a, __m128 b, float) { return _mm_sub_ps(a, b); } Vc_INTRINSIC __m128d sub(__m128d a, __m128d b, double) { return _mm_sub_pd(a, b); } Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, int) { return _mm_sub_epi32(a, b); } Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uint) { return _mm_sub_epi32(a, b); } Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, short) { return _mm_sub_epi16(a, b); } Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, ushort) { return _mm_sub_epi16(a, b); } Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, schar) { return _mm_sub_epi8 (a, b); } Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uchar) { return _mm_sub_epi8 (a, b); } Vc_INTRINSIC __m128 mul(__m128 a, __m128 b, float) { return _mm_mul_ps(a, b); } Vc_INTRINSIC __m128d mul(__m128d a, __m128d b, double) { return _mm_mul_pd(a, b); } Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, int) { #ifdef Vc_IMPL_SSE4_1 return _mm_mullo_epi32(a, b); #else const __m128i aShift = _mm_srli_si128(a, 4); const __m128i ab02 = _mm_mul_epu32(a, b); const __m128i bShift = _mm_srli_si128(b, 4); const __m128i ab13 = _mm_mul_epu32(aShift, bShift); return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8)); #endif } Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uint) { return mul(a, b, int()); } Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, short) { return _mm_mullo_epi16(a, b); } Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, ushort) { return _mm_mullo_epi16(a, b); } Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, schar) { #ifdef Vc_USE_BUILTIN_VECTOR_TYPES using B = Common::BuiltinType; const auto x = aliasing_cast(a) * aliasing_cast(b); return reinterpret_cast(x); #else return or_( and_(_mm_mullo_epi16(a, b), _mm_slli_epi16(allone<__m128i>(), 8)), _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8)); #endif } Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uchar) { #ifdef Vc_USE_BUILTIN_VECTOR_TYPES using B = Common::BuiltinType; const auto x = aliasing_cast(a) * aliasing_cast(b); return reinterpret_cast(x); #else return or_( and_(_mm_mullo_epi16(a, b), _mm_slli_epi16(allone<__m128i>(), 8)), _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8)); #endif } Vc_INTRINSIC __m128 div(__m128 a, __m128 b, float) { return _mm_div_ps(a, b); } Vc_INTRINSIC __m128d div(__m128d a, __m128d b, double) { return _mm_div_pd(a, b); } Vc_INTRINSIC __m128 min(__m128 a, __m128 b, float) { return _mm_min_ps(a, b); } Vc_INTRINSIC __m128d min(__m128d a, __m128d b, double) { return _mm_min_pd(a, b); } Vc_INTRINSIC __m128i min(__m128i a, __m128i b, int) { return SSE::min_epi32(a, b); } Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uint) { return SSE::min_epu32(a, b); } Vc_INTRINSIC __m128i min(__m128i a, __m128i b, short) { return _mm_min_epi16(a, b); } Vc_INTRINSIC __m128i min(__m128i a, __m128i b, ushort) { return SSE::min_epu16(a, b); } Vc_INTRINSIC __m128i 
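// SSE2 only has native min/max for unsigned 8-bit and signed 16-bit elements;
// the other combinations go through the SSE::min_*/max_* helpers above, which
// emulate them with a compare followed by a blend.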
min(__m128i a, __m128i b, schar) { return SSE::min_epi8 (a, b); } Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uchar) { return _mm_min_epu8 (a, b); } Vc_INTRINSIC __m128 max(__m128 a, __m128 b, float) { return _mm_max_ps(a, b); } Vc_INTRINSIC __m128d max(__m128d a, __m128d b, double) { return _mm_max_pd(a, b); } Vc_INTRINSIC __m128i max(__m128i a, __m128i b, int) { return SSE::max_epi32(a, b); } Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uint) { return SSE::max_epu32(a, b); } Vc_INTRINSIC __m128i max(__m128i a, __m128i b, short) { return _mm_max_epi16(a, b); } Vc_INTRINSIC __m128i max(__m128i a, __m128i b, ushort) { return SSE::max_epu16(a, b); } Vc_INTRINSIC __m128i max(__m128i a, __m128i b, schar) { return SSE::max_epi8 (a, b); } Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uchar) { return _mm_max_epu8 (a, b); } Vc_INTRINSIC float add(__m128 a, float) { a = _mm_add_ps(a, _mm_movehl_ps(a, a)); a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtss_f32(a); } Vc_INTRINSIC double add(__m128d a, double) { a = _mm_add_sd(a, _mm_unpackhi_pd(a, a)); return _mm_cvtsd_f64(a); } Vc_INTRINSIC int add(__m128i a, int) { a = add(a, _mm_srli_si128(a, 8), int()); a = add(a, _mm_srli_si128(a, 4), int()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC uint add(__m128i a, uint) { return add(a, int()); } Vc_INTRINSIC short add(__m128i a, short) { a = add(a, _mm_srli_si128(a, 8), short()); a = add(a, _mm_srli_si128(a, 4), short()); a = add(a, _mm_srli_si128(a, 2), short()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC ushort add(__m128i a, ushort) { return add(a, short()); } Vc_INTRINSIC schar add(__m128i a, schar) { a = add(a, _mm_srli_si128(a, 8), schar()); a = add(a, _mm_srli_si128(a, 4), schar()); a = add(a, _mm_srli_si128(a, 2), schar()); a = add(a, _mm_srli_si128(a, 1), schar()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC uchar add(__m128i a, uchar) { return add(a, schar()); } Vc_INTRINSIC float mul(__m128 a, float) { a = _mm_mul_ps(a, _mm_movehl_ps(a, a)); a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtss_f32(a); } Vc_INTRINSIC double mul(__m128d a, double) { a = _mm_mul_sd(a, _mm_unpackhi_pd(a, a)); return _mm_cvtsd_f64(a); } Vc_INTRINSIC int mul(__m128i a, int) { a = mul(a, _mm_srli_si128(a, 8), int()); a = mul(a, _mm_srli_si128(a, 4), int()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC uint mul(__m128i a, uint) { return mul(a, int()); } Vc_INTRINSIC short mul(__m128i a, short) { a = mul(a, _mm_srli_si128(a, 8), short()); a = mul(a, _mm_srli_si128(a, 4), short()); a = mul(a, _mm_srli_si128(a, 2), short()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC ushort mul(__m128i a, ushort) { return mul(a, short()); } Vc_INTRINSIC schar mul(__m128i a, schar) { const __m128i s0 = _mm_srai_epi16(a, 1); const __m128i s1 = Detail::and_(a, _mm_set1_epi32(0x0f0f0f0f)); return mul(mul(s0, s1, short()), short()); } Vc_INTRINSIC uchar mul(__m128i a, uchar) { return mul(a, schar()); } Vc_INTRINSIC float min(__m128 a, float) { a = _mm_min_ps(a, _mm_movehl_ps(a, a)); a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtss_f32(a); } Vc_INTRINSIC double min(__m128d a, double) { a = _mm_min_sd(a, _mm_unpackhi_pd(a, a)); return _mm_cvtsd_f64(a); } Vc_INTRINSIC int min(__m128i a, int) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC uint min(__m128i a, uint) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 
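// These scalar reductions fold the register against shuffled copies of itself.
// For 8-bit element types the shuffles only reach 16-bit granularity, so the
// final step compares the two remaining bytes in scalar code (std::min or
// std::max on values extracted with _mm_cvtsi128_si32).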
3, 2)), uint()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC short min(__m128i a, short) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC ushort min(__m128i a, ushort) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC schar min(__m128i a, schar) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar()); return std::min(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a))); } Vc_INTRINSIC uchar min(__m128i a, uchar) { a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar()); a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar()); return std::min((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff); } Vc_INTRINSIC float max(__m128 a, float) { a = _mm_max_ps(a, _mm_movehl_ps(a, a)); a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtss_f32(a); } Vc_INTRINSIC double max(__m128d a, double) { a = _mm_max_sd(a, _mm_unpackhi_pd(a, a)); return _mm_cvtsd_f64(a); } Vc_INTRINSIC int max(__m128i a, int) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC uint max(__m128i a, uint) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC short max(__m128i a, short) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC ushort max(__m128i a, ushort) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort()); return _mm_cvtsi128_si32(a); } Vc_INTRINSIC schar max(__m128i a, schar) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar()); return std::max(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a))); } Vc_INTRINSIC uchar max(__m128i a, uchar) { a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar()); a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar()); return std::max((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff); } template Vc_CONST_L SSE::Vector Vc_VDECL sorted(SSE::Vector x) Vc_CONST_R; template Vc_INTRINSIC Vc_CONST SSE::Vector sorted(SSE::Vector x) { static_assert(!CurrentImplementation::is(ScalarImpl), "Detail::sorted can only be instantiated if a non-Scalar " 
"implementation is selected."); return sorted < CurrentImplementation::is_between(SSE2Impl, SSSE3Impl) ? SSE2Impl : CurrentImplementation::is_between(SSE41Impl, SSE42Impl) ? SSE41Impl : CurrentImplementation::current() > (x); } template constexpr int sanitize(int n) { return (n >= int(sizeof(V)) || n <= -int(sizeof(V))) ? 0 : n; } template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> rotated(V v, int amount) { using namespace SSE; switch (static_cast(amount) % N) { case 0: return v; case 1: return sse_cast(_mm_alignr_epi8(v, v, sanitize(1 * sizeof(T)))); case 2: return sse_cast(_mm_alignr_epi8(v, v, sanitize(2 * sizeof(T)))); case 3: return sse_cast(_mm_alignr_epi8(v, v, sanitize(3 * sizeof(T)))); case 4: return sse_cast(_mm_alignr_epi8(v, v, sanitize(4 * sizeof(T)))); case 5: return sse_cast(_mm_alignr_epi8(v, v, sanitize(5 * sizeof(T)))); case 6: return sse_cast(_mm_alignr_epi8(v, v, sanitize(6 * sizeof(T)))); case 7: return sse_cast(_mm_alignr_epi8(v, v, sanitize(7 * sizeof(T)))); } return sse_cast(_mm_setzero_si128()); } template struct InterleaveImpl; template struct InterleaveImpl { template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1) { const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data()); const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data()); #ifdef __x86_64__ const long long tmp00 = _mm_cvtsi128_si64(tmp0); const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0)); const long long tmp10 = _mm_cvtsi128_si64(tmp1); const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1)); aliasing_cast(data[i[0]]) = tmp00; aliasing_cast(data[i[1]]) = tmp00 >> 32; aliasing_cast(data[i[2]]) = tmp01; aliasing_cast(data[i[3]]) = tmp01 >> 32; aliasing_cast(data[i[4]]) = tmp10; aliasing_cast(data[i[5]]) = tmp10 >> 32; aliasing_cast(data[i[6]]) = tmp11; aliasing_cast(data[i[7]]) = tmp11 >> 32; #elif defined(Vc_IMPL_SSE4_1) using namespace SseIntrinsics; aliasing_cast(data[i[0]]) = _mm_cvtsi128_si32(tmp0); aliasing_cast(data[i[1]]) = extract_epi32<1>(tmp0); aliasing_cast(data[i[2]]) = extract_epi32<2>(tmp0); aliasing_cast(data[i[3]]) = extract_epi32<3>(tmp0); aliasing_cast(data[i[4]]) = _mm_cvtsi128_si32(tmp1); aliasing_cast(data[i[5]]) = extract_epi32<1>(tmp1); aliasing_cast(data[i[6]]) = extract_epi32<2>(tmp1); aliasing_cast(data[i[7]]) = extract_epi32<3>(tmp1); #else aliasing_cast(data[i[0]]) = _mm_cvtsi128_si32(tmp0); aliasing_cast(data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4)); aliasing_cast(data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8)); aliasing_cast(data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12)); aliasing_cast(data[i[4]]) = _mm_cvtsi128_si32(tmp1); aliasing_cast(data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4)); aliasing_cast(data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8)); aliasing_cast(data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12)); #endif } static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i, const typename V::AsArg v0, const typename V::AsArg v1) { const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data()); const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data()); V(tmp0).store(&data[i[0]], Vc::Unaligned); V(tmp1).store(&data[i[4]], Vc::Unaligned); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { #if 
defined Vc_USE_MASKMOV_SCATTER && !defined Vc_MSVC const __m64 mask = _mm_set_pi16(0, -1, -1, -1); const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data()); const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data()); const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); _mm_maskmove_si64(_mm_movepi64_pi64(tmp4), mask, reinterpret_cast(&data[i[0]])); _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp4, 8)), mask, reinterpret_cast(&data[i[1]])); _mm_maskmove_si64(_mm_movepi64_pi64(tmp5), mask, reinterpret_cast(&data[i[2]])); _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp5, 8)), mask, reinterpret_cast(&data[i[3]])); _mm_maskmove_si64(_mm_movepi64_pi64(tmp6), mask, reinterpret_cast(&data[i[4]])); _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp6, 8)), mask, reinterpret_cast(&data[i[5]])); _mm_maskmove_si64(_mm_movepi64_pi64(tmp7), mask, reinterpret_cast(&data[i[6]])); _mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp7, 8)), mask, reinterpret_cast(&data[i[7]])); _mm_empty(); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data()); const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data()); const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7)); } static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data()); const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data()); const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data()); const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data()); const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2); const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2); const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3); V(tmp4).store(&data[i[0]], ::Vc::Unaligned); V(tmp5).store(&data[i[2]], ::Vc::Unaligned); V(tmp6).store(&data[i[4]], ::Vc::Unaligned); V(tmp7).store(&data[i[6]], ::Vc::Unaligned); } template static inline void interleave(typename V::EntryType *const data, 
const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); v4.scatter(data + 4, i); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6, v7); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1) { const __m128i a = _mm_cvtsi32_si128(*aliasing_cast(&data[i[0]])); const __m128i b = _mm_cvtsi32_si128(*aliasing_cast(&data[i[1]])); const __m128i c = _mm_cvtsi32_si128(*aliasing_cast(&data[i[2]])); const __m128i d = _mm_cvtsi32_si128(*aliasing_cast(&data[i[3]])); const __m128i e = _mm_cvtsi32_si128(*aliasing_cast(&data[i[4]])); const __m128i f = _mm_cvtsi32_si128(*aliasing_cast(&data[i[5]])); const __m128i g = _mm_cvtsi32_si128(*aliasing_cast(&data[i[6]])); const __m128i h = _mm_cvtsi32_si128(*aliasing_cast(&data[i[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); const __m128i tmp3 = _mm_unpacklo_epi16(c, g); const __m128i tmp4 = _mm_unpacklo_epi16(b, f); const __m128i tmp5 = _mm_unpacklo_epi16(d, h); const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2) { const __m128i a = _mm_loadl_epi64(reinterpret_cast(&data[i[0]])); const __m128i b = _mm_loadl_epi64(reinterpret_cast(&data[i[1]])); const __m128i c = _mm_loadl_epi64(reinterpret_cast(&data[i[2]])); const __m128i d = _mm_loadl_epi64(reinterpret_cast(&data[i[3]])); const __m128i e = _mm_loadl_epi64(reinterpret_cast(&data[i[4]])); const __m128i f = _mm_loadl_epi64(reinterpret_cast(&data[i[5]])); const __m128i g = _mm_loadl_epi64(reinterpret_cast(&data[i[6]])); const __m128i h = _mm_loadl_epi64(reinterpret_cast(&data[i[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); const __m128i tmp4 = _mm_unpacklo_epi16(b, f); const __m128i tmp3 = _mm_unpacklo_epi16(c, g); const __m128i tmp5 = _mm_unpacklo_epi16(d, h); const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); } template static inline void deinterleave(typename 
V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3) { const __m128i a = _mm_loadl_epi64(reinterpret_cast(&data[i[0]])); const __m128i b = _mm_loadl_epi64(reinterpret_cast(&data[i[1]])); const __m128i c = _mm_loadl_epi64(reinterpret_cast(&data[i[2]])); const __m128i d = _mm_loadl_epi64(reinterpret_cast(&data[i[3]])); const __m128i e = _mm_loadl_epi64(reinterpret_cast(&data[i[4]])); const __m128i f = _mm_loadl_epi64(reinterpret_cast(&data[i[5]])); const __m128i g = _mm_loadl_epi64(reinterpret_cast(&data[i[6]])); const __m128i h = _mm_loadl_epi64(reinterpret_cast(&data[i[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); const __m128i tmp4 = _mm_unpacklo_epi16(b, f); const __m128i tmp3 = _mm_unpacklo_epi16(c, g); const __m128i tmp5 = _mm_unpacklo_epi16(d, h); const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { const __m128i a = _mm_loadu_si128(reinterpret_cast(&data[i[0]])); const __m128i b = _mm_loadu_si128(reinterpret_cast(&data[i[1]])); const __m128i c = _mm_loadu_si128(reinterpret_cast(&data[i[2]])); const __m128i d = _mm_loadu_si128(reinterpret_cast(&data[i[3]])); const __m128i e = _mm_loadu_si128(reinterpret_cast(&data[i[4]])); const __m128i f = _mm_loadu_si128(reinterpret_cast(&data[i[5]])); const __m128i g = _mm_loadu_si128(reinterpret_cast(&data[i[6]])); const __m128i h = _mm_loadu_si128(reinterpret_cast(&data[i[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); const __m128i tmp4 = _mm_unpacklo_epi16(b, f); const __m128i tmp3 = _mm_unpacklo_epi16(c, g); const __m128i tmp5 = _mm_unpacklo_epi16(d, h); const __m128i tmp10 = _mm_unpackhi_epi16(a, e); const __m128i tmp11 = _mm_unpackhi_epi16(c, g); const __m128i tmp12 = _mm_unpackhi_epi16(b, f); const __m128i tmp13 = _mm_unpackhi_epi16(d, h); const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { const __m128i a = _mm_loadu_si128(reinterpret_cast(&data[i[0]])); const __m128i b = _mm_loadu_si128(reinterpret_cast(&data[i[1]])); const __m128i c = _mm_loadu_si128(reinterpret_cast(&data[i[2]])); const __m128i d = _mm_loadu_si128(reinterpret_cast(&data[i[3]])); const __m128i e = _mm_loadu_si128(reinterpret_cast(&data[i[4]])); const __m128i f = _mm_loadu_si128(reinterpret_cast(&data[i[5]])); const __m128i g = _mm_loadu_si128(reinterpret_cast(&data[i[6]])); const __m128i h = _mm_loadu_si128(reinterpret_cast(&data[i[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); const __m128i tmp4 = _mm_unpacklo_epi16(b, f); const __m128i tmp3 = 
_mm_unpacklo_epi16(c, g); const __m128i tmp5 = _mm_unpacklo_epi16(d, h); const __m128i tmp10 = _mm_unpackhi_epi16(a, e); const __m128i tmp11 = _mm_unpackhi_epi16(c, g); const __m128i tmp12 = _mm_unpackhi_epi16(b, f); const __m128i tmp13 = _mm_unpackhi_epi16(d, h); const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { const __m128i a = _mm_loadu_si128(reinterpret_cast(&data[i[0]])); const __m128i b = _mm_loadu_si128(reinterpret_cast(&data[i[1]])); const __m128i c = _mm_loadu_si128(reinterpret_cast(&data[i[2]])); const __m128i d = _mm_loadu_si128(reinterpret_cast(&data[i[3]])); const __m128i e = _mm_loadu_si128(reinterpret_cast(&data[i[4]])); const __m128i f = _mm_loadu_si128(reinterpret_cast(&data[i[5]])); const __m128i g = _mm_loadu_si128(reinterpret_cast(&data[i[6]])); const __m128i h = _mm_loadu_si128(reinterpret_cast(&data[i[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); const __m128i tmp4 = _mm_unpacklo_epi16(b, f); const __m128i tmp3 = _mm_unpacklo_epi16(c, g); const __m128i tmp5 = _mm_unpacklo_epi16(d, h); const __m128i tmp10 = _mm_unpackhi_epi16(a, e); const __m128i tmp11 = _mm_unpackhi_epi16(c, g); const __m128i tmp12 = _mm_unpackhi_epi16(b, f); const __m128i tmp13 = _mm_unpackhi_epi16(d, h); const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { const __m128i a = _mm_loadu_si128(reinterpret_cast(&data[i[0]])); const __m128i b = _mm_loadu_si128(reinterpret_cast(&data[i[1]])); const __m128i c = _mm_loadu_si128(reinterpret_cast(&data[i[2]])); const __m128i d = _mm_loadu_si128(reinterpret_cast(&data[i[3]])); const __m128i e = _mm_loadu_si128(reinterpret_cast(&data[i[4]])); const __m128i f = _mm_loadu_si128(reinterpret_cast(&data[i[5]])); const __m128i g = _mm_loadu_si128(reinterpret_cast(&data[i[6]])); const __m128i h = _mm_loadu_si128(reinterpret_cast(&data[i[7]])); const __m128i tmp2 = _mm_unpacklo_epi16(a, e); const __m128i tmp4 = _mm_unpacklo_epi16(b, f); const __m128i tmp3 = _mm_unpacklo_epi16(c, g); const __m128i tmp5 = _mm_unpacklo_epi16(d, h); const __m128i tmp10 = 
_mm_unpackhi_epi16(a, e); const __m128i tmp11 = _mm_unpackhi_epi16(c, g); const __m128i tmp12 = _mm_unpackhi_epi16(b, f); const __m128i tmp13 = _mm_unpackhi_epi16(d, h); const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); v7.data() = _mm_unpackhi_epi16(tmp14, tmp15); } }; template struct InterleaveImpl { static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i, const typename V::AsArg v0, const typename V::AsArg v1) { const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); _mm_storeu_ps(aliasing_cast(&data[i[0]]), tmp0); _mm_storeu_ps(aliasing_cast(&data[i[2]]), tmp1); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1) { const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { #ifdef Vc_USE_MASKMOV_SCATTER const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data())); const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data())); const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data())); const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data())); const __m128i mask = _mm_set_epi32(0, -1, -1, -1); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast(&data[i[0]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast(&data[i[1]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast(&data[i[2]])); _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast(&data[i[3]])); #else const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1); 
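// Fallback without masked stores: the interleaved (v0, v1) pairs are written as
// 64-bit halves via _mm_storel_pi/_mm_storeh_pi, and v2 is then scattered into
// the third slot of each group (data + 2).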
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1); v2.scatter(data + 2, i); #endif } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data())); const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data())); const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data())); _mm_storeu_ps(aliasing_cast(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2)); _mm_storeu_ps(aliasing_cast(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0)); _mm_storeu_ps(aliasing_cast(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3)); _mm_storeu_ps(aliasing_cast(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1)); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); v4.scatter(data + 4, i); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6, v7); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1) { const __m128 a = _mm_castpd_ps(_mm_load_sd(aliasing_cast(&data[i[0]]))); const __m128 b = _mm_castpd_ps(_mm_load_sd(aliasing_cast(&data[i[1]]))); const __m128 c = _mm_castpd_ps(_mm_load_sd(aliasing_cast(&data[i[2]]))); const __m128 d = _mm_castpd_ps(_mm_load_sd(aliasing_cast(&data[i[3]]))); const __m128 tmp0 = _mm_unpacklo_ps(a, b); const __m128 tmp1 = _mm_unpacklo_ps(c, d); v0.data() = SSE::sse_cast(_mm_movelh_ps(tmp0, tmp1)); v1.data() = SSE::sse_cast(_mm_movehl_ps(tmp1, tmp0)); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2) { const __m128 a = _mm_loadu_ps(aliasing_cast(&data[i[0]])); const __m128 b = _mm_loadu_ps(aliasing_cast(&data[i[1]])); const __m128 c = _mm_loadu_ps(aliasing_cast(&data[i[2]])); const __m128 d = _mm_loadu_ps(aliasing_cast(&data[i[3]])); const __m128 tmp0 = _mm_unpacklo_ps(a, b); const __m128 tmp1 = _mm_unpacklo_ps(c, d); const __m128 tmp2 = _mm_unpackhi_ps(a, b); const __m128 tmp3 = _mm_unpackhi_ps(c, d); v0.data() = SSE::sse_cast(_mm_movelh_ps(tmp0, tmp1)); v1.data() = 
SSE::sse_cast(_mm_movehl_ps(tmp1, tmp0)); v2.data() = SSE::sse_cast(_mm_movelh_ps(tmp2, tmp3)); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3) { const __m128 a = _mm_loadu_ps(aliasing_cast(&data[i[0]])); const __m128 b = _mm_loadu_ps(aliasing_cast(&data[i[1]])); const __m128 c = _mm_loadu_ps(aliasing_cast(&data[i[2]])); const __m128 d = _mm_loadu_ps(aliasing_cast(&data[i[3]])); const __m128 tmp0 = _mm_unpacklo_ps(a, b); const __m128 tmp1 = _mm_unpacklo_ps(c, d); const __m128 tmp2 = _mm_unpackhi_ps(a, b); const __m128 tmp3 = _mm_unpackhi_ps(c, d); v0.data() = SSE::sse_cast(_mm_movelh_ps(tmp0, tmp1)); v1.data() = SSE::sse_cast(_mm_movehl_ps(tmp1, tmp0)); v2.data() = SSE::sse_cast(_mm_movelh_ps(tmp2, tmp3)); v3.data() = SSE::sse_cast(_mm_movehl_ps(tmp3, tmp2)); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { deinterleave(data, i, v0, v1, v2, v3); v4.gather(data + 4, i); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5, v6); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5, v6, v7); } }; template struct InterleaveImpl { template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1) { const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data()); const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data()); _mm_storeu_pd(&data[i[0]], tmp0); _mm_storeu_pd(&data[i[1]], tmp1); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { interleave(data, i, v0, v1); v2.scatter(data + 2, i); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { interleave(data, i, v0, v1); interleave(data + 2, i, v2, v3); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); v4.scatter(data + 4, i); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, 
v6); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6, v7); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1) { const __m128d a = _mm_loadu_pd(&data[i[0]]); const __m128d b = _mm_loadu_pd(&data[i[1]]); v0.data() = _mm_unpacklo_pd(a, b); v1.data() = _mm_unpackhi_pd(a, b); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2) { v2.gather(data + 2, i); deinterleave(data, i, v0, v1); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); v4.gather(data + 4, i); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); deinterleave(data + 4, i, v4, v5); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); deinterleave(data + 4, i, v4, v5); v6.gather(data + 6, i); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); deinterleave(data + 4, i, v4, v5); deinterleave(data + 6, i, v6, v7); } }; } } #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template Vc_INTRINSIC_L Vc_CONST_L int mask_count(__m128i) Vc_INTRINSIC_R Vc_CONST_R; template Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m128i) Vc_INTRINSIC_R Vc_CONST_R; template Vc_INTRINSIC_L Vc_CONST_L bool is_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R; template Vc_INTRINSIC_L Vc_CONST_L bool is_not_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R; } using SSE::sse_cast; template class Mask { using abi = VectorAbi::Sse; friend class Mask< double, abi>; friend class Mask< float, abi>; friend class Mask< int32_t, abi>; friend class Mask; friend class Mask< int16_t, abi>; friend class Mask; typedef Common::MaskBool MaskBool; typedef Common::Storage::Size> Storage; public: typedef bool EntryType; using value_type = EntryType; using EntryReference = Detail::ElementReference; using reference = EntryReference; typedef MaskBool VectorEntryType; using VectorType = typename Storage::VectorType; using Vector = SSE::Vector; public: Vc_FREE_STORE_OPERATORS_ALIGNED(16); static constexpr size_t Size = SSE::VectorTraits::Size; static constexpr size_t MemoryAlignment = Size; static constexpr std::size_t size() { return Size; } #if defined Vc_MSVC && defined _WIN32 typedef const Mask &Argument; #else typedef Mask Argument; #endif Vc_INTRINSIC Mask() = default; Vc_INTRINSIC Mask(const Mask &) = default; Vc_INTRINSIC Mask &operator=(const Mask &) = default; Vc_INTRINSIC Mask(const __m128 &x) : d(sse_cast(x)) 
{} Vc_INTRINSIC Mask(const __m128d &x) : d(sse_cast(x)) {} Vc_INTRINSIC Mask(const __m128i &x) : d(sse_cast(x)) {} Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : Mask(_mm_setzero_ps()) {} Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : Mask(SSE::_mm_setallone_ps()) {} Vc_INTRINSIC explicit Mask(bool b) : Mask(b ? SSE::_mm_setallone_ps() : _mm_setzero_ps()) {} Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; } Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; } template Vc_INTRINSIC Mask( U &&rhs, Common::enable_if_mask_converts_implicitly = nullarg) : d(sse_cast( Detail::mask_cast::value, Size, __m128>( rhs.dataI()))) { } #if Vc_IS_VERSION_1 template Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "mask types") Vc_INTRINSIC explicit Mask(U &&rhs, Common::enable_if_mask_converts_explicitly = nullarg); #endif Vc_ALWAYS_INLINE explicit Mask(const bool *mem) { load(mem); } template Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags f) { load(mem, f); } Vc_ALWAYS_INLINE_L void load(const bool *mem) Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { load(mem); } Vc_ALWAYS_INLINE_L void store(bool *) const Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { store(mem); } Vc_ALWAYS_INLINE Vc_PURE bool operator==(const Mask &rhs) const { return Detail::is_equal(dataF(), rhs.dataF()); } Vc_ALWAYS_INLINE Vc_PURE bool operator!=(const Mask &rhs) const { return Detail::is_not_equal(dataF(), rhs.dataF()); } Vc_ALWAYS_INLINE Vc_PURE Mask operator!() const { #ifdef Vc_GCC return ~dataI(); #else return _mm_andnot_si128(dataI(), SSE::_mm_setallone_si128()); #endif } Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { d.v() = SSE::sse_cast(_mm_and_ps(dataF(), rhs.dataF())); return *this; } Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { d.v() = SSE::sse_cast(_mm_or_ps (dataF(), rhs.dataF())); return *this; } Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { d.v() = SSE::sse_cast(_mm_xor_ps(dataF(), rhs.dataF())); return *this; } Vc_ALWAYS_INLINE Vc_PURE Mask operator&(const Mask &rhs) const { return _mm_and_ps(dataF(), rhs.dataF()); } Vc_ALWAYS_INLINE Vc_PURE Mask operator|(const Mask &rhs) const { return _mm_or_ps (dataF(), rhs.dataF()); } Vc_ALWAYS_INLINE Vc_PURE Mask operator^(const Mask &rhs) const { return _mm_xor_ps(dataF(), rhs.dataF()); } Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &rhs) const { return _mm_and_ps(dataF(), rhs.dataF()); } Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &rhs) const { return _mm_or_ps (dataF(), rhs.dataF()); } Vc_ALWAYS_INLINE Vc_PURE bool isFull () const { return #ifdef Vc_USE_PTEST _mm_testc_si128(dataI(), SSE::_mm_setallone_si128()); #else _mm_movemask_epi8(dataI()) == 0xffff; #endif } Vc_ALWAYS_INLINE Vc_PURE bool isNotEmpty() const { return #ifdef Vc_USE_PTEST 0 == _mm_testz_si128(dataI(), dataI()); #else _mm_movemask_epi8(dataI()) != 0x0000; #endif } Vc_ALWAYS_INLINE Vc_PURE bool isEmpty() const { return #ifdef Vc_USE_PTEST 0 != _mm_testz_si128(dataI(), dataI()); #else _mm_movemask_epi8(dataI()) == 0x0000; #endif } Vc_ALWAYS_INLINE Vc_PURE bool isMix() const { #ifdef Vc_USE_PTEST return _mm_test_mix_ones_zeros(dataI(), SSE::_mm_setallone_si128()); #else const int tmp = _mm_movemask_epi8(dataI()); return tmp != 0 && (tmp ^ 0xffff) != 0; #endif } Vc_ALWAYS_INLINE Vc_PURE int shiftMask() const { return _mm_movemask_epi8(dataI()); } Vc_ALWAYS_INLINE Vc_PURE int toInt() const { return Detail::mask_to_int(dataI()); } 
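// Illustrative usage sketch, assuming the public Vc typedef float_v and the
// lane-wise comparison operators defined elsewhere in the library; the query
// functions used below are the members declared in this class:
//
//   Vc::float_v x = ...;
//   auto m = x > 0.f;            // per-lane comparison yields a Mask
//   if (m.isFull())  { /* every lane is positive */ }
//   if (m.isEmpty()) { /* no lane is positive    */ }
//   const int set  = m.count();  // number of true lanes
//   const int bits = m.toInt();  // movemask-style bit pattern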
Vc_ALWAYS_INLINE Vc_PURE VectorType data() const { return d.v(); } Vc_ALWAYS_INLINE Vc_PURE __m128 dataF() const { return SSE::sse_cast<__m128 >(d.v()); } Vc_ALWAYS_INLINE Vc_PURE __m128i dataI() const { return SSE::sse_cast<__m128i>(d.v()); } Vc_ALWAYS_INLINE Vc_PURE __m128d dataD() const { return SSE::sse_cast<__m128d>(d.v()); } private: friend reference; static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept { return m.toInt() & (1 << i); } template static Vc_INTRINSIC void set(Mask &m, int i, U &&v) noexcept(noexcept(MaskBool(std::declval()))) { m.d.set(i, MaskBool(std::forward(v))); } public: Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept { return {*this, int(index)}; } Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept { return get(*this, index); } Vc_ALWAYS_INLINE Vc_PURE int count() const { return Detail::mask_count(dataI()); } Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R; template static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R; Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R; private: #ifdef Vc_COMPILE_BENCHMARKS public: #endif Storage d; }; template constexpr size_t Mask::Size; template constexpr size_t Mask::MemoryAlignment; } namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template<> Vc_INTRINSIC Vc_CONST int mask_count<2>(__m128i k) { int mask = _mm_movemask_pd(_mm_castsi128_pd(k)); return (mask & 1) + (mask >> 1); } template<> Vc_INTRINSIC Vc_CONST int mask_count<4>(__m128i k) { #ifdef Vc_IMPL_POPCNT return _mm_popcnt_u32(_mm_movemask_ps(_mm_castsi128_ps(k))); #else auto x = _mm_srli_epi32(k, 31); x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3))); x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_cvtsi128_si32(x); #endif } template<> Vc_INTRINSIC Vc_CONST int mask_count<8>(__m128i k) { #ifdef Vc_IMPL_POPCNT return _mm_popcnt_u32(_mm_movemask_epi8(k)) / 2; #else auto x = _mm_srli_epi16(k, 15); x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3))); x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3))); x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1))); return _mm_extract_epi16(x, 0); #endif } template<> Vc_INTRINSIC Vc_CONST int mask_count<16>(__m128i k) { return Detail::popcnt16(_mm_movemask_epi8(k)); } template<> Vc_INTRINSIC Vc_CONST int mask_to_int<2>(__m128i k) { return _mm_movemask_pd(_mm_castsi128_pd(k)); } template<> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m128i k) { return _mm_movemask_ps(_mm_castsi128_ps(k)); } template<> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m128i k) { return _mm_movemask_epi8(_mm_packs_epi16(k, _mm_setzero_si128())); } template<> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m128i k) { return _mm_movemask_epi8(k); } template Vc_ALWAYS_INLINE void mask_store(__m128i k, bool *mem); template <> Vc_ALWAYS_INLINE void mask_store<16>(__m128i k, bool *mem) { _mm_store_si128(reinterpret_cast<__m128i *>(mem), _mm_and_si128(k, _mm_set1_epi8(1))); } template <> Vc_ALWAYS_INLINE void mask_store<8>(__m128i k, bool *mem) { k = _mm_srli_epi16(k, 15); const auto k2 = _mm_packs_epi16(k, _mm_setzero_si128()); #ifdef __x86_64__ *aliasing_cast(mem) = _mm_cvtsi128_si64(k2); #else _mm_store_sd(aliasing_cast(mem), _mm_castsi128_pd(k2)); #endif } template <> Vc_ALWAYS_INLINE void mask_store<4>(__m128i k, bool *mem) { *aliasing_cast(mem) = _mm_cvtsi128_si32( _mm_packs_epi16(_mm_srli_epi16(_mm_packs_epi32(k, _mm_setzero_si128()), 
15), _mm_setzero_si128())); } template <> Vc_ALWAYS_INLINE void mask_store<2>(__m128i k, bool *mem) { mem[0] = -SseIntrinsics::extract_epi32<1>(k); mem[1] = -SseIntrinsics::extract_epi32<3>(k); } template Vc_ALWAYS_INLINE __m128 mask_load(const bool *mem); template<> Vc_ALWAYS_INLINE __m128 mask_load<16>(const bool *mem) { return sse_cast<__m128>(_mm_cmpgt_epi8( _mm_load_si128(reinterpret_cast(mem)), _mm_setzero_si128())); } template<> Vc_ALWAYS_INLINE __m128 mask_load<8>(const bool *mem) { #ifdef __x86_64__ __m128i k = _mm_cvtsi64_si128(*reinterpret_cast(mem)); #else __m128i k = _mm_castpd_si128(_mm_load_sd(reinterpret_cast(mem))); #endif return sse_cast<__m128>(_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128())); } template<> Vc_ALWAYS_INLINE __m128 mask_load<4>(const bool *mem) { __m128i k = _mm_cvtsi32_si128(*reinterpret_cast(mem)); k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()); return sse_cast<__m128>(_mm_unpacklo_epi16(k, k)); } template<> Vc_ALWAYS_INLINE __m128 mask_load<2>(const bool *mem) { return sse_cast<__m128>( _mm_set_epi32(-int(mem[1]), -int(mem[1]), -int(mem[0]), -int(mem[0]))); } template <> Vc_INTRINSIC Vc_CONST bool is_equal<2>(__m128 k1, __m128 k2) { return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2)); } template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<2>(__m128 k1, __m128 k2) { return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2)); } template <> Vc_INTRINSIC Vc_CONST bool is_equal<4>(__m128 k1, __m128 k2) { return _mm_movemask_ps(k1) == _mm_movemask_ps(k2); } template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<4>(__m128 k1, __m128 k2) { return _mm_movemask_ps(k1) != _mm_movemask_ps(k2); } template <> Vc_INTRINSIC Vc_CONST bool is_equal<8>(__m128 k1, __m128 k2) { return _mm_movemask_epi8(_mm_castps_si128(k1)) == _mm_movemask_epi8(_mm_castps_si128(k2)); } template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<8>(__m128 k1, __m128 k2) { return _mm_movemask_epi8(_mm_castps_si128(k1)) != _mm_movemask_epi8(_mm_castps_si128(k2)); } template <> Vc_INTRINSIC Vc_CONST bool is_equal<16>(__m128 k1, __m128 k2) { return _mm_movemask_epi8(_mm_castps_si128(k1)) == _mm_movemask_epi8(_mm_castps_si128(k2)); } template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<16>(__m128 k1, __m128 k2) { return _mm_movemask_epi8(_mm_castps_si128(k1)) != _mm_movemask_epi8(_mm_castps_si128(k2)); } } template<> Vc_ALWAYS_INLINE void SSE::double_m::store(bool *mem) const { *aliasing_cast(mem) = _mm_movemask_epi8(dataI()) & 0x0101; } template Vc_ALWAYS_INLINE void Mask::store(bool *mem) const { Detail::mask_store(dataI(), mem); } template<> Vc_ALWAYS_INLINE void SSE::double_m::load(const bool *mem) { d.set(0, MaskBool(mem[0])); d.set(1, MaskBool(mem[1])); } template Vc_ALWAYS_INLINE void Mask::load(const bool *mem) { d.v() = sse_cast(Detail::mask_load(mem)); } template <> Vc_INTRINSIC Vc_PURE bool SSE::short_m::get(const SSE::short_m &m, int index) noexcept { return m.shiftMask() & (1 << 2 * index); } template <> Vc_INTRINSIC Vc_PURE bool SSE::ushort_m::get(const SSE::ushort_m &m, int index) noexcept { return m.shiftMask() & (1 << 2 * index); } template Vc_ALWAYS_INLINE Vc_PURE int Mask::firstOne() const { const int mask = toInt(); #ifdef _MSC_VER unsigned long bit; _BitScanForward(&bit, mask); #else int bit; __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask)); #endif return bit; } template Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { return _mm_set_epi64x(gen(1) ? 0xffffffffffffffffull : 0, gen(0) ? 
0xffffffffffffffffull : 0); } template Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { return _mm_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0, gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0); } template Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { return _mm_setr_epi16(gen(0) ? 0xffffu : 0, gen(1) ? 0xffffu : 0, gen(2) ? 0xffffu : 0, gen(3) ? 0xffffu : 0, gen(4) ? 0xffffu : 0, gen(5) ? 0xffffu : 0, gen(6) ? 0xffffu : 0, gen(7) ? 0xffffu : 0); } template template Vc_INTRINSIC Mask Mask::generate(G &&gen) { return generate_impl>(std::forward(gen), std::integral_constant()); } template Vc_INTRINSIC Vc_PURE Mask Mask::shifted(int amount) const { switch (amount * int(sizeof(VectorEntryType))) { case 0: return *this; case 1: return Detail::shifted< 1>(dataI()); case 2: return Detail::shifted< 2>(dataI()); case 3: return Detail::shifted< 3>(dataI()); case 4: return Detail::shifted< 4>(dataI()); case 5: return Detail::shifted< 5>(dataI()); case 6: return Detail::shifted< 6>(dataI()); case 7: return Detail::shifted< 7>(dataI()); case 8: return Detail::shifted< 8>(dataI()); case 9: return Detail::shifted< 9>(dataI()); case 10: return Detail::shifted< 10>(dataI()); case 11: return Detail::shifted< 11>(dataI()); case 12: return Detail::shifted< 12>(dataI()); case 13: return Detail::shifted< 13>(dataI()); case 14: return Detail::shifted< 14>(dataI()); case 15: return Detail::shifted< 15>(dataI()); case 16: return Detail::shifted< 16>(dataI()); case -1: return Detail::shifted< -1>(dataI()); case -2: return Detail::shifted< -2>(dataI()); case -3: return Detail::shifted< -3>(dataI()); case -4: return Detail::shifted< -4>(dataI()); case -5: return Detail::shifted< -5>(dataI()); case -6: return Detail::shifted< -6>(dataI()); case -7: return Detail::shifted< -7>(dataI()); case -8: return Detail::shifted< -8>(dataI()); case -9: return Detail::shifted< -9>(dataI()); case -10: return Detail::shifted<-10>(dataI()); case -11: return Detail::shifted<-11>(dataI()); case -12: return Detail::shifted<-12>(dataI()); case -13: return Detail::shifted<-13>(dataI()); case -14: return Detail::shifted<-14>(dataI()); case -15: return Detail::shifted<-15>(dataI()); case -16: return Detail::shifted<-16>(dataI()); } return Zero(); } } #endif #include #include #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif namespace Vc_VERSIONED_NAMESPACE { #define Vc_CURRENT_CLASS_NAME Vector template class Vector { static_assert(std::is_arithmetic::value, "Vector only accepts arithmetic builtin types as template parameter T."); protected: #ifdef Vc_COMPILE_BENCHMARKS public: #endif typedef typename SSE::VectorTraits::StorageType StorageType; StorageType d; typedef typename SSE::VectorTraits::GatherMaskType GatherMask; typedef SSE::VectorHelper::VectorType> HV; typedef SSE::VectorHelper HT; public: Vc_FREE_STORE_OPERATORS_ALIGNED(16); typedef typename SSE::VectorTraits::VectorType VectorType; using vector_type = VectorType; static constexpr size_t Size = SSE::VectorTraits::Size; static constexpr size_t MemoryAlignment = alignof(VectorType); typedef typename SSE::VectorTraits::EntryType EntryType; using value_type = EntryType; using VectorEntryType = EntryType; using IndexType = fixed_size_simd; using index_type = IndexType; typedef typename SSE::VectorTraits::MaskType Mask; using MaskType = Mask; using mask_type = Mask; typedef typename Mask::Argument MaskArg; typedef typename Mask::Argument MaskArgument; typedef const Vector AsArg; using abi = VectorAbi::Sse; 
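// WriteMaskedVector (aliased just below) is the proxy returned by
// operator()(const Mask &); assignments through it touch only the lanes whose
// mask bit is set. Illustrative sketch, assuming the public Vc typedefs
// float_v and float_m and the lane-wise comparison operators defined elsewhere:
//
//   Vc::float_v v = ...;
//   Vc::float_m m = v < 0.f;
//   v(m) = 0.f;                  // zero only the negative lanes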
using WriteMaskedVector = Common::WriteMaskedVector; template using V = Vector; using reference = Detail::ElementReference; public: Vc_INTRINSIC Vector() = default; static constexpr std::size_t size() { return Size; } explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R; static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); } static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); } static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero() { return Vector(Vc::IndexesFromZero); } template ()(size_t())), value_type>::value>::type> explicit Vector(G &&g) : Vector(generate(std::forward(g))) { } static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R; Vc_ALWAYS_INLINE Vector(VectorType x) : d(x) {} template Vc_INTRINSIC Vector( V x, typename std::enable_if::value, void *>::type = nullptr) : d(SSE::convert(x.data())) { } #if Vc_IS_VERSION_1 template Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "vector types") Vc_INTRINSIC explicit Vector( V x, typename std::enable_if::value, void *>::type = nullptr) : d(SSE::convert(x.data())) { } #endif Vc_INTRINSIC Vector(EntryType a) : d(HT::set(a)) {} template Vc_INTRINSIC Vector(U a, typename std::enable_if::value && !std::is_same::value, void *>::type = nullptr) : Vector(static_cast(a)) { } Vc_INTRINSIC explicit Vector(reference a) : Vector(static_cast(a)) {} explicit Vc_INTRINSIC Vector(const EntryType *mem) { load(mem); } template ::value>> explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags) { load(mem, flags); } template ::value || !std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value &&Traits::is_load_store_flag::value>> explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags()) { load(x, flags); } Vc_INTRINSIC void load(const EntryType *mem) { load(mem, DefaultLoadTag()); } template Vc_INTRINSIC enable_if::value, void> load(const EntryType *mem, Flags flags) { load(mem, flags); } private: template struct load_concept : public std::enable_if< (!std::is_integral::value || !std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value && Traits::is_load_store_flag::value, void> {}; public: template Vc_INTRINSIC_L typename load_concept::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R; template < typename U, typename Flags = DefaultStoreTag, typename = enable_if::value &&Traits::is_load_store_flag::value>> Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R; template < typename U, typename Flags = DefaultStoreTag, typename = enable_if::value &&Traits::is_load_store_flag::value>> Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R; Vc_INTRINSIC void store(EntryType *mem) const { store(mem, DefaultStoreTag()); } template ::value>> Vc_INTRINSIC void store(EntryType *mem, Flags flags) const { store(mem, flags); } Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const { store(mem, mask, DefaultStoreTag()); } template ::value>> Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const { store(mem, mask, flags); } Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R; Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan() 
Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan(const Mask &k) Vc_INTRINSIC_R; #ifndef Vc_CURRENT_CLASS_NAME #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors." #endif private: template inline void gatherImplementation(const Common::GatherArguments &); template inline void gatherImplementation(const Common::GatherArguments &, MaskArgument mask); public: #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that can be converted to the " \ "EntryType of this SIMD vector type."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_ private: template inline void scatterImplementation(MT *mem, IT &&indexes) const; template inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const; public: #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that the EntryType of this " \ "SIMD vector type can be converted to."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector 
must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes)); } template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes), mask); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args) const { scatter(args.address, args.indexes); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args, MaskArgument mask) const { scatter(args.address, args.indexes, mask); } #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_ #if defined Vc_IMPL_AVX2 && !defined Vc_MSVC template ::size(), class = enable_if<(Vector::size() >= size() && sizeof(T) >= 4)>> Vc_INTRINSIC void gatherImplementation( const Common::GatherArguments, Scale> &args) { d.v() = SSE::gather( args.address, simd_cast(args.indexes).data()); } template ::size(), class = enable_if<(Vector::size() >= size() && sizeof(T) >= 4)>> Vc_INTRINSIC void gatherImplementation( const Common::GatherArguments, Scale> &args, MaskArgument k) { d.v() = SSE::gather( d.v(), k.data(), args.address, simd_cast(args.indexes).data()); } template < class MT, class U, class A, int Scale, class = enable_if<(sizeof(T) == 2 && std::is_integral::value && (sizeof(MT) <= 2) && Vector::size() >= size())>> Vc_INTRINSIC void gatherImplementation( const Common::GatherArguments, Scale> &args) { using AVX2::int_v; const auto idx = simd_cast(args.indexes).data(); *this = simd_cast(int_v( AVX::gather(aliasing_cast(args.address), idx))); if (sizeof(MT) == 1) { if (std::is_signed::value) { d.v() = _mm_srai_epi16(_mm_slli_epi16(d.v(), 8), 8); } else { *this &= 0xff; } } } template < class MT, class U, class A, int Scale, class = enable_if<(sizeof(T) == 2 && std::is_integral::value && (sizeof(MT) <= 2) && Vector::size() >= size())>> Vc_INTRINSIC void gatherImplementation( const Common::GatherArguments, Scale> &args, MaskArgument k) { using AVX2::int_v; auto v = simd_cast(int_v(AVX::gather( _mm256_setzero_si256(), simd_cast(k).data(), aliasing_cast(args.address), simd_cast(args.indexes).data()))); if (sizeof(MT) == 1) { if (std::is_signed::value) { v.data() = _mm_srai_epi16(_mm_slli_epi16(v.data(), 8), 8); } else { v &= 0xff; } } assign(v, k); } template Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) && Traits::is_valid_vector_argument::value && !std::is_same::value && Vector::size() >= size()), void> gatherImplementation(const Common::GatherArguments, Scale> &args) { *this = simd_cast(fixed_size_simd(args)); } template Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) && Traits::is_valid_vector_argument::value && !std::is_same::value && Vector::size() >= size()), void> gatherImplementation(const Common::GatherArguments, Scale> &args, MaskArgument k) { assign(simd_cast(fixed_size_simd(args, k)), k); } #endif Vc_INTRINSIC Vector &operator++() { data() = HT::add(data(), HT::one()); return *this; } Vc_INTRINSIC Vector &operator--() { data() = HT::sub(data(), HT::one()); return *this; } Vc_INTRINSIC Vector operator++(int) { const Vector r = *this; data() = HT::add(data(), HT::one()); return r; } Vc_INTRINSIC 
Vector operator--(int) { const Vector r = *this; data() = HT::sub(data(), HT::one()); return r; } private: friend reference; Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept { return o.d.m(i); } template Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept( noexcept(std::declval() = v)) { o.d.set(i, v); } public: Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept { static_assert(noexcept(reference{std::declval(), int()}), ""); return {*this, int(index)}; } Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept { return d.m(index); } Vc_INTRINSIC_L Vector Vc_VDECL operator[](const SSE::int_v &perm) const Vc_INTRINSIC_R; Vc_INTRINSIC Vc_PURE Mask operator!() const { return *this == Zero(); } Vc_INTRINSIC Vc_PURE Vector operator~() const { #ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS static_assert(std::is_integral::value, "bit-complement can only be used with Vectors of integral type"); #endif return Detail::andnot_(data(), HV::allone()); } Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; } Vc_ALWAYS_INLINE Vector Vc_VDECL operator<< (AsArg shift) const { return generate([&](int i) { return get(*this, i) << get(shift, i); }); } Vc_ALWAYS_INLINE Vector Vc_VDECL operator>> (AsArg shift) const { return generate([&](int i) { return get(*this, i) >> get(shift, i); }); } Vc_ALWAYS_INLINE Vector &Vc_VDECL operator<<=(AsArg shift) { return *this = *this << shift; } Vc_ALWAYS_INLINE Vector &Vc_VDECL operator>>=(AsArg shift) { return *this = *this >> shift; } Vc_INTRINSIC_L Vector &Vc_VDECL operator<<=( int shift) Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector Vc_VDECL operator<< ( int shift) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector &Vc_VDECL operator>>=( int shift) Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector Vc_VDECL operator>> ( int shift) const Vc_INTRINSIC_R; Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask isNegative() const { return Vc::isnegative(*this); } Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &mask) { data() = HV::blend(data(), v.data(), mask.data()); } template Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE Vc_PURE V2 staticCast() const { return SSE::convert(data()); } template Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE Vc_PURE V2 reinterpretCast() const { return SSE::sse_cast(data()); } Vc_INTRINSIC WriteMaskedVector operator()(const Mask &k) { return {*this, k}; } Vc_ALWAYS_INLINE Vc_PURE VectorType &data() { return d.v(); } Vc_ALWAYS_INLINE Vc_PURE const VectorType &data() const { return d.v(); } template Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R; Vc_INTRINSIC EntryType min() const { return HT::min(data()); } Vc_INTRINSIC EntryType max() const { return HT::max(data()); } Vc_INTRINSIC EntryType product() const { return HT::mul(data()); } Vc_INTRINSIC EntryType sum() const { return HT::add(data()); } Vc_INTRINSIC_L Vector partialSum() const Vc_INTRINSIC_R; Vc_INTRINSIC_L EntryType min(MaskArg m) const Vc_INTRINSIC_R; Vc_INTRINSIC_L EntryType max(MaskArg m) const Vc_INTRINSIC_R; Vc_INTRINSIC_L EntryType product(MaskArg m) const Vc_INTRINSIC_R; Vc_INTRINSIC_L EntryType sum(MaskArg m) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vc_PURE_L Vector reversed() 
const Vc_INTRINSIC_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R; template void callWithValuesSorted(F &&f) { EntryType value = d.m(0); f(value); for (std::size_t i = 1; i < Size; ++i) { if (d.m(i) != value) { value = d.m(i); f(value); } } } template Vc_INTRINSIC void call(F &&f) const { Common::for_all_vector_entries([&](size_t i) { f(EntryType(d.m(i))); }); } template Vc_INTRINSIC void call(F &&f, const Mask &mask) const { for(size_t i : where(mask)) { f(EntryType(d.m(i))); } } template Vc_INTRINSIC Vector apply(F &&f) const { Vector r; Common::for_all_vector_entries( [&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); }); return r; } template Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const { Vector r(*this); for (size_t i : where(mask)) { r.d.set(i, f(EntryType(r.d.m(i)))); } return r; } template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { Common::for_all_vector_entries([&](size_t i) { d.set(i, f(i)); }); } Vc_INTRINSIC void fill(EntryType (&f)()) { Common::for_all_vector_entries([&](size_t i) { d.set(i, f()); }); } template static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R; Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector copySign(AsArg x) const { return Vc::copysign(*this, x); } Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const { return Vc::exponent(*this); } Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R; }; #undef Vc_CURRENT_CLASS_NAME template constexpr size_t Vector::Size; template constexpr size_t Vector::MemoryAlignment; static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v min(const SSE::int_v &x, const SSE::int_v &y) { return SSE::min_epi32(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v min(const SSE::uint_v &x, const SSE::uint_v &y) { return SSE::min_epu32(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v min(const SSE::short_v &x, const SSE::short_v &y) { return _mm_min_epi16(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v min(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::min_epu16(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v min(const SSE::float_v &x, const SSE::float_v &y) { return _mm_min_ps(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v min(const SSE::double_v &x, const SSE::double_v &y) { return _mm_min_pd(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v max(const SSE::int_v &x, const SSE::int_v &y) { return SSE::max_epi32(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v max(const SSE::uint_v &x, const SSE::uint_v &y) { return SSE::max_epu32(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v max(const SSE::short_v &x, const SSE::short_v &y) { return _mm_max_epi16(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v max(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::max_epu16(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v max(const SSE::float_v &x, const SSE::float_v &y) { return _mm_max_ps(x.data(), y.data()); } static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v max(const SSE::double_v &x, const SSE::double_v &y) { return _mm_max_pd(x.data(), y.data()); } template ::value || std::is_same::value || std::is_same::value || std::is_same::value>> Vc_ALWAYS_INLINE Vc_PURE Vector abs(Vector x) { return SSE::VectorHelper::abs(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE 
Vector sqrt (const Vector &x) { return SSE::VectorHelper::sqrt(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE Vector rsqrt(const Vector &x) { return SSE::VectorHelper::rsqrt(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE Vector reciprocal(const Vector &x) { return SSE::VectorHelper::reciprocal(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE Vector round(const Vector &x) { return SSE::VectorHelper::round(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::Mask isfinite(const Vector &x) { return SSE::VectorHelper::isFinite(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::Mask isinf(const Vector &x) { return SSE::VectorHelper::isInfinite(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::Mask isnan(const Vector &x) { return SSE::VectorHelper::isNaN(x.data()); } #define Vc_CONDITIONAL_ASSIGN(name_,op_) \ template \ Vc_INTRINSIC enable_if conditional_assign( \ Vector &lhs, M &&mask, U &&rhs) \ { \ lhs(mask) op_ rhs; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN( Assign, =); Vc_CONDITIONAL_ASSIGN( PlusAssign, +=); Vc_CONDITIONAL_ASSIGN( MinusAssign, -=); Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=); Vc_CONDITIONAL_ASSIGN( DivideAssign, /=); Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=); Vc_CONDITIONAL_ASSIGN( XorAssign, ^=); Vc_CONDITIONAL_ASSIGN( AndAssign, &=); Vc_CONDITIONAL_ASSIGN( OrAssign, |=); Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=); Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=); #undef Vc_CONDITIONAL_ASSIGN #define Vc_CONDITIONAL_ASSIGN(name_,expr_) \ template \ Vc_INTRINSIC enable_if> \ conditional_assign(Vector &lhs, M &&mask) \ { \ return expr_; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++); Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask)); Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--); Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask)); #undef Vc_CONDITIONAL_ASSIGN } #ifndef VC_COMMON_X86_PREFETCHES_H_ #define VC_COMMON_X86_PREFETCHES_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace Common { static constexpr int exclusive_hint = 0; template Vc_INTRINSIC void prefetchForOneRead(const void *addr) { if (std::is_same::value) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_NTA); } else { _mm_prefetch(static_cast(const_cast(addr)), static_cast(_MM_HINT_NTA | exclusive_hint)); } } template Vc_INTRINSIC void prefetchClose(const void *addr) { if (std::is_same::value) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T0); } else { _mm_prefetch(static_cast(const_cast(addr)), static_cast(_MM_HINT_T0 | exclusive_hint)); } } template Vc_INTRINSIC void prefetchMid(const void *addr) { if (std::is_same::value) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T1); } else { _mm_prefetch(static_cast(const_cast(addr)), static_cast(_MM_HINT_T1 | exclusive_hint)); } } template Vc_INTRINSIC void prefetchFar(const void *addr) { if (std::is_same::value) { _mm_prefetch(static_cast(const_cast(addr)), _MM_HINT_T2); } else { _mm_prefetch(static_cast(const_cast(addr)), static_cast(_MM_HINT_T2 | exclusive_hint)); } } namespace { template Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if::type = nullptr) { const char *addr = static_cast(addr_); prefetchClose::type>(addr + L1); prefetchMid ::type>(addr + L2); } template Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if::type = nullptr) { const char *addr = static_cast(addr_); prefetchMid ::type>(addr + L2); } template Vc_INTRINSIC void handlePrefetch(const void *addr_, typename 
std::enable_if::type = nullptr) { const char *addr = static_cast(addr_); prefetchClose::type>(addr + L1); } template Vc_INTRINSIC void handlePrefetch(const void *, typename std::enable_if::type = nullptr) { } template Vc_INTRINSIC void handleLoadPrefetches(const void * , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {} template Vc_INTRINSIC void handleLoadPrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch = nullptr) { handlePrefetch(addr); } template Vc_INTRINSIC void handleStorePrefetches(const void * , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {} template Vc_INTRINSIC void handleStorePrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch = nullptr) { handlePrefetch(addr); } } } using Common::prefetchForOneRead; using Common::prefetchClose; using Common::prefetchMid; using Common::prefetchFar; } #endif #ifndef VC_SSE_LIMITS_H_ #define VC_SSE_LIMITS_H_ namespace std { template<> struct numeric_limits< ::Vc::SSE::ushort_v> : public numeric_limits { static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v max() Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v min() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v lowest() Vc_NOEXCEPT { return min(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); } }; template<> struct numeric_limits< ::Vc::SSE::short_v> : public numeric_limits { static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v max() Vc_NOEXCEPT { return _mm_srli_epi16(::Vc::SSE::_mm_setallone_si128(), 1); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v min() Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi16(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v lowest() Vc_NOEXCEPT { return min(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); } }; template<> struct numeric_limits< ::Vc::SSE::uint_v> : public numeric_limits { static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v max() Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v min() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v lowest() Vc_NOEXCEPT { return min(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v epsilon() Vc_NOEXCEPT { return 
::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); } }; template<> struct numeric_limits< ::Vc::SSE::int_v> : public numeric_limits { static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v max() Vc_NOEXCEPT { return _mm_srli_epi32(::Vc::SSE::_mm_setallone_si128(), 1); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v min() Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi32(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v lowest() Vc_NOEXCEPT { return min(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); } }; } #endif #ifndef VC_COMMON_BITSCANINTRINSICS_H_ #define VC_COMMON_BITSCANINTRINSICS_H_ #if defined(Vc_GCC) || defined(Vc_CLANG) || defined(Vc_APPLECLANG) #include # ifndef _bit_scan_forward #define _bit_scan_forward(x) __builtin_ctz(x) static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) { int r; __asm__("bsr %1,%0" : "=r"(r) : "X"(x)); return r; } #define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x) # endif #elif defined(_WIN32) #include static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) { unsigned long index; _BitScanForward(&index, x); return index; } static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) { unsigned long index; _BitScanReverse(&index, x); return index; } #elif defined(Vc_ICC) #else #endif #endif #ifndef VC_COMMON_SET_H_ #define VC_COMMON_SET_H_ namespace Vc_VERSIONED_NAMESPACE { namespace { static Vc_INTRINSIC Vc_CONST __m128i set(unsigned short x0, unsigned short x1, unsigned short x2, unsigned short x3, unsigned short x4, unsigned short x5, unsigned short x6, unsigned short x7) { #if defined(Vc_GNU_ASM) #if 0 __m128i r; unsigned long long tmp0 = x3; tmp0 = (tmp0 << 16) | x2; unsigned long long tmp1 = x1; tmp1 = (tmp1 << 16) | x0; asm("vmovq %1,%0" : "=x"(r) : "r"((tmp0 << 32) | tmp1)); unsigned long long tmp2 = x7; tmp2 = (tmp2 << 16) | x6; unsigned long long tmp3 = x5; tmp3 = (tmp3 << 16) | x4; asm("vpinsrq $1,%1,%0,%0" : "+x"(r) : "r"((tmp2 << 32) | tmp3)); return r; #elif defined(Vc_USE_VEX_CODING) __m128i r0, r1; unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0; unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2; unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4; unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6; asm("vmovd %1,%0" : "=x"(r0) : "r"(tmp0)); asm("vpinsrd $1,%1,%0,%0" : "+x"(r0) : "r"(tmp1)); asm("vmovd %1,%0" : "=x"(r1) : "r"(tmp2)); asm("vpinsrd $1,%1,%0,%0" : "+x"(r1) : "r"(tmp3)); 
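// The VEX-coded branch above packs the eight 16-bit inputs pairwise into four
// 32-bit words (tmp0..tmp3), moves them into two XMM registers via
// vmovd/vpinsrd, and the vpunpcklqdq that follows merges the two 64-bit halves
// into the final __m128i result.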
asm("vpunpcklqdq %1,%0,%0" : "+x"(r0) : "x"(r1)); return r0; #else __m128i r0, r1; unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0; unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2; unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4; unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6; asm("movd %1,%0" : "=x"(r0) : "r"(tmp0)); asm("pinsrd $1,%1,%0" : "+x"(r0) : "r"(tmp1)); asm("movd %1,%0" : "=x"(r1) : "r"(tmp2)); asm("pinsrd $1,%1,%0" : "+x"(r1) : "r"(tmp3)); asm("punpcklqdq %1,%0" : "+x"(r0) : "x"(r1)); return r0; #endif #else unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0; unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2; unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4; unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6; return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3); #endif } static Vc_INTRINSIC Vc_CONST __m128i set(short x0, short x1, short x2, short x3, short x4, short x5, short x6, short x7) { return set(static_cast(x0), static_cast(x1), static_cast(x2), static_cast(x3), static_cast(x4), static_cast(x5), static_cast(x6), static_cast(x7)); } } } #endif #ifndef VC_COMMON_GATHERIMPLEMENTATION_H_ #define VC_COMMON_GATHERIMPLEMENTATION_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Common { enum class GatherScatterImplementation : int { SimpleLoop, SetIndexZero, BitScanLoop, PopcntSwitch }; using SimpleLoopT = std::integral_constant; using SetIndexZeroT = std::integral_constant; using BitScanLoopT = std::integral_constant; using PopcntSwitchT = std::integral_constant; template Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT, V &v, const MT *mem, IT &&indexes_, typename V::MaskArgument mask) { auto indexes = std::forward(indexes_); indexes.setZeroInverted(static_cast(mask)); const V tmp(mem, indexes); where(mask) | v = tmp; } template Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, V &v, const MT *mem, const IT &indexes, const typename V::MaskArgument mask) { if (Vc_IS_UNLIKELY(mask.isEmpty())) { return; } #if defined Vc_GCC && Vc_GCC >= 0x40900 constexpr std::size_t Sizeof = sizeof(V); using Builtin [[gnu::vector_size(Sizeof)]] = typename V::value_type; Builtin tmp = reinterpret_cast(v.data()); Common::unrolled_loop([&](std::size_t i) { if (mask[i]) { tmp[i] = mem[indexes[i]]; } }); v.data() = reinterpret_cast(tmp); #else Common::unrolled_loop([&](std::size_t i) { if (mask[i]) v[i] = mem[indexes[i]]; }); #endif } template Vc_ALWAYS_INLINE void executeGather(BitScanLoopT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask) { #ifdef Vc_GNU_ASM size_t bits = mask.toInt(); while (Vc_IS_LIKELY(bits > 0)) { size_t i, j; asm("bsf %[bits],%[i]\n\t" "bsr %[bits],%[j]\n\t" "btr %[i],%[bits]\n\t" "btr %[j],%[bits]\n\t" : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits)); v[i] = mem[indexes[i]]; v[j] = mem[indexes[j]]; } #else int bits = mask.toInt(); while (bits) { const int i = _bit_scan_forward(bits); bits &= bits - 1; v[i] = mem[indexes[i]]; } #endif } template Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt16(bits)) { case 16: v.gather(mem, indexes); break; case 15: low = _bit_scan_forward(bits); bits ^= 1 << low; v[low] = mem[indexes[low]]; case 14: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 13: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 12: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high 
= (1 << high); case 11: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 10: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 9: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 8: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 7: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 6: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 5: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 4: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 3: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 2: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; case 1: low = _bit_scan_forward(bits); v[low] = mem[indexes[low]]; case 0: break; } } template Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt8(bits)) { case 8: v.gather(mem, indexes); break; case 7: low = _bit_scan_forward(bits); bits ^= 1 << low; v[low] = mem[indexes[low]]; case 6: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 5: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 4: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; high = (1 << high); case 3: low = _bit_scan_forward(bits); bits ^= high | (1 << low); v[low] = mem[indexes[low]]; case 2: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; case 1: low = _bit_scan_forward(bits); v[low] = mem[indexes[low]]; case 0: break; } } template Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt4(bits)) { case 4: v.gather(mem, indexes); break; case 3: low = _bit_scan_forward(bits); bits ^= 1 << low; v[low] = mem[indexes[low]]; case 2: high = _bit_scan_reverse(bits); v[high] = mem[indexes[high]]; case 1: low = _bit_scan_forward(bits); v[low] = mem[indexes[low]]; case 0: break; } } template Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT, V &v, const MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low; switch (Vc::Detail::popcnt4(bits)) { case 2: v.gather(mem, indexes); break; case 1: low = _bit_scan_forward(bits); v[low] = mem[indexes[low]]; case 0: break; } } } } #endif #ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_ #define VC_COMMON_SCATTERIMPLEMENTATION_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Common { template Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT, V &v, MT *mem, IT indexes, typename V::MaskArgument mask) { indexes.setZeroInverted(static_cast(mask)); const V tmp(mem, indexes); where(mask) | v = tmp; } template Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask) { if (Vc_IS_UNLIKELY(mask.isEmpty())) { return; } Common::unrolled_loop([&](std::size_t i) { if (mask[i]) mem[indexes[i]] = v[i]; }); } template Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument 
mask) { size_t bits = mask.toInt(); while (Vc_IS_LIKELY(bits > 0)) { size_t i, j; asm("bsf %[bits],%[i]\n\t" "bsr %[bits],%[j]\n\t" "btr %[i],%[bits]\n\t" "btr %[j],%[bits]\n\t" : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits)); mem[indexes[i]] = v[i]; mem[indexes[j]] = v[j]; } } template Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt16(bits)) { case 16: v.scatter(mem, indexes); break; case 15: low = _bit_scan_forward(bits); bits ^= 1 << low; mem[indexes[low]] = v[low]; case 14: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 13: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 12: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 11: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 10: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 9: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 8: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 7: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 6: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 5: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 4: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 3: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 2: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; case 1: low = _bit_scan_forward(bits); mem[indexes[low]] = v[low]; case 0: break; } } template Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt8(bits)) { case 8: v.scatter(mem, indexes); break; case 7: low = _bit_scan_forward(bits); bits ^= 1 << low; mem[indexes[low]] = v[low]; case 6: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 5: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 4: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; high = (1 << high); case 3: low = _bit_scan_forward(bits); bits ^= high | (1 << low); mem[indexes[low]] = v[low]; case 2: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; case 1: low = _bit_scan_forward(bits); mem[indexes[low]] = v[low]; case 0: break; } } template Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low, high = 0; switch (Vc::Detail::popcnt4(bits)) { case 4: v.scatter(mem, indexes); break; case 3: low = _bit_scan_forward(bits); bits ^= 1 << low; mem[indexes[low]] = v[low]; case 2: high = _bit_scan_reverse(bits); mem[indexes[high]] = v[high]; case 1: low = _bit_scan_forward(bits); mem[indexes[low]] = v[low]; case 0: break; } } template Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT, V &v, MT *mem, const IT &indexes, typename V::MaskArgument mask, enable_if = nullarg) { unsigned int bits = mask.toInt(); unsigned int low; switch 
(Vc::Detail::popcnt4(bits)) { case 2: v.scatter(mem, indexes); break; case 1: low = _bit_scan_forward(bits); mem[indexes[low]] = v[low]; case 0: break; } } } } #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { Vc_INTRINSIC SSE::double_m operator==(SSE::double_v a, SSE::double_v b) { return _mm_cmpeq_pd(a.data(), b.data()); } Vc_INTRINSIC SSE:: float_m operator==(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpeq_ps(a.data(), b.data()); } Vc_INTRINSIC SSE:: int_m operator==(SSE:: int_v a, SSE:: int_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); } Vc_INTRINSIC SSE:: uint_m operator==(SSE:: uint_v a, SSE:: uint_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); } Vc_INTRINSIC SSE:: short_m operator==(SSE:: short_v a, SSE:: short_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); } Vc_INTRINSIC SSE::ushort_m operator==(SSE::ushort_v a, SSE::ushort_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); } Vc_INTRINSIC SSE::double_m operator!=(SSE::double_v a, SSE::double_v b) { return _mm_cmpneq_pd(a.data(), b.data()); } Vc_INTRINSIC SSE:: float_m operator!=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpneq_ps(a.data(), b.data()); } Vc_INTRINSIC SSE:: int_m operator!=(SSE:: int_v a, SSE:: int_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); } Vc_INTRINSIC SSE:: uint_m operator!=(SSE:: uint_v a, SSE:: uint_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); } Vc_INTRINSIC SSE:: short_m operator!=(SSE:: short_v a, SSE:: short_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); } Vc_INTRINSIC SSE::ushort_m operator!=(SSE::ushort_v a, SSE::ushort_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); } Vc_INTRINSIC SSE::double_m operator> (SSE::double_v a, SSE::double_v b) { return _mm_cmpgt_pd(a.data(), b.data()); } Vc_INTRINSIC SSE:: float_m operator> (SSE:: float_v a, SSE:: float_v b) { return _mm_cmpgt_ps(a.data(), b.data()); } Vc_INTRINSIC SSE:: int_m operator> (SSE:: int_v a, SSE:: int_v b) { return _mm_cmpgt_epi32(a.data(), b.data()); } Vc_INTRINSIC SSE:: uint_m operator> (SSE:: uint_v a, SSE:: uint_v b) { #ifndef USE_INCORRECT_UNSIGNED_COMPARE return SSE::cmpgt_epu32(a.data(), b.data()); #else return _mm_cmpgt_epi32(a.data(), b.data()); #endif } Vc_INTRINSIC SSE:: short_m operator> (SSE:: short_v a, SSE:: short_v b) { return _mm_cmpgt_epi16(a.data(), b.data()); } Vc_INTRINSIC SSE::ushort_m operator> (SSE::ushort_v a, SSE::ushort_v b) { #ifndef USE_INCORRECT_UNSIGNED_COMPARE return SSE::cmpgt_epu16(a.data(), b.data()); #else return _mm_cmpgt_epi16(a.data(), b.data()); #endif } Vc_INTRINSIC SSE::double_m operator< (SSE::double_v a, SSE::double_v b) { return _mm_cmplt_pd(a.data(), b.data()); } Vc_INTRINSIC SSE:: float_m operator< (SSE:: float_v a, SSE:: float_v b) { return _mm_cmplt_ps(a.data(), b.data()); } Vc_INTRINSIC SSE:: int_m operator< (SSE:: int_v a, SSE:: int_v b) { return _mm_cmplt_epi32(a.data(), b.data()); } Vc_INTRINSIC SSE:: uint_m operator< (SSE:: uint_v a, SSE:: uint_v b) { #ifndef USE_INCORRECT_UNSIGNED_COMPARE return SSE::cmplt_epu32(a.data(), b.data()); #else return _mm_cmplt_epi32(a.data(), b.data()); #endif } Vc_INTRINSIC SSE:: short_m operator< (SSE:: short_v a, SSE:: short_v b) { return _mm_cmplt_epi16(a.data(), b.data()); } Vc_INTRINSIC SSE::ushort_m operator< (SSE::ushort_v a, SSE::ushort_v b) { #ifndef USE_INCORRECT_UNSIGNED_COMPARE return SSE::cmplt_epu16(a.data(), b.data()); #else return _mm_cmplt_epi16(a.data(), b.data()); #endif } Vc_INTRINSIC SSE::double_m operator>=(SSE::double_v a, SSE::double_v b) { return 
_mm_cmpnlt_pd(a.data(), b.data()); } Vc_INTRINSIC SSE:: float_m operator>=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpnlt_ps(a.data(), b.data()); } Vc_INTRINSIC SSE:: int_m operator>=(SSE:: int_v a, SSE:: int_v b) { return !(a < b); } Vc_INTRINSIC SSE:: uint_m operator>=(SSE:: uint_v a, SSE:: uint_v b) { return !(a < b); } Vc_INTRINSIC SSE:: short_m operator>=(SSE:: short_v a, SSE:: short_v b) { return !(a < b); } Vc_INTRINSIC SSE::ushort_m operator>=(SSE::ushort_v a, SSE::ushort_v b) { return !(a < b); } Vc_INTRINSIC SSE::double_m operator<=(SSE::double_v a, SSE::double_v b) { return _mm_cmple_pd(a.data(), b.data()); } Vc_INTRINSIC SSE:: float_m operator<=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmple_ps(a.data(), b.data()); } Vc_INTRINSIC SSE:: int_m operator<=(SSE:: int_v a, SSE:: int_v b) { return !(a > b); } Vc_INTRINSIC SSE:: uint_m operator<=(SSE:: uint_v a, SSE:: uint_v b) { return !(a > b); } Vc_INTRINSIC SSE:: short_m operator<=(SSE:: short_v a, SSE:: short_v b) { return !(a > b); } Vc_INTRINSIC SSE::ushort_m operator<=(SSE::ushort_v a, SSE::ushort_v b) { return !(a > b); } template Vc_INTRINSIC SSE::Vector operator^(SSE::Vector a, SSE::Vector b) { return xor_(a.data(), b.data()); } template Vc_INTRINSIC SSE::Vector operator&(SSE::Vector a, SSE::Vector b) { return and_(a.data(), b.data()); } template Vc_INTRINSIC SSE::Vector operator|(SSE::Vector a, SSE::Vector b) { return or_(a.data(), b.data()); } template Vc_INTRINSIC SSE::Vector operator+(SSE::Vector a, SSE::Vector b) { return add(a.data(), b.data(), T()); } template Vc_INTRINSIC SSE::Vector operator-(SSE::Vector a, SSE::Vector b) { return sub(a.data(), b.data(), T()); } template Vc_INTRINSIC SSE::Vector operator*(SSE::Vector a, SSE::Vector b) { return mul(a.data(), b.data(), T()); } template Vc_INTRINSIC enable_if::value, SSE::Vector> operator/( SSE::Vector a, SSE::Vector b) { return div(a.data(), b.data(), T()); } template Vc_INTRINSIC enable_if::value || std::is_same::value, SSE::Vector> operator/(SSE::Vector a, SSE::Vector b) { return SSE::Vector::generate([&](int i) { return a[i] / b[i]; }); } template Vc_INTRINSIC enable_if::value || std::is_same::value, SSE::Vector> operator/(SSE::Vector a, SSE::Vector b) { using HT = SSE::VectorHelper; __m128 lo = _mm_cvtepi32_ps(HT::expand0(a.data())); __m128 hi = _mm_cvtepi32_ps(HT::expand1(a.data())); lo = _mm_div_ps(lo, _mm_cvtepi32_ps(HT::expand0(b.data()))); hi = _mm_div_ps(hi, _mm_cvtepi32_ps(HT::expand1(b.data()))); return HT::concat(_mm_cvttps_epi32(lo), _mm_cvttps_epi32(hi)); } template Vc_INTRINSIC enable_if::value, SSE::Vector> operator%( SSE::Vector a, SSE::Vector b) { return a - a / b * b; } } template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerZero) : d(HV::zero()) { } template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne) : d(HT::one()) { } template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerIndexesFromZero) : d(Detail::load16(Detail::IndexesFromZero(), Aligned)) { #if defined Vc_GCC && Vc_GCC < 0x40903 && defined Vc_IMPL_AVX2 if (std::is_same::value) { asm("" ::"x"(d.v())); } #endif } template <> Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerIndexesFromZero) : d(SSE::convert(SSE::int_v::IndexesFromZero().data())) { } template <> Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerIndexesFromZero) : d(SSE::convert(SSE::int_v::IndexesFromZero().data())) { } template template Vc_INTRINSIC typename Vector:: #ifndef Vc_MSVC template #endif load_concept::type Vector::load(const SrcT *mem, Flags flags) { 
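// handleLoadPrefetches only issues the software prefetches requested through
// the Flags type (it is a no-op overload when Flags carries no prefetch hint,
// see EnableIfNotPrefetch above); the actual vector load is then delegated to
// Detail::load, which dispatches on the load flags (aligned, unaligned, or
// streaming access).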
Common::handleLoadPrefetches(mem, flags); d.v() = Detail::load(mem, flags); } template Vc_INTRINSIC void Vector::setZero() { data() = HV::zero(); } template Vc_INTRINSIC void Vector::setZero(const Mask &k) { data() = Detail::andnot_(k.data(), data()); } template Vc_INTRINSIC void Vector::setZeroInverted(const Mask &k) { data() = Detail::and_(k.data(), data()); } template<> Vc_INTRINSIC void SSE::double_v::setQnan() { data() = SSE::_mm_setallone_pd(); } template<> Vc_INTRINSIC void Vector::setQnan(const Mask &k) { data() = _mm_or_pd(data(), k.dataD()); } template<> Vc_INTRINSIC void SSE::float_v::setQnan() { data() = SSE::_mm_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(const Mask &k) { data() = _mm_or_ps(data(), k.dataF()); } template template Vc_INTRINSIC void Vector::store(U *mem, Flags flags) const { Common::handleStorePrefetches(mem, flags); HV::template store(mem, data()); } template template Vc_INTRINSIC void Vector::store(U *mem, Mask mask, Flags flags) const { Common::handleStorePrefetches(mem, flags); HV::template store(mem, data(), mask.data()); } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator-() const { return Detail::negate(d.v(), std::integral_constant()); } #ifdef Vc_IMPL_XOP template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator<<(const SSE::int_v shift) const { return _mm_sha_epi32(d.v(), shift.d.v()); } template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator<<(const SSE::uint_v shift) const { return _mm_shl_epi32(d.v(), shift.d.v()); } template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator<<(const SSE::short_v shift) const { return _mm_sha_epi16(d.v(), shift.d.v()); } template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator<<(const SSE::ushort_v shift) const { return _mm_shl_epi16(d.v(), shift.d.v()); } template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator>>(const SSE::int_v shift) const { return operator<<(-shift); } template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator>>(const SSE::uint_v shift) const { return operator<<(-shift); } template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator>>(const SSE::short_v shift) const { return operator<<(-shift); } template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator>>(const SSE::ushort_v shift) const { return operator<<(-shift); } #elif defined Vc_IMPL_AVX2 template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator<<(const SSE::Vector< int> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator<<(const SSE::Vector< uint> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator>>(const SSE::Vector< int> x) const { return _mm_srav_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator>>(const SSE::Vector< uint> x) const { return _mm_srlv_epi32(d.v(), x.d.v()); } #endif template Vc_ALWAYS_INLINE Vector &Vector::operator>>=(int shift) { d.v() = HT::shiftRight(d.v(), shift); return *this; } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator>>(int shift) const { return HT::shiftRight(d.v(), shift); } template Vc_ALWAYS_INLINE Vector &Vector::operator<<=(int shift) { d.v() = HT::shiftLeft(d.v(), shift); return *this; } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator<<(int shift) const { return HT::shiftLeft(d.v(), shift); } Vc_INTRINSIC Vc_CONST SSE::float_m 
isnegative(SSE::float_v x) { return sse_cast<__m128>(_mm_srai_epi32( sse_cast<__m128i>(_mm_and_ps(SSE::_mm_setsignmask_ps(), x.data())), 31)); } Vc_INTRINSIC Vc_CONST SSE::double_m isnegative(SSE::double_v x) { return Mem::permute(sse_cast<__m128>(_mm_srai_epi32( sse_cast<__m128i>(_mm_and_pd(SSE::_mm_setsignmask_pd(), x.data())), 31))); } #define Vc_GATHER_IMPL(V_) \ template <> \ template \ inline void SSE::V_::gatherImplementation( \ const Common::GatherArguments &args) #define Vc_M(i_) static_cast(args.address[Scale * args.indexes[i_]]) Vc_GATHER_IMPL(double_v) { d.v() = _mm_setr_pd(Vc_M(0), Vc_M(1)); } Vc_GATHER_IMPL(float_v) { d.v() = _mm_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); } Vc_GATHER_IMPL(int_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); } Vc_GATHER_IMPL(uint_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); } Vc_GATHER_IMPL(short_v) { d.v() = Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7)); } Vc_GATHER_IMPL(ushort_v) { d.v() = Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7)); } #undef Vc_M #undef Vc_GATHER_IMPL template template inline void Vector::gatherImplementation( const Common::GatherArguments &args, MaskArgument mask) { const auto *mem = args.address; const auto indexes = Scale * args.indexes; using Selector = std::integral_constant < Common::GatherScatterImplementation, #ifdef Vc_USE_SET_GATHERS Traits::is_simd_vector::value ? Common::GatherScatterImplementation::SetIndexZero : #endif #ifdef Vc_USE_BSF_GATHERS Common::GatherScatterImplementation::BitScanLoop #elif defined Vc_USE_POPCNT_BSF_GATHERS Common::GatherScatterImplementation::PopcntSwitch #else Common::GatherScatterImplementation::SimpleLoop #endif > ; Common::executeGather(Selector(), *this, mem, indexes, mask); } template template inline void Vector::scatterImplementation(MT *mem, IT &&indexes) const { Common::unrolled_loop([&](std::size_t i) { mem[indexes[i]] = d.m(i); }); } template template inline void Vector::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const { using Selector = std::integral_constant < Common::GatherScatterImplementation, #ifdef Vc_USE_SET_GATHERS Traits::is_simd_vector::value ? 
Common::GatherScatterImplementation::SetIndexZero : #endif #ifdef Vc_USE_BSF_GATHERS Common::GatherScatterImplementation::BitScanLoop #elif defined Vc_USE_POPCNT_BSF_GATHERS Common::GatherScatterImplementation::PopcntSwitch #else Common::GatherScatterImplementation::SimpleLoop #endif > ; Common::executeScatter(Selector(), *this, mem, indexes, mask); } template Vc_ALWAYS_INLINE Vector Vector::partialSum() const { Vector tmp = *this; if (Size > 1) tmp += tmp.shifted(-1); if (Size > 2) tmp += tmp.shifted(-2); if (Size > 4) tmp += tmp.shifted(-4); if (Size > 8) tmp += tmp.shifted(-8); if (Size > 16) tmp += tmp.shifted(-16); return tmp; } #ifndef Vc_IMPL_SSE4_1 template<> Vc_INTRINSIC Vc_PURE int SSE::int_v::product() const { return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); } template<> Vc_INTRINSIC Vc_PURE unsigned int SSE::uint_v::product() const { return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); } #endif template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::min(MaskArg m) const { Vector tmp = std::numeric_limits >::max(); tmp(m) = *this; return tmp.min(); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::max(MaskArg m) const { Vector tmp = std::numeric_limits >::min(); tmp(m) = *this; return tmp.max(); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::product(MaskArg m) const { Vector tmp(Vc::One); tmp(m) = *this; return tmp.product(); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::sum(MaskArg m) const { Vector tmp(Vc::Zero); tmp(m) = *this; return tmp.sum(); } namespace Detail { Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v) { __m128i tmp = _mm_srli_epi32(_mm_castps_si128(v), 23); tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f)); return _mm_cvtepi32_ps(tmp); } Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v) { __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(v), 52); tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff)); return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08)); } } Vc_INTRINSIC Vc_CONST SSE::float_v exponent(SSE::float_v x) { using Detail::operator>=; Vc_ASSERT((x >= x.Zero()).isFull()); return Detail::exponent(x.data()); } Vc_INTRINSIC Vc_CONST SSE::double_v exponent(SSE::double_v x) { using Detail::operator>=; Vc_ASSERT((x >= x.Zero()).isFull()); return Detail::exponent(x.data()); } static void _doRandomStep(SSE::uint_v &state0, SSE::uint_v &state1) { using SSE::uint_v; using Detail::operator+; using Detail::operator*; state0.load(&Common::RandomState[0]); state1.load(&Common::RandomState[uint_v::Size]); (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]); uint_v(_mm_xor_si128((state0 * uint_v(0xdeece66du) + uint_v(11)).data(), _mm_srli_epi32(state1.data(), 16))) .store(&Common::RandomState[0]); } template Vc_ALWAYS_INLINE Vector Vector::Random() { SSE::uint_v state0, state1; _doRandomStep(state0, state1); return state0.data(); } template<> Vc_ALWAYS_INLINE SSE::float_v SSE::float_v::Random() { SSE::uint_v state0, state1; _doRandomStep(state0, state1); return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE SSE::double_v SSE::double_v::Random() { typedef unsigned long long uint64 Vc_MAY_ALIAS; uint64 state0 = *reinterpret_cast(&Common::RandomState[8]); uint64 state1 = *reinterpret_cast(&Common::RandomState[10]); const __m128i state = _mm_load_si128(reinterpret_cast(&Common::RandomState[8])); *reinterpret_cast(&Common::RandomState[ 8]) = (state0 * 0x5deece66dull + 11); 
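// 0x5deece66d and 11 are the multiplier and increment of the classic 48-bit
// linear congruential generator (the same constants drand48 uses). The two
// 64-bit states live at RandomState[8] and RandomState[10]; the return below
// shifts each updated state into the 52-bit mantissa of 1.0 and subtracts 1.0,
// yielding doubles in the interval [0, 1).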
*reinterpret_cast(&Common::RandomState[10]) = (state1 * 0x5deece66dull + 11); return _mm_sub_pd(_mm_or_pd(_mm_castsi128_pd(_mm_srli_epi64(state, 12)), HT::one()), HT::one()); } template Vc_INTRINSIC Vc_PURE Vector Vector::shifted(int amount) const { enum { EntryTypeSizeof = sizeof(EntryType) }; switch (amount) { case 0: return *this; case 1: return SSE::sse_cast(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof)); case 2: return SSE::sse_cast(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof)); case 3: return SSE::sse_cast(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof)); case 4: return SSE::sse_cast(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof)); case 5: return SSE::sse_cast(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof)); case 6: return SSE::sse_cast(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof)); case 7: return SSE::sse_cast(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof)); case 8: return SSE::sse_cast(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof)); case -1: return SSE::sse_cast(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof)); case -2: return SSE::sse_cast(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof)); case -3: return SSE::sse_cast(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof)); case -4: return SSE::sse_cast(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof)); case -5: return SSE::sse_cast(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof)); case -6: return SSE::sse_cast(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof)); case -7: return SSE::sse_cast(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof)); case -8: return SSE::sse_cast(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof)); } return Zero(); } template Vc_INTRINSIC Vector Vector::shifted(int amount, Vector shiftIn) const { if (amount >= -int(size())) { constexpr int VectorWidth = int(size()); constexpr int EntryTypeSizeof = sizeof(EntryType); const __m128i v0 = sse_cast<__m128i>(d.v()); const __m128i v1 = sse_cast<__m128i>(shiftIn.d.v()); auto &&fixup = sse_cast; switch (amount) { case 0: return *this; case -1: return fixup(SSE::alignr_epi8<(VectorWidth - 1) * EntryTypeSizeof>(v0, v1)); case -2: return fixup(SSE::alignr_epi8<(VectorWidth - 2) * EntryTypeSizeof>(v0, v1)); case -3: return fixup(SSE::alignr_epi8<(VectorWidth - 3) * EntryTypeSizeof>(v0, v1)); case -4: return fixup(SSE::alignr_epi8<(VectorWidth - 4) * EntryTypeSizeof>(v0, v1)); case -5: return fixup(SSE::alignr_epi8<(VectorWidth - 5) * EntryTypeSizeof>(v0, v1)); case -6: return fixup(SSE::alignr_epi8<(VectorWidth - 6) * EntryTypeSizeof>(v0, v1)); case -7: return fixup(SSE::alignr_epi8<(VectorWidth - 7) * EntryTypeSizeof>(v0, v1)); case -8: return fixup(SSE::alignr_epi8<(VectorWidth - 8) * EntryTypeSizeof>(v0, v1)); case -9: return fixup(SSE::alignr_epi8<(VectorWidth - 9) * EntryTypeSizeof>(v0, v1)); case-10: return fixup(SSE::alignr_epi8<(VectorWidth -10) * EntryTypeSizeof>(v0, v1)); case-11: return fixup(SSE::alignr_epi8<(VectorWidth -11) * EntryTypeSizeof>(v0, v1)); case-12: return fixup(SSE::alignr_epi8<(VectorWidth -12) * EntryTypeSizeof>(v0, v1)); case-13: return fixup(SSE::alignr_epi8<(VectorWidth -13) * EntryTypeSizeof>(v0, v1)); case-14: return fixup(SSE::alignr_epi8<(VectorWidth -14) * EntryTypeSizeof>(v0, v1)); case-15: return fixup(SSE::alignr_epi8<(VectorWidth 
-15) * EntryTypeSizeof>(v0, v1)); case 1: return fixup(SSE::alignr_epi8< 1 * EntryTypeSizeof>(v1, v0)); case 2: return fixup(SSE::alignr_epi8< 2 * EntryTypeSizeof>(v1, v0)); case 3: return fixup(SSE::alignr_epi8< 3 * EntryTypeSizeof>(v1, v0)); case 4: return fixup(SSE::alignr_epi8< 4 * EntryTypeSizeof>(v1, v0)); case 5: return fixup(SSE::alignr_epi8< 5 * EntryTypeSizeof>(v1, v0)); case 6: return fixup(SSE::alignr_epi8< 6 * EntryTypeSizeof>(v1, v0)); case 7: return fixup(SSE::alignr_epi8< 7 * EntryTypeSizeof>(v1, v0)); case 8: return fixup(SSE::alignr_epi8< 8 * EntryTypeSizeof>(v1, v0)); case 9: return fixup(SSE::alignr_epi8< 9 * EntryTypeSizeof>(v1, v0)); case 10: return fixup(SSE::alignr_epi8<10 * EntryTypeSizeof>(v1, v0)); case 11: return fixup(SSE::alignr_epi8<11 * EntryTypeSizeof>(v1, v0)); case 12: return fixup(SSE::alignr_epi8<12 * EntryTypeSizeof>(v1, v0)); case 13: return fixup(SSE::alignr_epi8<13 * EntryTypeSizeof>(v1, v0)); case 14: return fixup(SSE::alignr_epi8<14 * EntryTypeSizeof>(v1, v0)); case 15: return fixup(SSE::alignr_epi8<15 * EntryTypeSizeof>(v1, v0)); } } return shiftIn.shifted(int(size()) + amount); } template Vc_INTRINSIC Vc_PURE Vector Vector::rotated(int amount) const { enum { EntryTypeSizeof = sizeof(EntryType) }; const __m128i v = SSE::sse_cast<__m128i>(d.v()); switch (static_cast(amount) % Size) { case 0: return *this; case 1: return SSE::sse_cast(SSE::alignr_epi8<1 * EntryTypeSizeof>(v, v)); case 2: return SSE::sse_cast(SSE::alignr_epi8<2 * EntryTypeSizeof>(v, v)); case 3: return SSE::sse_cast(SSE::alignr_epi8<3 * EntryTypeSizeof>(v, v)); case 4: return SSE::sse_cast(SSE::alignr_epi8<4 * EntryTypeSizeof>(v, v)); case 5: return SSE::sse_cast(SSE::alignr_epi8<5 * EntryTypeSizeof>(v, v)); case 6: return SSE::sse_cast(SSE::alignr_epi8<6 * EntryTypeSizeof>(v, v)); case 7: return SSE::sse_cast(SSE::alignr_epi8<7 * EntryTypeSizeof>(v, v)); } return Zero(); } namespace Detail { inline Vc_CONST SSE::double_v sorted(SSE::double_v x_) { const __m128d x = x_.data(); const __m128d y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1)); return _mm_unpacklo_pd(_mm_min_sd(x, y), _mm_max_sd(x, y)); } } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::sorted() const { return Detail::sorted(*this); } template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveLow (SSE::double_v x) const { return _mm_unpacklo_pd(data(), x.data()); } template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveHigh(SSE::double_v x) const { return _mm_unpackhi_pd(data(), x.data()); } template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveLow ( SSE::float_v x) const { return _mm_unpacklo_ps(data(), x.data()); } template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveHigh( SSE::float_v x) const { return _mm_unpackhi_ps(data(), x.data()); } template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveLow ( SSE::int_v x) const { return _mm_unpacklo_epi32(data(), x.data()); } template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveHigh( SSE::int_v x) const { return _mm_unpackhi_epi32(data(), x.data()); } template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveLow ( SSE::uint_v x) const { return _mm_unpacklo_epi32(data(), x.data()); } template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveHigh( SSE::uint_v x) const { return _mm_unpackhi_epi32(data(), x.data()); } template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveLow ( SSE::short_v x) const { return _mm_unpacklo_epi16(data(), x.data()); } template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveHigh( SSE::short_v x) 
const { return _mm_unpackhi_epi16(data(), x.data()); } template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveLow (SSE::ushort_v x) const { return _mm_unpacklo_epi16(data(), x.data()); } template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveHigh(SSE::ushort_v x) const { return _mm_unpackhi_epi16(data(), x.data()); } template <> template Vc_INTRINSIC SSE::double_v SSE::double_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); return _mm_setr_pd(tmp0, tmp1); } template <> template Vc_INTRINSIC SSE::float_v SSE::float_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); return _mm_setr_ps(tmp0, tmp1, tmp2, tmp3); } template <> template Vc_INTRINSIC SSE::int_v SSE::int_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3); } template <> template Vc_INTRINSIC SSE::uint_v SSE::uint_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3); } template <> template Vc_INTRINSIC SSE::short_v SSE::short_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); } template <> template Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); } template <> Vc_INTRINSIC Vc_PURE SSE::double_v SSE::double_v::reversed() const { return Mem::permute(d.v()); } template <> Vc_INTRINSIC Vc_PURE SSE::float_v SSE::float_v::reversed() const { return Mem::permute(d.v()); } template <> Vc_INTRINSIC Vc_PURE SSE::int_v SSE::int_v::reversed() const { return Mem::permute(d.v()); } template <> Vc_INTRINSIC Vc_PURE SSE::uint_v SSE::uint_v::reversed() const { return Mem::permute(d.v()); } template <> Vc_INTRINSIC Vc_PURE SSE::short_v SSE::short_v::reversed() const { return sse_cast<__m128i>( Mem::shuffle(sse_cast<__m128d>(Mem::permuteHi(d.v())), sse_cast<__m128d>(Mem::permuteLo(d.v())))); } template <> Vc_INTRINSIC Vc_PURE SSE::ushort_v SSE::ushort_v::reversed() const { return sse_cast<__m128i>( Mem::shuffle(sse_cast<__m128d>(Mem::permuteHi(d.v())), sse_cast<__m128d>(Mem::permuteLo(d.v())))); } template <> Vc_INTRINSIC SSE::float_v SSE::float_v::operator[](const SSE::int_v & #ifdef Vc_IMPL_AVX perm #endif ) const { #ifdef Vc_IMPL_AVX return _mm_permutevar_ps(d.v(), perm.data()); #else return *this; #endif } template <> template Vc_INTRINSIC SSE::float_v SSE::float_v::broadcast() const { constexpr VecPos Inner = static_cast(Index & 0x3); return Mem::permute(d.v()); } template <> template Vc_INTRINSIC SSE::double_v SSE::double_v::broadcast() const { constexpr VecPos Inner = static_cast(Index & 0x1); return Mem::permute(d.v()); } namespace Common { Vc_ALWAYS_INLINE void transpose_impl( TransposeTag<4, 4>, SSE::float_v *Vc_RESTRICT r[], const TransposeProxy &proxy) { const auto in0 = std::get<0>(proxy.in).data(); const auto in1 = std::get<1>(proxy.in).data(); const auto in2 = 
std::get<2>(proxy.in).data(); const auto in3 = std::get<3>(proxy.in).data(); const auto tmp0 = _mm_unpacklo_ps(in0, in2); const auto tmp1 = _mm_unpacklo_ps(in1, in3); const auto tmp2 = _mm_unpackhi_ps(in0, in2); const auto tmp3 = _mm_unpackhi_ps(in1, in3); *r[0] = _mm_unpacklo_ps(tmp0, tmp1); *r[1] = _mm_unpackhi_ps(tmp0, tmp1); *r[2] = _mm_unpacklo_ps(tmp2, tmp3); *r[3] = _mm_unpackhi_ps(tmp2, tmp3); } } } #ifndef VC_SSE_SIMD_CAST_H_ #define VC_SSE_SIMD_CAST_H_ #ifdef Vc_IMPL_AVX #ifndef VC_AVX_CASTS_H_ #define VC_AVX_CASTS_H_ #ifndef VC_AVX_SHUFFLE_H_ #define VC_AVX_SHUFFLE_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template struct Permutation {}; template struct Mask {}; #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST __m256i blend(__m256i a, __m256i b, Mask) { static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) && (Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) && (Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) && (Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) && (Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) && (Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) && (Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) && (Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1), "Selectors must be 0 or 1 to select the value from a or b"); constexpr uint8_t mask = static_cast( (Sel0 << 0 ) | (Sel1 << 1 ) | (Sel2 << 2 ) | (Sel3 << 3 ) | (Sel4 << 4 ) | (Sel5 << 5 ) | (Sel6 << 6 ) | (Sel7 << 7 ) | (Sel8 << 8 ) | (Sel9 << 9 ) | (Sel10 << 10) | (Sel11 << 11) | (Sel12 << 12) | (Sel13 << 13) | (Sel14 << 14) | (Sel15 << 15)); return _mm256_blend_epi16(a, b, mask); } #endif } namespace Mem { #ifdef Vc_IMPL_AVX2 template static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) { static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range"); static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range"); return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64); } #endif template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) { static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range"); static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range"); return _mm256_permute2f128_ps( x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); } template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) { static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range"); static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range"); return _mm256_permute2f128_pd( x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) { static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range"); static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range"); #ifdef Vc_IMPL_AVX2 return _mm256_permute2x128_si256( x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4))); #else return _mm256_permute2f128_si256( x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 
0x80 : H * (1 << 4))); #endif } template static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); #ifdef Vc_IMPL_AVX2 return _mm256_permute2x128_si256( x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); #else return _mm256_permute2f128_si256( x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); #endif } template static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) { return _mm256_castps_si256(permute(_mm256_castsi256_ps(x))); } #ifdef Vc_IMPL_AVX2 template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } #endif template static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) { static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) { static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range"); static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range"); static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range"); static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range"); static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range"); static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range"); static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range"); static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range"); return _mm256_blend_ps(x, y, (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) 
* 8 + (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 + (Dst7 / Y7) *128 ); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) { return _mm256_castps_si256(blend(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y))); } template struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; }; template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) { static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range"); static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range"); static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range"); static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range"); static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range"); static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range"); static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range"); static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range"); if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) { return permute(x); } const __m128 loIn = _mm256_castps256_ps128(x); const __m128 hiIn = _mm256_extractf128_ps(x, 1); __m128 lo, hi; if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) { lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) { lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) { lo = shuffle(loIn, hiIn); } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) { lo = shuffle(hiIn, loIn); } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) { lo = _mm_unpacklo_ps(loIn, hiIn); } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) { lo = _mm_unpacklo_ps(hiIn, loIn); } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) { lo = _mm_unpackhi_ps(loIn, hiIn); } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) { lo = _mm_unpackhi_ps(hiIn, loIn); } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) { lo = blend::Value, ScaleForBlend::Value, ScaleForBlend::Value, ScaleForBlend::Value>(loIn, hiIn); } if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) { hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) { hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64); } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) { hi = shuffle(loIn, hiIn); } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) { hi = shuffle(hiIn, loIn); } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) { hi = _mm_unpacklo_ps(loIn, hiIn); } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) { hi = _mm_unpacklo_ps(hiIn, loIn); } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) { hi = _mm_unpackhi_ps(loIn, hiIn); } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) { hi = _mm_unpackhi_ps(hiIn, loIn); } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) { hi = blend::Value, ScaleForBlend::Value, ScaleForBlend::Value, ScaleForBlend::Value>(loIn, hiIn); } return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); } } } namespace Vc_VERSIONED_NAMESPACE { namespace Reg { template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return 
_mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); #ifdef Vc_IMPL_AVX2 return _mm256_permute2x128_si256( x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); #else return _mm256_permute2f128_si256( x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); #endif } template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) { static_assert(L >= X0 && H >= X0, "Incorrect_Range"); static_assert(L <= Y1 && H <= Y1, "Incorrect_Range"); return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4)); } template static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) { static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range"); return _mm_permute_pd(x, Dst0 + Dst1 * 2); } template static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range"); return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); } template static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) { static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range"); static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8); } template static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) { static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range"); static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range"); return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); } } } #endif namespace Vc_VERSIONED_NAMESPACE { namespace AVX { namespace Casts { template Vc_INTRINSIC_L T avx_cast(__m128 v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m256 v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R; template Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R; template<> Vc_INTRINSIC __m128 avx_cast(__m128 v) { return v; } template<> Vc_INTRINSIC __m128 avx_cast(__m128i v) { return _mm_castsi128_ps(v); } template<> Vc_INTRINSIC __m128 avx_cast(__m128d v) { return _mm_castpd_ps(v); } template<> Vc_INTRINSIC __m128i avx_cast(__m128 v) { return _mm_castps_si128(v); } template<> Vc_INTRINSIC __m128i avx_cast(__m128i v) { return 
v; } template<> Vc_INTRINSIC __m128i avx_cast(__m128d v) { return _mm_castpd_si128(v); } template<> Vc_INTRINSIC __m128d avx_cast(__m128 v) { return _mm_castps_pd(v); } template<> Vc_INTRINSIC __m128d avx_cast(__m128i v) { return _mm_castsi128_pd(v); } template<> Vc_INTRINSIC __m128d avx_cast(__m128d v) { return v; } template<> Vc_INTRINSIC __m256 avx_cast(__m128 v) { return _mm256_castps128_ps256(v); } template<> Vc_INTRINSIC __m256 avx_cast(__m128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); } template<> Vc_INTRINSIC __m256 avx_cast(__m128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); } template<> Vc_INTRINSIC __m256i avx_cast(__m128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); } template<> Vc_INTRINSIC __m256i avx_cast(__m128i v) { return _mm256_castsi128_si256(v); } template<> Vc_INTRINSIC __m256i avx_cast(__m128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); } template<> Vc_INTRINSIC __m256d avx_cast(__m128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); } template<> Vc_INTRINSIC __m256d avx_cast(__m128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); } template<> Vc_INTRINSIC __m256d avx_cast(__m128d v) { return _mm256_castpd128_pd256(v); } #if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); } static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); } static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); } #else static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); } static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); } static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); } #endif template<> Vc_INTRINSIC __m128 avx_cast(__m256 v) { return _mm256_castps256_ps128(v); } template<> Vc_INTRINSIC __m128 avx_cast(__m256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); } template<> Vc_INTRINSIC __m128 avx_cast(__m256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); } template<> Vc_INTRINSIC __m128i avx_cast(__m256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); } template<> Vc_INTRINSIC __m128i avx_cast(__m256i v) { return _mm256_castsi256_si128(v); } template<> Vc_INTRINSIC __m128i avx_cast(__m256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); } template<> Vc_INTRINSIC __m128d avx_cast(__m256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); } template<> Vc_INTRINSIC __m128d avx_cast(__m256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); } template<> Vc_INTRINSIC __m128d avx_cast(__m256d v) { return _mm256_castpd256_pd128(v); } template<> Vc_INTRINSIC __m256 avx_cast(__m256 v) { return v; } template<> Vc_INTRINSIC __m256 avx_cast(__m256i v) { return _mm256_castsi256_ps(v); } template<> Vc_INTRINSIC __m256 avx_cast(__m256d v) { return _mm256_castpd_ps(v); } template<> Vc_INTRINSIC __m256i avx_cast(__m256 v) { return _mm256_castps_si256(v); } template<> Vc_INTRINSIC __m256i avx_cast(__m256i v) { return v; } template<> Vc_INTRINSIC __m256i avx_cast(__m256d v) { return _mm256_castpd_si256(v); } template<> Vc_INTRINSIC __m256d avx_cast(__m256 v) { return _mm256_castps_pd(v); } template<> 
Vc_INTRINSIC __m256d avx_cast(__m256i v) { return _mm256_castsi256_pd(v); } template<> Vc_INTRINSIC __m256d avx_cast(__m256d v) { return v; }
Vc_INTRINSIC Vc_CONST __m128  lo128(__m256  v) { return avx_cast<__m128>(v); }
Vc_INTRINSIC Vc_CONST __m128d lo128(__m256d v) { return avx_cast<__m128d>(v); }
Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v) { return avx_cast<__m128i>(v); }
Vc_INTRINSIC Vc_CONST __m128  hi128(__m256  v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m256  concat(__m128  a, __m128  b) { return insert128<1>(avx_cast<__m256 >(a), b); }
Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b) { return insert128<1>(avx_cast<__m256d>(a), b); }
Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b) { return insert128<1>(avx_cast<__m256i>(a), b); }
}  // namespace Casts
using namespace Casts;
}  // namespace AVX
namespace AVX2 { using namespace AVX::Casts; }
namespace AVX {
template <typename From, typename To> struct ConvertTag {};
Vc_INTRINSIC __m256i convert(__m256  v, ConvertTag<float , int>) { return _mm256_cvttps_epi32(v); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, int>) { return _mm256_cvttpd_epi32(v); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int   , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint  , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , int>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepi16_epi32(v);
#else
    return AVX::srai_epi32<16>( concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, int>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepu16_epi32(v);
#else
    return AVX::srli_epi32<16>( concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m256  v, ConvertTag<float , uint>) { using namespace AVX; return _mm256_castps_si256(_mm256_blendv_ps( _mm256_castsi256_ps(_mm256_cvttps_epi32(v)), _mm256_castsi256_ps(add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, set2power31_ps())), set2power31_epu32())), cmpge_ps(v, set2power31_ps()))); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, uint>) { using namespace AVX; return _mm_xor_si128( _mm256_cvttpd_epi32(_mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u))), _mm_set2power31_epu32()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int   , uint>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint  , uint>) { return v; }
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , uint>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepi16_epi32(v);
#else
    return AVX::srai_epi32<16>( concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, uint>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepu16_epi32(v);
#else
    return AVX::srli_epi32<16>( concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256  convert(__m256  v, ConvertTag<float , float>) { return v; }
Vc_INTRINSIC __m128  convert(__m256d v, ConvertTag<double, float>) { return _mm256_cvtpd_ps(v); }
Vc_INTRINSIC __m256  convert(__m256i v, ConvertTag<int   , float>) { return _mm256_cvtepi32_ps(v); }
Vc_INTRINSIC __m256  convert(__m256i v, ConvertTag<uint  , float>) { using namespace AVX; return _mm256_blendv_ps( _mm256_cvtepi32_ps(v), _mm256_add_ps(_mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00))), _mm256_add_ps(set2power31_ps(), _mm256_cvtepi32_ps(and_si256( v, set1_epi32(0x000001ff))))), _mm256_castsi256_ps(cmplt_epi32(v, _mm256_setzero_si256()))); }
Vc_INTRINSIC __m256  convert(__m128i v, ConvertTag< short, float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag< short, int>())); }
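// The ConvertTag<From, To> argument is a tag-dispatch key: callers request an element-type
// conversion by passing a default-constructed tag, and overload resolution picks the matching
// intrinsic sequence. Illustrative sketch (hypothetical caller, not part of this header):
//     __m256i i32 = convert(_mm256_set1_ps(1.5f), ConvertTag<float, int>());  // truncating cvt
// On AVX targets without AVX2, the 16-bit widening overloads fall back to SSE unpack + shift.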
Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag) { return _mm256_cvtepi32_ps(convert(v, ConvertTag())); } Vc_INTRINSIC __m256d convert(__m128 v, ConvertTag) { return _mm256_cvtps_pd(v); } Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag) { return v; } Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { return _mm256_cvtepi32_pd(v); } Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { using namespace AVX; return _mm256_add_pd( _mm256_cvtepi32_pd(_mm_xor_si128(v, _mm_setmin_epi32())), set1_pd(1u << 31)); } Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { return convert(convert(v, SSE::ConvertTag< short, int>()), ConvertTag()); } Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag) { return convert(convert(v, SSE::ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { #ifdef Vc_IMPL_AVX2 auto a = _mm256_shuffle_epi8( v, _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)); return lo128(_mm256_permute4x64_epi64(a, 0xf8)); #else const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); #endif } Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { return convert(v, ConvertTag()); } Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { return convert(convert(v, ConvertTag()), SSE::ConvertTag()); } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); } Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag) { auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v)); auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v)); auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); } Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag) { return convert(convert(v, ConvertTag()), ConvertTag()); } Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag) { return convert(convert(v, ConvertTag()), SSE::ConvertTag()); } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag) { return v; } template Vc_INTRINSIC auto convert( typename std::conditional<(sizeof(From) < sizeof(To)), typename SSE::VectorTraits::VectorType, typename AVX::VectorTypeHelper::Type>::type v) -> decltype(convert(v, ConvertTag())) { return convert(v, ConvertTag()); } template > Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper::Type v) -> decltype(convert(lo128(v), ConvertTag())) { return convert(lo128(v), ConvertTag()); } } } #endif #endif #ifndef VC_SSE_VECTOR_H_ #error "Vc/sse/vector.h needs to be included before Vc/sse/simd_cast.h" #endif namespace Vc_VERSIONED_NAMESPACE { namespace SSE { #define Vc_SIMD_CAST_1(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x, enable_if::value> = nullarg) #define Vc_SIMD_CAST_2(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To 
simd_cast( \ from_ x0, from_ x1, enable_if::value> = nullarg) #define Vc_SIMD_CAST_4(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_8(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7, \ enable_if::value> = nullarg) Vc_SIMD_CAST_1( float_v, int_v); Vc_SIMD_CAST_1(double_v, int_v); Vc_SIMD_CAST_1( uint_v, int_v); Vc_SIMD_CAST_1( short_v, int_v); Vc_SIMD_CAST_1(ushort_v, int_v); Vc_SIMD_CAST_1( float_v, uint_v); Vc_SIMD_CAST_1(double_v, uint_v); Vc_SIMD_CAST_1( int_v, uint_v); Vc_SIMD_CAST_1( short_v, uint_v); Vc_SIMD_CAST_1(ushort_v, uint_v); Vc_SIMD_CAST_1(double_v, float_v); Vc_SIMD_CAST_1( int_v, float_v); Vc_SIMD_CAST_1( uint_v, float_v); Vc_SIMD_CAST_1( short_v, float_v); Vc_SIMD_CAST_1(ushort_v, float_v); Vc_SIMD_CAST_1( float_v, double_v); Vc_SIMD_CAST_1( int_v, double_v); Vc_SIMD_CAST_1( uint_v, double_v); Vc_SIMD_CAST_1( short_v, double_v); Vc_SIMD_CAST_1(ushort_v, double_v); Vc_SIMD_CAST_1( int_v, short_v); Vc_SIMD_CAST_1( uint_v, short_v); Vc_SIMD_CAST_1( float_v, short_v); Vc_SIMD_CAST_1(double_v, short_v); Vc_SIMD_CAST_1(ushort_v, short_v); Vc_SIMD_CAST_1( int_v, ushort_v); Vc_SIMD_CAST_1( uint_v, ushort_v); Vc_SIMD_CAST_1( float_v, ushort_v); Vc_SIMD_CAST_1(double_v, ushort_v); Vc_SIMD_CAST_1( short_v, ushort_v); Vc_SIMD_CAST_2(double_v, int_v); Vc_SIMD_CAST_2(double_v, uint_v); Vc_SIMD_CAST_2(double_v, float_v); Vc_SIMD_CAST_2( int_v, short_v); Vc_SIMD_CAST_2( uint_v, short_v); Vc_SIMD_CAST_2( float_v, short_v); Vc_SIMD_CAST_2(double_v, short_v); Vc_SIMD_CAST_2( int_v, ushort_v); Vc_SIMD_CAST_2( uint_v, ushort_v); Vc_SIMD_CAST_2( float_v, ushort_v); Vc_SIMD_CAST_2(double_v, ushort_v); #define Vc_CAST_(To_) \ template \ Vc_INTRINSIC Vc_CONST enable_if::value, Return> Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c); Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c); Vc_SIMD_CAST_4(double_v, short_v); Vc_SIMD_CAST_4(double_v, ushort_v); } using SSE::simd_cast; template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x3, enable_if::value> = nullarg); template 
Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST To simd_cast(SSE::Vector x, enable_if::value> = nullarg); #undef Vc_SIMD_CAST_1 #undef Vc_SIMD_CAST_2 #undef Vc_SIMD_CAST_4 #undef Vc_SIMD_CAST_8 #define Vc_SIMD_CAST_1(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if::value>) #define Vc_SIMD_CAST_2(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, \ enable_if::value>) #define Vc_SIMD_CAST_4(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, \ enable_if::value>) #define Vc_SIMD_CAST_8(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ 
x0, from_ x1, from_ x2, from_ x3, from_ x4, \ from_ x5, from_ x6, from_ x7, \ enable_if::value>) namespace SSE { Vc_INTRINSIC __m128i convert_int32_to_int16(__m128i a, __m128i b) { auto tmp0 = _mm_unpacklo_epi16(a, b); auto tmp1 = _mm_unpackhi_epi16(a, b); auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); return _mm_unpacklo_epi16(tmp2, tmp3); } Vc_SIMD_CAST_1( float_v, int_v) { return convert< float, int>(x.data()); } Vc_SIMD_CAST_1(double_v, int_v) { return convert(x.data()); } Vc_SIMD_CAST_1( uint_v, int_v) { return convert< uint, int>(x.data()); } Vc_SIMD_CAST_1( short_v, int_v) { return convert< short, int>(x.data()); } Vc_SIMD_CAST_1(ushort_v, int_v) { return convert(x.data()); } Vc_SIMD_CAST_1( float_v, uint_v) { return convert< float, uint>(x.data()); } Vc_SIMD_CAST_1(double_v, uint_v) { return convert(x.data()); } Vc_SIMD_CAST_1( int_v, uint_v) { return convert< int, uint>(x.data()); } Vc_SIMD_CAST_1( short_v, uint_v) { return convert< short, uint>(x.data()); } Vc_SIMD_CAST_1(ushort_v, uint_v) { return convert(x.data()); } Vc_SIMD_CAST_1(double_v, float_v) { return convert(x.data()); } Vc_SIMD_CAST_1( int_v, float_v) { return convert< int, float>(x.data()); } Vc_SIMD_CAST_1( uint_v, float_v) { return convert< uint, float>(x.data()); } Vc_SIMD_CAST_1( short_v, float_v) { return convert< short, float>(x.data()); } Vc_SIMD_CAST_1(ushort_v, float_v) { return convert(x.data()); } Vc_SIMD_CAST_1( float_v, double_v) { return convert< float, double>(x.data()); } Vc_SIMD_CAST_1( int_v, double_v) { return convert< int, double>(x.data()); } Vc_SIMD_CAST_1( uint_v, double_v) { return convert< uint, double>(x.data()); } Vc_SIMD_CAST_1( short_v, double_v) { return convert< short, double>(x.data()); } Vc_SIMD_CAST_1(ushort_v, double_v) { return convert(x.data()); } Vc_SIMD_CAST_1( int_v, short_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); } Vc_SIMD_CAST_1( uint_v, short_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); } Vc_SIMD_CAST_1( float_v, short_v) { return _mm_packs_epi32(simd_cast(x).data(), _mm_setzero_si128()); } Vc_SIMD_CAST_1(double_v, short_v) { return _mm_packs_epi32(simd_cast(x).data(), _mm_setzero_si128()); } Vc_SIMD_CAST_1(ushort_v, short_v) { return x.data(); } Vc_SIMD_CAST_1( int_v, ushort_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); } Vc_SIMD_CAST_1( uint_v, ushort_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); } Vc_SIMD_CAST_1( float_v, ushort_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(double_v, ushort_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1( short_v, ushort_v) { return x.data(); } Vc_SIMD_CAST_2(double_v, int_v) { #ifdef Vc_IMPL_AVX return AVX::convert(AVX::concat(x0.data(), x1.data())); #else return _mm_unpacklo_epi64(convert(x0.data()), convert(x1.data())); #endif } Vc_SIMD_CAST_2(double_v, uint_v) { #ifdef Vc_IMPL_AVX return AVX::convert(AVX::concat(x0.data(), x1.data())); #else return _mm_unpacklo_epi64(convert(x0.data()), convert(x1.data())); #endif } Vc_SIMD_CAST_2(double_v, float_v) { #ifdef Vc_IMPL_AVX return _mm256_cvtpd_ps(AVX::concat(x0.data(), x1.data())); #else return _mm_movelh_ps(_mm_cvtpd_ps(x0.data()), _mm_cvtpd_ps(x1.data())); #endif } Vc_SIMD_CAST_2( int_v, short_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); } Vc_SIMD_CAST_2( uint_v, short_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); } Vc_SIMD_CAST_2( float_v, short_v) { return 
_mm_packs_epi32(simd_cast(x0).data(), simd_cast(x1).data()); } Vc_SIMD_CAST_2(double_v, short_v) { return _mm_packs_epi32(simd_cast(x0, x1).data(), _mm_setzero_si128()); } Vc_SIMD_CAST_2( int_v, ushort_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); } Vc_SIMD_CAST_2( uint_v, ushort_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); } Vc_SIMD_CAST_2( float_v, ushort_v) { return simd_cast(simd_cast(x0), simd_cast(x1)); } Vc_SIMD_CAST_2(double_v, ushort_v) { return simd_cast(simd_cast(x0, x1)); } Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c) { return simd_cast(simd_cast(a, b), simd_cast(c)); } Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c) { return simd_cast(simd_cast(a, b), simd_cast(c)); } #undef Vc_CAST_ Vc_SIMD_CAST_4(double_v, short_v) { return _mm_packs_epi32(simd_cast(x0, x1).data(), simd_cast(x2, x3).data()); } Vc_SIMD_CAST_4(double_v, ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> ) { return _mm_setr_pd(x.data(), 0.); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> ) { return _mm_setr_ps(x.data(), 0.f, 0.f, 0.f); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> ) { return _mm_setr_epi32(x.data(), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> ) { return _mm_setr_epi32(uint(x.data()), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> ) { return _mm_setr_epi16( x.data(), 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> ) { return _mm_setr_epi16( x.data(), 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> ) { return _mm_setr_pd(x0.data(), x1.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> ) { return _mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> ) { return _mm_setr_epi32(x0.data(), x1.data(), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> ) { return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> ) { return _mm_setr_epi16( x0.data(), x1.data(), 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> ) { return _mm_setr_epi16( x0.data(), x1.data(), 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm_setr_ps(x0.data(), x1.data(), x2.data(), 0.f); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm_setr_epi32(x0.data(), x1.data(), x2.data(), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm_setr_epi16(x0.data(), x1.data(), 
x2.data(), 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> ) { return _mm_setr_ps( x0.data(), x1.data(), x2.data(), x3.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> ) { return _mm_setr_epi32( x0.data(), x1.data(), x2.data(), x3.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> ) { return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data())); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> ) { return _mm_setr_epi16( x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> ) { return _mm_setr_epi16( x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> ) { return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, 
Scalar::Vector x7, enable_if::value> ) { return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data()); } template Vc_INTRINSIC Vc_CONST To simd_cast(SSE::Vector x, enable_if::value> ) { return static_cast(x[0]); } template Vc_INTRINSIC Vc_CONST Return simd_cast(SSE::Mask x, enable_if::value> = nullarg) { using M = SSE::Mask; return {Detail::mask_cast(x.dataI())}; } template Vc_INTRINSIC Vc_CONST Return simd_cast( SSE::Mask x0, SSE::Mask x1, enable_if::value && Mask::Size * 2 == Return::Size> = nullarg) { return SSE::sse_cast<__m128>(_mm_packs_epi16(x0.dataI(), x1.dataI())); } template Vc_INTRINSIC Vc_CONST Return simd_cast( SSE::Mask x0, SSE::Mask x1, enable_if::value && Mask::Size * 4 == Return::Size> = nullarg) { return SSE::sse_cast<__m128>( _mm_packs_epi16(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_setzero_si128())); } template Vc_INTRINSIC Vc_CONST Return simd_cast( SSE::Mask x0, SSE::Mask x1, SSE::Mask x2, SSE::Mask x3, enable_if::value && Mask::Size * 4 == Return::Size> = nullarg) { return SSE::sse_cast<__m128>(_mm_packs_epi16(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI()))); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask x, enable_if::value> = nullarg) { Return m(false); m[0] = x[0]; return m; } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask x0, Scalar::Mask x1, enable_if::value> = nullarg) { Return m(false); m[0] = x0[0]; m[1] = x1[0]; return m; } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask x0, Scalar::Mask x1, Scalar::Mask x2, Scalar::Mask x3, enable_if::value> = nullarg) { Return m(false); m[0] = x0[0]; m[1] = x1[0]; if (Return::Size >= 4) { m[2] = x2[0]; m[3] = x3[0]; } return m; } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask x0, Scalar::Mask x1, Scalar::Mask x2, Scalar::Mask x3, Scalar::Mask x4, Scalar::Mask x5, Scalar::Mask x6, Scalar::Mask x7, enable_if::value> = nullarg) { Return m(false); m[0] = x0[0]; m[1] = x1[0]; if (Return::Size >= 4) { m[2] = x2[0]; m[3] = x3[0]; } if (Return::Size >= 8) { m[4] = x4[0]; m[5] = x5[0]; m[6] = x6[0]; m[7] = x7[0]; } return m; } template Vc_INTRINSIC Vc_CONST To simd_cast(SSE::Mask x, enable_if::value> = nullarg) { return static_cast(x[0]); } template Vc_INTRINSIC Vc_CONST Return simd_cast(V &&x, enable_if>::value && SSE::is_vector::value) || (SSE::is_mask>::value && SSE::is_mask::value))> = nullarg) { return simd_cast(x); } template Vc_INTRINSIC Vc_CONST Return simd_cast(V &&x, enable_if>::value && SSE::is_vector::value) || (Scalar::is_mask>::value && SSE::is_mask::value))> = nullarg) { return simd_cast(x); } template Vc_INTRINSIC Vc_CONST Return simd_cast( V x, enable_if::value && SSE::is_vector::value)> = nullarg) { constexpr int shift = (sizeof(V) / V::Size) * offset * Return::Size; static_assert(shift > 0 && shift < 16, ""); return simd_cast(V{SSE::sse_cast( _mm_srli_si128(SSE::sse_cast<__m128i>(x.data()), shift & 0xff))}); } template Vc_INTRINSIC Vc_CONST Return simd_cast(SSE::Vector x, enable_if::value> = nullarg) { return static_cast(x[offset]); } template Vc_INTRINSIC Vc_CONST Return simd_cast( V x, enable_if::value && SSE::is_mask::value)> = nullarg) { constexpr int shift = (sizeof(V) / V::Size) * offset * Return::Size; static_assert(shift > 0 && shift < 16, ""); return simd_cast(V{SSE::sse_cast( _mm_srli_si128(SSE::sse_cast<__m128i>(x.data()), shift & 0xff))}); } #undef Vc_SIMD_CAST_1 #undef Vc_SIMD_CAST_2 #undef Vc_SIMD_CAST_4 #undef Vc_SIMD_CAST_8 } #endif #endif #endif #ifdef 
Vc_IMPL_AVX #ifndef VC_AVX_VECTOR_H_ #define VC_AVX_VECTOR_H_ #ifndef VC_AVX_VECTORHELPER_H_ #define VC_AVX_VECTORHELPER_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace AVX { template<> struct VectorHelper<__m256> { typedef __m256 VectorType; typedef const VectorType VTArg; template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_ps(mem, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(mem, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_ps(mem, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_ps()); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if::type = nullptr) { _mm256_maskstore(mem, m, x); } template static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); } }; template<> struct VectorHelper<__m256d> { typedef __m256d VectorType; typedef const VectorType VTArg; template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_pd(mem, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(mem, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_pd(mem, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_pd()); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if::type = nullptr) { _mm256_maskstore(mem, m, x); } template static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); } }; template<> struct VectorHelper<__m256i> { typedef __m256i VectorType; typedef const VectorType VTArg; template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if::type = nullptr) { _mm256_maskstore(mem, m, x); } template static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); } }; #define Vc_OP1(op) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); } #define Vc_OP(op) \ 
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); } #define Vc_OP_(op) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op , Vc_SUFFIX)(a, b); } #define Vc_OPx(op,op2) \ static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); } template<> struct VectorHelper { typedef __m256d VectorType; typedef const VectorType VTArg; typedef double EntryType; #define Vc_SUFFIX pd static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); } static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) { return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d); } static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); } static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { #ifdef Vc_IMPL_FMA4 v1 = _mm256_macc_pd(v1, v2, v3); #else VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast(&c_general::highMaskDouble))); #if defined(Vc_GCC) && Vc_GCC < 0x40703 asm("":"+x"(h1), "+x"(h2)); #endif const VectorType l1 = _mm256_sub_pd(v1, h1); const VectorType l2 = _mm256_sub_pd(v2, h2); const VectorType ll = mul(l1, l2); const VectorType lh = add(mul(l1, h2), mul(h1, l2)); const VectorType hh = mul(h1, h2); const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3)); const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3); const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3); v1 = add(add(ll, b), add(c, hh)); #endif } static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); } static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); } static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); } Vc_OP1(sqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) { return _mm256_div_pd(one(), sqrt(x)); } static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { return _mm256_div_pd(one(), x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd()); } static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); } static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_min_sd(b, _mm_unpackhi_pd(b, b)); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_max_sd(b, _mm_unpackhi_pd(b, b)); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1))); return _mm_cvtsd_f64(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1)); b = _mm_hadd_pd(b, b); return _mm_cvtsd_f64(b); } #undef Vc_SUFFIX 
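// round() below maps to _mm256_round_pd with _MM_FROUND_NINT, i.e. round to the nearest
// integer with ties to even; the __m256 (float) helper further down does the same with
// _mm256_round_ps.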
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return _mm256_round_pd(a, _MM_FROUND_NINT); } }; template<> struct VectorHelper { typedef float EntryType; typedef __m256 VectorType; typedef const VectorType VTArg; #define Vc_SUFFIX ps static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); } static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d, const float e, const float f, const float g, const float h) { return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); } static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); } static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); } static inline void fma(VectorType &v1, VTArg v2, VTArg v3) { #ifdef Vc_IMPL_FMA4 v1 = _mm256_macc_ps(v1, v2, v3); #else __m256d v1_0 = _mm256_cvtps_pd(lo128(v1)); __m256d v1_1 = _mm256_cvtps_pd(hi128(v1)); __m256d v2_0 = _mm256_cvtps_pd(lo128(v2)); __m256d v2_1 = _mm256_cvtps_pd(hi128(v2)); __m256d v3_0 = _mm256_cvtps_pd(lo128(v3)); __m256d v3_1 = _mm256_cvtps_pd(hi128(v3)); v1 = AVX::concat( _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)), _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1))); #endif } static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); } static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); } static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); } Vc_OP1(sqrt) Vc_OP1(rsqrt) static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) { return _mm256_rcp_ps(x); } static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps()); } static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); } static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) { __m128 b = _mm_min_ps(lo128(a), hi128(a)); b = _mm_min_ps(b, _mm_movehl_ps(b, b)); b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) { __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1)); b = _mm_max_ps(b, _mm_movehl_ps(b, b)); b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) { __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1)); b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3))); b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(b); } static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) { __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1)); b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3))); b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1))); return _mm_cvtss_f32(b); } #undef Vc_SUFFIX static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return _mm256_round_ps(a, _MM_FROUND_NINT); } }; #undef Vc_OP1 
#undef Vc_OP #undef Vc_OP_ #undef Vc_OPx } } #endif #ifndef VC_AVX_MASK_H_ #define VC_AVX_MASK_H_ #include #ifndef VC_AVX_DETAIL_H_ #define VC_AVX_DETAIL_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>, typename Flags::EnableIfAligned = nullptr) { return _mm256_load_ps(x); } template Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>, typename Flags::EnableIfUnaligned = nullptr) { return _mm256_loadu_ps(x); } template Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>, typename Flags::EnableIfStreaming = nullptr) { return AvxIntrinsics::stream_load<__m256>(x); } template Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>, typename Flags::EnableIfAligned = nullptr) { return _mm256_load_pd(x); } template Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>, typename Flags::EnableIfUnaligned = nullptr) { return _mm256_loadu_pd(x); } template Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>, typename Flags::EnableIfStreaming = nullptr) { return AvxIntrinsics::stream_load<__m256d>(x); } template ::value>> Vc_INTRINSIC Vc_PURE __m256i load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr) { return _mm256_load_si256(reinterpret_cast(x)); } template ::value>> Vc_INTRINSIC Vc_PURE __m256i load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr) { return _mm256_loadu_si256(reinterpret_cast(x)); } template ::value>> Vc_INTRINSIC Vc_PURE __m256i load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr) { return AvxIntrinsics::stream_load<__m256i>(x); } Vc_INTRINSIC __m256 load32(const float *mem, when_aligned) { return _mm256_load_ps(mem); } Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned) { return _mm256_loadu_ps(mem); } Vc_INTRINSIC __m256 load32(const float *mem, when_streaming) { return AvxIntrinsics::stream_load<__m256>(mem); } Vc_INTRINSIC __m256d load32(const double *mem, when_aligned) { return _mm256_load_pd(mem); } Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned) { return _mm256_loadu_pd(mem); } Vc_INTRINSIC __m256d load32(const double *mem, when_streaming) { return AvxIntrinsics::stream_load<__m256d>(mem); } template Vc_INTRINSIC __m256i load32(const T *mem, when_aligned) { static_assert(std::is_integral::value, "load32 is only intended for integral T"); return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned) { static_assert(std::is_integral::value, "load32 is only intended for integral T"); return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load32(const T *mem, when_streaming) { static_assert(std::is_integral::value, "load32 is only intended for integral T"); return AvxIntrinsics::stream_load<__m256i>(mem); } #ifdef Vc_MSVC Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>) { return _mm256_load_si256(reinterpret_cast(mem)); } Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>) { return _mm256_loadu_pd(mem); } template Vc_INTRINSIC __m256 load(const float *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_ps(mem); } template Vc_INTRINSIC __m256 load(const float *mem, when_unaligned, enable_if<(std::is_same::value && 
std::is_same::value)> = nullarg) { return _mm256_loadu_ps(mem); } template Vc_INTRINSIC __m256 load(const float *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256>(mem); } template Vc_INTRINSIC __m256d load(const double *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_pd(mem); } template Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_pd(mem); } template Vc_INTRINSIC __m256d load(const double *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256d>(mem); } template Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const uint *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256i>(mem); } template Vc_INTRINSIC __m256i load(const int *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const int *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const int *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256i>(mem); } template Vc_INTRINSIC __m256i load(const short *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const short *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const short *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256i>(mem); } template Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_loadu_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return _mm256_load_si256(reinterpret_cast(mem)); } template Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming, enable_if<(std::is_same::value && std::is_same::value)> = nullarg) { return AvxIntrinsics::stream_load<__m256i>(mem); } #endif template Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>) { return load32(mem, f); } template Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>) { return AVX::cvtepu8_epi16(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>) { return AVX::cvtepi8_epi16(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>) { return 
AVX::cvtepu8_epi16(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>) { return load32(mem, f); } template Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>) { return AVX::cvtepu16_epi32(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>) { return AVX::cvtepi16_epi32(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>) { return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>) { return AVX::cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>) { return AVX::cvtepu16_epi32(load16(mem, f)); } template Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>) { return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(mem))); } template Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const ushort *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const short *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const uchar *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256d load(const schar *mem, Flags f, LoadTag<__m256d, double>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>) { return AVX::concat(_mm256_cvtpd_ps(load32(&mem[0], f)), _mm256_cvtpd_ps(load32(&mem[4], f))); } template Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>) { const auto v = load32(mem, f); return _mm256_blendv_ps( _mm256_cvtepi32_ps(v), _mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, AVX::set2power31_epu32())), AVX::set2power31_ps()), _mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256()))); } template Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>) { return AVX::convert(load32(mem, f)); } template ::value>> Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>) { return _mm256_cvtepi32_ps(load<__m256i, int>(mem, f)); } template Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>) { return AVX::convert(load16(mem, f)); } template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k) { return AVX::avx_cast(AVX::zeroExtend( _mm_srli_si128(AVX::hi128(AVX::avx_cast<__m256i>(k)), amount - 16))); } template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T> shifted(T k) { return AVX::avx_cast( AVX::alignr(Mem::permute128(AVX::avx_cast<__m256i>(k)), AVX::avx_cast<__m256i>(k))); } template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k) { return AVX::avx_cast(Mem::permute128(AVX::avx_cast<__m256i>( 
_mm_slli_si128(AVX::lo128(AVX::avx_cast<__m256i>(k)), -16 - amount)))); } template Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T> shifted(T k) { return AVX::avx_cast( AVX::alignr<16 + amount>(AVX::avx_cast<__m256i>(k), Mem::permute128(AVX::avx_cast<__m256i>(k)))); } template Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k) { static_assert(From == To, "Incorrect mask cast."); static_assert(std::is_same::value, "Incorrect mask cast."); return AVX::avx_cast<__m256>(k); } template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k) { return AVX::avx_cast<__m128>(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k))); } template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k) { const auto kk = _mm_castsi128_ps(k); return AVX::concat(_mm_unpacklo_ps(kk, kk), _mm_unpackhi_ps(kk, kk)); } template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k) { return AVX::avx_cast<__m256>(AVX::concat(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128())); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k) { return AVX::avx_cast<__m128>(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128())); } template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k) { return AVX::zeroExtend(AVX::avx_cast<__m128>(k)); } template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k) { return AVX::zeroExtend(mask_cast<4, 8, __m128>(k)); } template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k) { const auto lo = AVX::lo128(AVX::avx_cast<__m256>(k)); return AVX::concat(_mm_unpacklo_ps(lo, lo), _mm_unpackhi_ps(lo, lo)); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k) { return AVX::avx_cast<__m128>(AVX::lo128(k)); } template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k) { const auto tmp = _mm_unpacklo_epi16(k, k); return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp))); } template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k) { return AVX::avx_cast<__m128>(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k))); } template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 8, __m256>(__m128i k) { return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k))); } template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k) { return AVX::zeroExtend(mask_cast<8, 8, __m128>(k)); } #ifdef Vc_IMPL_AVX2 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k) { const auto flipped = Mem::permute4x64(k); return _mm256_castsi256_ps(AVX::unpacklo_epi16(flipped, flipped)); } #endif template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k) { const auto tmp = _mm_unpacklo_epi16(AVX::lo128(k), AVX::lo128(k)); return _mm256_castsi256_ps(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp))); } template<> Vc_INTRINSIC Vc_CONST __m256 allone<__m256 >() { return AVX::setallone_ps(); } template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>() { return AVX::setallone_si256(); } template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>() { return AVX::setallone_pd(); } template<> Vc_INTRINSIC Vc_CONST __m256 zero<__m256 >() { return _mm256_setzero_ps(); } template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>() { return _mm256_setzero_si256(); } template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>() { return _mm256_setzero_pd(); } Vc_INTRINSIC Vc_CONST __m256 one( 
float) { return AVX::setone_ps (); } Vc_INTRINSIC Vc_CONST __m256d one(double) { return AVX::setone_pd (); } Vc_INTRINSIC Vc_CONST __m256i one( int) { return AVX::setone_epi32(); } Vc_INTRINSIC Vc_CONST __m256i one( uint) { return AVX::setone_epu32(); } Vc_INTRINSIC Vc_CONST __m256i one( short) { return AVX::setone_epi16(); } Vc_INTRINSIC Vc_CONST __m256i one(ushort) { return AVX::setone_epu16(); } Vc_INTRINSIC Vc_CONST __m256i one( schar) { return AVX::setone_epi8 (); } Vc_INTRINSIC Vc_CONST __m256i one( uchar) { return AVX::setone_epu8 (); } Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant) { return _mm256_xor_ps(v, AVX::setsignmask_ps()); } Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant) { return _mm256_xor_pd(v, AVX::setsignmask_pd()); } Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant) { return AVX::sign_epi32(v, Detail::allone<__m256i>()); } Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant) { return AVX::sign_epi16(v, Detail::allone<__m256i>()); } Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b) { return _mm256_xor_ps(a, b); } Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b) { return _mm256_xor_pd(a, b); } Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b) { #ifdef Vc_IMPL_AVX2 return _mm256_xor_si256(a, b); #else return _mm256_castps_si256( _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); #endif } Vc_INTRINSIC __m256 or_(__m256 a, __m256 b) { return _mm256_or_ps(a, b); } Vc_INTRINSIC __m256d or_(__m256d a, __m256d b) { return _mm256_or_pd(a, b); } Vc_INTRINSIC __m256i or_(__m256i a, __m256i b) { #ifdef Vc_IMPL_AVX2 return _mm256_or_si256(a, b); #else return _mm256_castps_si256( _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); #endif } Vc_INTRINSIC __m256 and_(__m256 a, __m256 b) { return _mm256_and_ps(a, b); } Vc_INTRINSIC __m256d and_(__m256d a, __m256d b) { return _mm256_and_pd(a, b); } Vc_INTRINSIC __m256i and_(__m256i a, __m256i b) { #ifdef Vc_IMPL_AVX2 return _mm256_and_si256(a, b); #else return _mm256_castps_si256( _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); #endif } Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b) { return _mm256_andnot_ps(a, b); } Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b) { return _mm256_andnot_pd(a, b); } Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b) { #ifdef Vc_IMPL_AVX2 return _mm256_andnot_si256(a, b); #else return _mm256_castps_si256( _mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); #endif } Vc_INTRINSIC __m256 not_(__m256 a) { return andnot_(a, allone<__m256 >()); } Vc_INTRINSIC __m256d not_(__m256d a) { return andnot_(a, allone<__m256d>()); } Vc_INTRINSIC __m256i not_(__m256i a) { return andnot_(a, allone<__m256i>()); } Vc_INTRINSIC __m256 blend(__m256 a, __m256 b, __m256 c) { return _mm256_blendv_ps(a, b, c); } Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c) { return _mm256_blendv_pd(a, b, c); } Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c) { return AVX::blendv_epi8(a, b, c); } Vc_INTRINSIC __m256 abs(__m256 a, float) { return and_(a, AVX::setabsmask_ps()); } Vc_INTRINSIC __m256d abs(__m256d a, double) { return and_(a, AVX::setabsmask_pd()); } Vc_INTRINSIC __m256i abs(__m256i a, int) { return AVX::abs_epi32(a); } Vc_INTRINSIC __m256i abs(__m256i a, uint) { return a; } Vc_INTRINSIC __m256i abs(__m256i a, short) { return AVX::abs_epi16(a); } Vc_INTRINSIC __m256i abs(__m256i a, ushort) { return a; } Vc_INTRINSIC __m256i abs(__m256i a, schar) { return 
AVX::abs_epi8 (a); } Vc_INTRINSIC __m256i abs(__m256i a, uchar) { return a; } Vc_INTRINSIC __m256 add(__m256 a, __m256 b, float) { return _mm256_add_ps(a, b); } Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double) { return _mm256_add_pd(a, b); } Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int) { return AVX::add_epi32(a, b); } Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint) { return AVX::add_epi32(a, b); } Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short) { return AVX::add_epi16(a, b); } Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); } Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float) { return _mm256_sub_ps(a, b); } Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double) { return _mm256_sub_pd(a, b); } Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int) { return AVX::sub_epi32(a, b); } Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uint) { return AVX::sub_epi32(a, b); } Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short) { return AVX::sub_epi16(a, b); } Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); } Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float) { return _mm256_mul_ps(a, b); } Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double) { return _mm256_mul_pd(a, b); } Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, int) { return AVX::mullo_epi32(a, b); } Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, uint) { return AVX::mullo_epi32(a, b); } Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, short) { return AVX::mullo_epi16(a, b); } Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort) { return AVX::mullo_epi16(a, b); } Vc_INTRINSIC __m256 div(__m256 a, __m256 b, float) { return _mm256_div_ps(a, b); } Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double) { return _mm256_div_pd(a, b); } Vc_INTRINSIC __m256i div(__m256i a, __m256i b, int) { using namespace AVX; const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a)); const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b)); const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a)); const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b)); return concat(_mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)), _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2))); } Vc_INTRINSIC __m256i div(__m256i a, __m256i b, uint) { using namespace AVX; const __m256i aa = add_epi32(a, set1_epi32(-2147483648)); const __m256i bb = add_epi32(b, set1_epi32(-2147483648)); const __m256d loa = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(aa)), set1_pd(2147483648.)); const __m256d hia = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(aa)), set1_pd(2147483648.)); const __m256d lob = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(bb)), set1_pd(2147483648.)); const __m256d hib = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(bb)), set1_pd(2147483648.)); return avx_cast<__m256i>(_mm256_blendv_ps( avx_cast<__m256>(concat(_mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)), _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)))), avx_cast<__m256>(a), avx_cast<__m256>(cmpeq_epi32(b, setone_epi32())))); } Vc_INTRINSIC __m256i div(__m256i a, __m256i b, short) { using namespace AVX; const __m256 lo = _mm256_div_ps(convert(lo128(a)), convert(lo128(b))); const __m256 hi = _mm256_div_ps(convert(hi128(a)), convert(hi128(b))); return concat(convert(lo), convert(hi)); } template Vc_INTRINSIC T add(Common::IntrinsicType a, T) { return {add(add(AVX::lo128(a), AVX::hi128(a), T()), T())}; } template Vc_INTRINSIC T mul(Common::IntrinsicType a, T) { return {mul(mul(AVX::lo128(a), AVX::hi128(a), T()), T())}; } template Vc_INTRINSIC T min(Common::IntrinsicType a, T) { return 
{min(min(AVX::lo128(a), AVX::hi128(a), T()), T())}; } template Vc_INTRINSIC T max(Common::IntrinsicType a, T) { return {max(max(AVX::lo128(a), AVX::hi128(a), T()), T())}; } Vc_INTRINSIC __m256 cmpeq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpeq_ps(a, b); } Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); } Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, int) { return AvxIntrinsics::cmpeq_epi32(a, b); } Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uint) { return AvxIntrinsics::cmpeq_epi32(a, b); } Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, short) { return AvxIntrinsics::cmpeq_epi16(a, b); } Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); } Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpneq_ps(a, b); } Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpneq_pd(a, b); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, int) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uint) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, short) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, schar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); } Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uchar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); } Vc_INTRINSIC __m256 cmpgt(__m256 a, __m256 b, float) { return AVX::cmpgt_ps(a, b); } Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double) { return AVX::cmpgt_pd(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (a, b); } Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (a, b); } Vc_INTRINSIC __m256 cmpge(__m256 a, __m256 b, float) { return AVX::cmpge_ps(a, b); } Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double) { return AVX::cmpge_pd(a, b); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (b, a)); } Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (b, a)); } Vc_INTRINSIC __m256 cmple(__m256 a, __m256 b, float) { return AVX::cmple_ps(a, b); } Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double) { return AVX::cmple_pd(a, b); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(a, b)); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(a, b)); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(a, b)); } 
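// Note on the comparison helpers in this group: pre-AVX-512 integer SIMD only
// provides equal-to and signed greater-than compares, so the remaining
// relations here are derived from those two primitives via not_():
//
//     cmpneq(a, b, T())  ==  not_(cmpeq(a, b, T()))
//     cmpge (a, b, T())  ==  not_(cmpgt(b, a, T()))
//     cmple (a, b, T())  ==  not_(cmpgt(a, b, T()))
//     cmplt (a, b, T())  ==  cmpgt(b, a, T())
//
// The unsigned element types dispatch to the AVX::cmpgt_epu* wrappers defined
// elsewhere in this header. Hypothetical usage sketch (not from the original
// sources, assuming AVX2 is available):
//
//     __m256i a  = _mm256_set1_epi32(1);
//     __m256i b  = _mm256_set1_epi32(2);
//     __m256i le = Detail::cmple(a, b, int());  // all-ones mask per element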
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(a, b)); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (a, b)); } Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (a, b)); } Vc_INTRINSIC __m256 cmplt(__m256 a, __m256 b, float) { return AVX::cmplt_ps(a, b); } Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double) { return AVX::cmplt_pd(a, b); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (b, a); } Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (b, a); } Vc_INTRINSIC __m256 fma(__m256 a, __m256 b, __m256 c, float) { #ifdef Vc_IMPL_FMA4 return _mm256_macc_ps(a, b, c); #elif defined Vc_IMPL_FMA return _mm256_fmadd_ps(a, b, c); #else using namespace AVX; __m256d v1_0 = _mm256_cvtps_pd(lo128(a)); __m256d v1_1 = _mm256_cvtps_pd(hi128(a)); __m256d v2_0 = _mm256_cvtps_pd(lo128(b)); __m256d v2_1 = _mm256_cvtps_pd(hi128(b)); __m256d v3_0 = _mm256_cvtps_pd(lo128(c)); __m256d v3_1 = _mm256_cvtps_pd(hi128(c)); return concat(_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)), _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1))); #endif } Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double) { #ifdef Vc_IMPL_FMA4 return _mm256_macc_pd(a, b, c); #elif defined Vc_IMPL_FMA return _mm256_fmadd_pd(a, b, c); #else using namespace AVX; __m256d h1 = and_(a, _mm256_broadcast_sd(reinterpret_cast( &c_general::highMaskDouble))); __m256d h2 = and_(b, _mm256_broadcast_sd(reinterpret_cast( &c_general::highMaskDouble))); const __m256d l1 = _mm256_sub_pd(a, h1); const __m256d l2 = _mm256_sub_pd(b, h2); const __m256d ll = mul(l1, l2, double()); const __m256d lh = add(mul(l1, h2, double()), mul(h1, l2, double()), double()); const __m256d hh = mul(h1, h2, double()); const __m256d lh_lt_v3 = cmplt(abs(lh, double()), abs(c, double()), double()); const __m256d x = _mm256_blendv_pd(c, lh, lh_lt_v3); const __m256d y = _mm256_blendv_pd(lh, c, lh_lt_v3); return add(add(ll, x, double()), add(y, hh, double()), double()); #endif } template Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T) { return add(mul(a, b, T()), c, T()); } template Vc_INTRINSIC __m256i shiftRight(__m256i a, int) { return AVX::srai_epi32(a); } template Vc_INTRINSIC __m256i shiftRight(__m256i a, uint) { return AVX::srli_epi32(a); } template Vc_INTRINSIC __m256i shiftRight(__m256i a, short) { return AVX::srai_epi16(a); } template Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort) { return AVX::srli_epi16(a); } Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, int) { return AVX::sra_epi32(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uint) { return AVX::srl_epi32(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, short) { return AVX::sra_epi16(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort) { return AVX::srl_epi16(a, _mm_cvtsi32_si128(shift)); } template Vc_INTRINSIC __m256i shiftLeft(__m256i a, int) { return AVX::slli_epi32(a); } template Vc_INTRINSIC 
__m256i shiftLeft(__m256i a, uint) { return AVX::slli_epi32(a); } template Vc_INTRINSIC __m256i shiftLeft(__m256i a, short) { return AVX::slli_epi16(a); } template Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort) { return AVX::slli_epi16(a); } Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, int) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uint) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, short) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); } Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m256 x) { return x; } Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x) { return x; } Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x) { return x; } Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m128 x) { return AVX::zeroExtend(x); } Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x) { return AVX::zeroExtend(x); } Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x) { return AVX::zeroExtend(x); } Vc_INTRINSIC __m256 avx_broadcast( float x) { return _mm256_set1_ps(x); } Vc_INTRINSIC __m256d avx_broadcast(double x) { return _mm256_set1_pd(x); } Vc_INTRINSIC __m256i avx_broadcast( int x) { return _mm256_set1_epi32(x); } Vc_INTRINSIC __m256i avx_broadcast( uint x) { return _mm256_set1_epi32(x); } Vc_INTRINSIC __m256i avx_broadcast( short x) { return _mm256_set1_epi16(x); } Vc_INTRINSIC __m256i avx_broadcast(ushort x) { return _mm256_set1_epi16(x); } Vc_INTRINSIC __m256i avx_broadcast( char x) { return _mm256_set1_epi8(x); } Vc_INTRINSIC __m256i avx_broadcast( schar x) { return _mm256_set1_epi8(x); } Vc_INTRINSIC __m256i avx_broadcast( uchar x) { return _mm256_set1_epi8(x); } template = AVXImpl && Impl <= AVX2Impl)>> Vc_CONST_L AVX2::Vector Vc_VDECL sorted(AVX2::Vector x) Vc_CONST_R; template Vc_INTRINSIC Vc_CONST AVX2::Vector sorted(AVX2::Vector x) { return sorted(x); } template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount) { using namespace AVX; constexpr int S = sizeof(T); switch (amount) { case 0: return v; case 1: return shifted( 1 * S)>(v); case 2: return shifted( 2 * S)>(v); case 3: return shifted( 3 * S)>(v); case -1: return shifted(-1 * S)>(v); case -2: return shifted(-2 * S)>(v); case -3: return shifted(-3 * S)>(v); } if (sizeof(T) <= 4) { switch (amount) { case 4: return shifted( 4 * S)>(v); case 5: return shifted( 5 * S)>(v); case 6: return shifted( 6 * S)>(v); case 7: return shifted( 7 * S)>(v); case -4: return shifted(-4 * S)>(v); case -5: return shifted(-5 * S)>(v); case -6: return shifted(-6 * S)>(v); case -7: return shifted(-7 * S)>(v); } if (sizeof(T) <= 2) { switch (amount) { case 8: return shifted( 8 * S)>(v); case 9: return shifted( 9 * S)>(v); case 10: return shifted( 10 * S)>(v); case 11: return shifted( 11 * S)>(v); case 12: return shifted( 12 * S)>(v); case 13: return shifted( 13 * S)>(v); case 14: return shifted( 14 * S)>(v); case 15: return shifted( 15 * S)>(v); case -8: return shifted(- 8 * S)>(v); case -9: return shifted(- 9 * S)>(v); case -10: return shifted(-10 * S)>(v); case -11: return shifted(-11 * S)>(v); case -12: return shifted(-12 * S)>(v); case -13: return shifted(-13 * S)>(v); case -14: return shifted(-14 * S)>(v); case -15: return shifted(-15 * S)>(v); } if (sizeof(T) == 1) { switch (amount) { case 16: return shifted( 16)>(v); case 17: return shifted( 17)>(v); case 18: return shifted( 
18)>(v); case 19: return shifted( 19)>(v); case 20: return shifted( 20)>(v); case 21: return shifted( 21)>(v); case 22: return shifted( 22)>(v); case 23: return shifted( 23)>(v); case 24: return shifted( 24)>(v); case 25: return shifted( 25)>(v); case 26: return shifted( 26)>(v); case 27: return shifted( 27)>(v); case 28: return shifted( 28)>(v); case 29: return shifted( 29)>(v); case 30: return shifted( 30)>(v); case 31: return shifted( 31)>(v); case -16: return shifted(-16)>(v); case -17: return shifted(-17)>(v); case -18: return shifted(-18)>(v); case -19: return shifted(-19)>(v); case -20: return shifted(-20)>(v); case -21: return shifted(-21)>(v); case -22: return shifted(-22)>(v); case -23: return shifted(-23)>(v); case -24: return shifted(-24)>(v); case -25: return shifted(-25)>(v); case -26: return shifted(-26)>(v); case -27: return shifted(-27)>(v); case -28: return shifted(-28)>(v); case -29: return shifted(-29)>(v); case -30: return shifted(-30)>(v); case -31: return shifted(-31)>(v); } } } } return avx_cast(_mm256_setzero_ps()); } template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount) { using namespace AVX; switch (amount) { case 0: return v; case 1: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(1 * sizeof(T)))); case 2: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(2 * sizeof(T)))); case 3: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(3 * sizeof(T)))); case -1: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(1 * sizeof(T)))); case -2: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(2 * sizeof(T)))); case -3: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(3 * sizeof(T)))); } if (sizeof(T) <= 2) { switch (amount) { case 4: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(4 * sizeof(T)))); case 5: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(5 * sizeof(T)))); case 6: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(6 * sizeof(T)))); case 7: return avx_cast(_mm_srli_si128(avx_cast<__m128i>(v), sanitize(7 * sizeof(T)))); case -4: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(4 * sizeof(T)))); case -5: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(5 * sizeof(T)))); case -6: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(6 * sizeof(T)))); case -7: return avx_cast(_mm_slli_si128(avx_cast<__m128i>(v), sanitize(7 * sizeof(T)))); } } return avx_cast(_mm_setzero_ps()); } template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v, int amount) { using namespace AVX; const __m128i vLo = avx_cast<__m128i>(lo128(v)); const __m128i vHi = avx_cast<__m128i>(hi128(v)); switch (static_cast(amount) % N) { case 0: return v; case 1: return avx_cast(concat(SSE::alignr_epi8(vHi, vLo), SSE::alignr_epi8(vLo, vHi))); case 2: return Mem::permute128(v); case 3: return avx_cast(concat(SSE::alignr_epi8(vLo, vHi), SSE::alignr_epi8(vHi, vLo))); } return avx_cast(_mm256_setzero_ps()); } template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v, int amount) { using namespace AVX; const __m128i vLo = avx_cast<__m128i>(lo128(v)); const __m128i vHi = avx_cast<__m128i>(hi128(v)); switch (static_cast(amount) % N) { case 0: return v; case 1: return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi))); case 2: return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo), 
SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi))); case 3: return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi))); case 4: return Mem::permute128(v); case 5: return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo))); case 6: return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo))); case 7: return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo))); } return avx_cast(_mm256_setzero_ps()); } #ifdef Vc_IMPL_AVX2 template static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated( V v, int amount) { using namespace AVX; const __m128i vLo = avx_cast<__m128i>(lo128(v)); const __m128i vHi = avx_cast<__m128i>(hi128(v)); switch (static_cast(amount) % N) { case 0: return v; case 1: return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi))); case 2: return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi))); case 3: return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi))); case 4: return Mem::permute4x64(v); case 5: return avx_cast(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi))); case 6: return avx_cast(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi))); case 7: return avx_cast(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo), SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi))); case 8: return Mem::permute128(v); case 9: return avx_cast(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo))); case 10: return avx_cast(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo))); case 11: return avx_cast(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo))); case 12: return Mem::permute4x64(v); case 13: return avx_cast(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo))); case 14: return avx_cast(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo))); case 15: return avx_cast(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi), SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo))); } return avx_cast(_mm256_setzero_ps()); } #endif Vc_INTRINSIC Vc_CONST int testc(__m128 a, __m128 b) { return _mm_testc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); } Vc_INTRINSIC Vc_CONST int testc(__m256 a, __m256 b) { return _mm256_testc_ps(a, b); } Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b) { return _mm256_testc_pd(a, b); } Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b) { return _mm256_testc_si256(a, b); } Vc_INTRINSIC Vc_CONST int testz(__m128 a, __m128 b) { return _mm_testz_si128(_mm_castps_si128(a), _mm_castps_si128(b)); } Vc_INTRINSIC Vc_CONST int testz(__m256 a, __m256 b) { return _mm256_testz_ps(a, b); } Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b) { return _mm256_testz_pd(a, b); } Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b) { return _mm256_testz_si256(a, b); } Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b) { return _mm_testnzc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); } Vc_INTRINSIC Vc_CONST int testnzc(__m256 a, __m256 b) { return _mm256_testnzc_ps(a, b); } Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b) { return 
_mm256_testnzc_pd(a, b); } Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b) { return _mm256_testnzc_si256(a, b); } Vc_INTRINSIC Vc_CONST int movemask(__m256i a) { return AVX::movemask_epi8(a); } Vc_INTRINSIC Vc_CONST int movemask(__m128i a) { return _mm_movemask_epi8(a); } Vc_INTRINSIC Vc_CONST int movemask(__m256d a) { return _mm256_movemask_pd(a); } Vc_INTRINSIC Vc_CONST int movemask(__m128d a) { return _mm_movemask_pd(a); } Vc_INTRINSIC Vc_CONST int movemask(__m256 a) { return _mm256_movemask_ps(a); } Vc_INTRINSIC Vc_CONST int movemask(__m128 a) { return _mm_movemask_ps(a); } template Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags) { static_assert( N == 4 || N == 8 || N == 16, "mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries"); switch (N) { case 4: *aliasing_cast(mem) = (_mm_movemask_epi8(AVX::lo128(k)) | (_mm_movemask_epi8(AVX::hi128(k)) << 16)) & 0x01010101; break; case 8: { const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15); const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128()); #ifdef __x86_64__ *aliasing_cast(mem) = _mm_cvtsi128_si64(k3); #else *aliasing_cast(mem) = _mm_cvtsi128_si32(k3); *aliasing_cast(mem + 4) = _mm_extract_epi32(k3, 1); #endif } break; case 16: { const auto bools = Detail::and_(_mm_set1_epi8(1), _mm_packs_epi16(AVX::lo128(k), AVX::hi128(k))); if (Flags::IsAligned) { _mm_store_si128(reinterpret_cast<__m128i *>(mem), bools); } else { _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools); } } break; default: Vc_UNREACHABLE(); } } template Vc_INTRINSIC R mask_load(const bool *mem, Flags, enable_if::value> = nullarg) { static_assert(N == 4 || N == 8, "mask_load<__m128>(const bool *) is only implemented for 4, 8 entries"); switch (N) { case 4: { __m128i k = _mm_cvtsi32_si128(*aliasing_cast(mem)); k = _mm_unpacklo_epi8(k, k); k = _mm_unpacklo_epi16(k, k); k = _mm_cmpgt_epi32(k, _mm_setzero_si128()); return AVX::avx_cast<__m128>(k); } case 8: { #ifdef __x86_64__ __m128i k = _mm_cvtsi64_si128(*aliasing_cast(mem)); #else __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast(mem))); #endif return AVX::avx_cast<__m128>( _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128())); } default: Vc_UNREACHABLE(); } } template Vc_INTRINSIC R mask_load(const bool *mem, Flags, enable_if::value> = nullarg) { static_assert( N == 4 || N == 8 || N == 16, "mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries"); switch (N) { case 4: { __m128i k = AVX::avx_cast<__m128i>(_mm_and_ps( _mm_set1_ps(*aliasing_cast(mem)), AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000)))); k = _mm_cmpgt_epi32(k, _mm_setzero_si128()); return AVX::avx_cast<__m256>( AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k))); } case 8: { #ifdef __x86_64__ __m128i k = _mm_cvtsi64_si128(*aliasing_cast(mem)); #else __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast(mem))); #endif k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()); return AVX::avx_cast<__m256>( AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k))); } case 16: { const auto k128 = _mm_cmpgt_epi8( Flags::IsAligned ? 
_mm_load_si128(reinterpret_cast(mem)) : _mm_loadu_si128(reinterpret_cast(mem)), _mm_setzero_si128()); return AVX::avx_cast<__m256>( AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128))); } default: Vc_UNREACHABLE(); return R(); } } template Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R; template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k) { return movemask(AVX::avx_cast<__m256d>(k)); } template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k) { return movemask(AVX::avx_cast<__m256>(k)); } #ifdef Vc_IMPL_BMI2 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k) { return _pext_u32(movemask(k), 0x55555555u); } #endif template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k) { return movemask(k); } template struct InterleaveImpl { template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1) { const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data()); const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data()); using namespace AVX; *aliasing_cast(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0)); *aliasing_cast(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1); *aliasing_cast(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2); *aliasing_cast(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3); *aliasing_cast(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1)); *aliasing_cast(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1); *aliasing_cast(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2); *aliasing_cast(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3); *aliasing_cast(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0)); *aliasing_cast(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1); *aliasing_cast(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2); *aliasing_cast(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3); *aliasing_cast(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1)); *aliasing_cast(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1); *aliasing_cast(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2); *aliasing_cast(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3); } static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i, const typename V::AsArg v0, const typename V::AsArg v1) { const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data()); const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data()); V(Mem::shuffle128(tmp0, tmp1)).store(&data[i[0]], Vc::Unaligned); V(Mem::shuffle128(tmp0, tmp1)).store(&data[i[8]], Vc::Unaligned); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { interleave(data, i, v0, v1); v2.scatter(data + 2, i); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data()); const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data()); const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data()); const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data()); const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); using namespace AVX; auto &&store = [&](__m256i x, int offset) { 
_mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x)); _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x))); }; store(tmp4, 0); store(tmp5, 2); store(tmp6, 4); store(tmp7, 6); } static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data()); const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data()); const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data()); const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data()); const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); V(Mem::shuffle128(tmp4, tmp5)).store(&data[i[0]], ::Vc::Unaligned); V(Mem::shuffle128(tmp6, tmp7)).store(&data[i[4]], ::Vc::Unaligned); V(Mem::shuffle128(tmp4, tmp5)).store(&data[i[8]], ::Vc::Unaligned); V(Mem::shuffle128(tmp6, tmp7)).store(&data[i[12]], ::Vc::Unaligned); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); v4.scatter(data + 4, i); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6, v7); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1) { const __m256i tmp4 = _mm256_setr_epi32( *aliasing_cast(&data[i[0]]), *aliasing_cast(&data[i[1]]), *aliasing_cast(&data[i[2]]), *aliasing_cast(&data[i[3]]), *aliasing_cast(&data[i[8]]), *aliasing_cast(&data[i[9]]), *aliasing_cast(&data[i[10]]), *aliasing_cast(&data[i[11]])); const __m256i tmp5 = _mm256_setr_epi32( *aliasing_cast(&data[i[4]]), *aliasing_cast(&data[i[5]]), *aliasing_cast(&data[i[6]]), *aliasing_cast(&data[i[7]]), *aliasing_cast(&data[i[12]]), *aliasing_cast(&data[i[13]]), *aliasing_cast(&data[i[14]]), *aliasing_cast(&data[i[15]])); const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5); const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5); const __m256i tmp0 = 
AVX::unpacklo_epi16(tmp2, tmp3); const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3); v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2) { using namespace AVX; const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd( *aliasing_cast(&data[i[0]]), *aliasing_cast(&data[i[1]]), *aliasing_cast(&data[i[8]]), *aliasing_cast(&data[i[9]]))); const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd( *aliasing_cast(&data[i[2]]), *aliasing_cast(&data[i[3]]), *aliasing_cast(&data[i[10]]), *aliasing_cast(&data[i[11]]))); const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd( *aliasing_cast(&data[i[4]]), *aliasing_cast(&data[i[5]]), *aliasing_cast(&data[i[12]]), *aliasing_cast(&data[i[13]]))); const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd( *aliasing_cast(&data[i[6]]), *aliasing_cast(&data[i[7]]), *aliasing_cast(&data[i[14]]), *aliasing_cast(&data[i[15]]))); const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6); const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6); const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7); const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7); v0.data() = AVX::unpacklo_epi16(tmp8, tmp10); v1.data() = AVX::unpackhi_epi16(tmp8, tmp10); v2.data() = AVX::unpacklo_epi16(tmp9, tmp11); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3) { using namespace AVX; const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd( *aliasing_cast(&data[i[0]]), *aliasing_cast(&data[i[1]]), *aliasing_cast(&data[i[8]]), *aliasing_cast(&data[i[9]]))); const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd( *aliasing_cast(&data[i[2]]), *aliasing_cast(&data[i[3]]), *aliasing_cast(&data[i[10]]), *aliasing_cast(&data[i[11]]))); const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd( *aliasing_cast(&data[i[4]]), *aliasing_cast(&data[i[5]]), *aliasing_cast(&data[i[12]]), *aliasing_cast(&data[i[13]]))); const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd( *aliasing_cast(&data[i[6]]), *aliasing_cast(&data[i[7]]), *aliasing_cast(&data[i[14]]), *aliasing_cast(&data[i[15]]))); const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6); const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6); const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7); const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7); v0.data() = AVX::unpacklo_epi16(tmp8, tmp10); v1.data() = AVX::unpackhi_epi16(tmp8, tmp10); v2.data() = AVX::unpacklo_epi16(tmp9, tmp11); v3.data() = AVX::unpackhi_epi16(tmp9, tmp11); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { using namespace AVX; const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), 
_mm_loadu_si128(reinterpret_cast(&data[i[10]]))); const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); const __m256i tmp2 = AVX::unpacklo_epi16(a, e); const __m256i tmp4 = AVX::unpacklo_epi16(b, f); const __m256i tmp3 = AVX::unpacklo_epi16(c, g); const __m256i tmp5 = AVX::unpacklo_epi16(d, h); const __m256i tmp10 = AVX::unpackhi_epi16(a, e); const __m256i tmp11 = AVX::unpackhi_epi16(c, g); const __m256i tmp12 = AVX::unpackhi_epi16(b, f); const __m256i tmp13 = AVX::unpackhi_epi16(d, h); const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { using namespace AVX; const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); const __m256i tmp2 = AVX::unpacklo_epi16(a, e); const __m256i tmp4 = AVX::unpacklo_epi16(b, f); const __m256i tmp3 = AVX::unpacklo_epi16(c, g); const __m256i tmp5 = AVX::unpacklo_epi16(d, h); const __m256i tmp10 = AVX::unpackhi_epi16(a, e); const __m256i tmp11 = AVX::unpackhi_epi16(c, g); const __m256i tmp12 = AVX::unpackhi_epi16(b, f); const __m256i tmp13 = AVX::unpackhi_epi16(d, h); const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); 
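// Note (descriptive comment, not from the original sources): the deinterleave
// overloads in this struct all follow the same pattern -- gather one small
// chunk (32, 64 or 128 bits, depending on the member count) per structure
// instance, concat() pairs of chunks into 256-bit registers, and then run a
// few rounds of unpacklo/unpackhi to transpose the AoS layout into one vector
// per member. These low-level helpers presumably back Vc's interleaved memory
// access (e.g. Vc::InterleavedMemoryWrapper); the number of output vectors
// (2..8) selects the overload.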
v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); v5.data() = AVX::unpackhi_epi16(tmp8, tmp9); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { using namespace AVX; const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); const __m256i h = concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); const __m256i tmp2 = AVX::unpacklo_epi16(a, e); const __m256i tmp4 = AVX::unpacklo_epi16(b, f); const __m256i tmp3 = AVX::unpacklo_epi16(c, g); const __m256i tmp5 = AVX::unpacklo_epi16(d, h); const __m256i tmp10 = AVX::unpackhi_epi16(a, e); const __m256i tmp11 = AVX::unpackhi_epi16(c, g); const __m256i tmp12 = AVX::unpackhi_epi16(b, f); const __m256i tmp13 = AVX::unpackhi_epi16(d, h); const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11); const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13); v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); v5.data() = AVX::unpackhi_epi16(tmp8, tmp9); v6.data() = AVX::unpacklo_epi16(tmp14, tmp15); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { using namespace AVX; const __m256i a = concat(_mm_loadu_si128(reinterpret_cast(&data[i[0]])), _mm_loadu_si128(reinterpret_cast(&data[i[8]]))); const __m256i b = concat(_mm_loadu_si128(reinterpret_cast(&data[i[1]])), _mm_loadu_si128(reinterpret_cast(&data[i[9]]))); const __m256i c = concat(_mm_loadu_si128(reinterpret_cast(&data[i[2]])), _mm_loadu_si128(reinterpret_cast(&data[i[10]]))); const __m256i d = concat(_mm_loadu_si128(reinterpret_cast(&data[i[3]])), _mm_loadu_si128(reinterpret_cast(&data[i[11]]))); const __m256i e = concat(_mm_loadu_si128(reinterpret_cast(&data[i[4]])), _mm_loadu_si128(reinterpret_cast(&data[i[12]]))); const __m256i f = concat(_mm_loadu_si128(reinterpret_cast(&data[i[5]])), _mm_loadu_si128(reinterpret_cast(&data[i[13]]))); const __m256i g = concat(_mm_loadu_si128(reinterpret_cast(&data[i[6]])), _mm_loadu_si128(reinterpret_cast(&data[i[14]]))); const __m256i h = 
concat(_mm_loadu_si128(reinterpret_cast(&data[i[7]])), _mm_loadu_si128(reinterpret_cast(&data[i[15]]))); const __m256i tmp2 = AVX::unpacklo_epi16(a, e); const __m256i tmp4 = AVX::unpacklo_epi16(b, f); const __m256i tmp3 = AVX::unpacklo_epi16(c, g); const __m256i tmp5 = AVX::unpacklo_epi16(d, h); const __m256i tmp10 = AVX::unpackhi_epi16(a, e); const __m256i tmp11 = AVX::unpackhi_epi16(c, g); const __m256i tmp12 = AVX::unpackhi_epi16(b, f); const __m256i tmp13 = AVX::unpackhi_epi16(d, h); const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3); const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5); const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3); const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5); const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11); const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13); const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11); const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13); v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); v2.data() = AVX::unpacklo_epi16(tmp6, tmp7); v3.data() = AVX::unpackhi_epi16(tmp6, tmp7); v4.data() = AVX::unpacklo_epi16(tmp8, tmp9); v5.data() = AVX::unpackhi_epi16(tmp8, tmp9); v6.data() = AVX::unpacklo_epi16(tmp14, tmp15); v7.data() = AVX::unpackhi_epi16(tmp14, tmp15); } }; template struct InterleaveImpl { static_assert(sizeof(typename V::value_type) == 4, ""); template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1) { using namespace AVX; const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v1.data())); const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v1.data())); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0)); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1)); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0)); _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1)); _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1)); } static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i, const typename V::AsArg v0, const typename V::AsArg v1) { using namespace AVX; const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v1.data())); const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v1.data())); _mm_storeu_ps(aliasing_cast(&data[i[0]]), lo128(tmp0)); _mm_storeu_ps(aliasing_cast(&data[i[2]]), lo128(tmp1)); _mm_storeu_ps(aliasing_cast(&data[i[4]]), hi128(tmp0)); _mm_storeu_ps(aliasing_cast(&data[i[6]]), hi128(tmp1)); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { using namespace AVX; #ifdef Vc_USE_MASKMOV_SCATTER const m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v2.data())); const m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v2.data())); const m256 tmp2 = _mm256_unpacklo_ps(avx_cast(v1.data()), avx_cast(v1.data())); const m256 tmp3 = _mm256_unpackhi_ps(avx_cast(v1.data()), avx_cast(v1.data())); const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); const m256 tmp7 = 
_mm256_unpackhi_ps(tmp1, tmp3); const m128i mask = _mm_set_epi32(0, -1, -1, -1); _mm_maskstore_ps(aliasing_cast(&data[i[0]]), mask, lo128(tmp4)); _mm_maskstore_ps(aliasing_cast(&data[i[1]]), mask, lo128(tmp5)); _mm_maskstore_ps(aliasing_cast(&data[i[2]]), mask, lo128(tmp6)); _mm_maskstore_ps(aliasing_cast(&data[i[3]]), mask, lo128(tmp7)); _mm_maskstore_ps(aliasing_cast(&data[i[4]]), mask, hi128(tmp4)); _mm_maskstore_ps(aliasing_cast(&data[i[5]]), mask, hi128(tmp5)); _mm_maskstore_ps(aliasing_cast(&data[i[6]]), mask, hi128(tmp6)); _mm_maskstore_ps(aliasing_cast(&data[i[7]]), mask, hi128(tmp7)); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif } static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<3> &i, const typename V::AsArg v0_, const typename V::AsArg v1_, const typename V::AsArg v2_) { __m256 v0 = AVX::avx_cast<__m256>(v0_.data()); __m256 v1 = AVX::avx_cast<__m256>(v1_.data()); __m256 v2 = AVX::avx_cast<__m256>(v2_.data()); v0 = _mm256_shuffle_ps(v0, v0, 0x6c); v1 = _mm256_shuffle_ps(v1, v1, 0xb1); v2 = _mm256_shuffle_ps(v2, v2, 0xc6); __m256 w0 = Mem::blend( Mem::blend(v0, v1), v2); __m256 w1 = Mem::blend( Mem::blend(v0, v1), v2); __m256 w2 = Mem::blend( Mem::blend(v0, v1), v2); _mm256_storeu_ps(aliasing_cast(&data[i[0]]), _mm256_permute2f128_ps(w0, w1, 0x20)); _mm256_storeu_ps(aliasing_cast(&data[i[0]] + 8), w2); _mm256_storeu_ps(aliasing_cast(&data[i[0]] + 16), _mm256_permute2f128_ps(w1, w0, 0x31)); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { using namespace AVX; const __m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v2.data())); const __m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v2.data())); const __m256 tmp2 = _mm256_unpacklo_ps(avx_cast(v1.data()), avx_cast(v3.data())); const __m256 tmp3 = _mm256_unpackhi_ps(avx_cast(v1.data()), avx_cast(v3.data())); const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2); const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2); const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3); const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3); _mm_storeu_ps(aliasing_cast(&data[i[0]]), lo128(_04)); _mm_storeu_ps(aliasing_cast(&data[i[1]]), lo128(_15)); _mm_storeu_ps(aliasing_cast(&data[i[2]]), lo128(_26)); _mm_storeu_ps(aliasing_cast(&data[i[3]]), lo128(_37)); _mm_storeu_ps(aliasing_cast(&data[i[4]]), hi128(_04)); _mm_storeu_ps(aliasing_cast(&data[i[5]]), hi128(_15)); _mm_storeu_ps(aliasing_cast(&data[i[6]]), hi128(_26)); _mm_storeu_ps(aliasing_cast(&data[i[7]]), hi128(_37)); } static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { using namespace AVX; const __m256 tmp0 = _mm256_unpacklo_ps(avx_cast(v0.data()), avx_cast(v2.data())); const __m256 tmp1 = _mm256_unpackhi_ps(avx_cast(v0.data()), avx_cast(v2.data())); const __m256 tmp2 = _mm256_unpacklo_ps(avx_cast(v1.data()), avx_cast(v3.data())); const __m256 tmp3 = _mm256_unpackhi_ps(avx_cast(v1.data()), avx_cast(v3.data())); const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2); const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2); const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3); const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3); _mm256_storeu_ps(aliasing_cast(&data[i[0]]), _mm256_permute2f128_ps(_04, _15, 0x20)); 
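// Note (descriptive comment, not from the original sources): with successive
// indices the interleaved result is written as four contiguous unaligned
// 256-bit stores. _mm256_permute2f128_ps with selector 0x20 combines the low
// 128-bit halves of its two inputs and 0x31 the high halves, so the four
// stores of this overload emit elements 0..7, 8..15, 16..23 and 24..31 of the
// interleaved (v0, v1, v2, v3) stream in order.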
_mm256_storeu_ps(aliasing_cast(&data[i[0]] + 8), _mm256_permute2f128_ps(_26, _37, 0x20)); _mm256_storeu_ps(aliasing_cast(&data[i[0]] + 16), _mm256_permute2f128_ps(_04, _15, 0x31)); _mm256_storeu_ps(aliasing_cast(&data[i[0]] + 24), _mm256_permute2f128_ps(_26, _37, 0x31)); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); v4.scatter(data + 4, i); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6, v7); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1) { using namespace AVX; const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[0]])); const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]])); const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]])); const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]])); const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&data[i[1]])); const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&data[i[3]])); const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&data[i[5]])); const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&data[i[7]])); const m256 tmp2 = concat(il01, il45); const m256 tmp3 = concat(il23, il67); const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); v0.data() = avx_cast(_mm256_unpacklo_ps(tmp0, tmp1)); v1.data() = avx_cast(_mm256_unpackhi_ps(tmp0, tmp1)); } static inline void deinterleave(typename V::EntryType const *const data, const Common::SuccessiveEntries<2> &i, V &v0, V &v1) { using namespace AVX; const m256 il0123 = _mm256_loadu_ps(aliasing_cast(&data[i[0]])); const m256 il4567 = _mm256_loadu_ps(aliasing_cast(&data[i[4]])); const m256 tmp2 = Mem::shuffle128(il0123, il4567); const m256 tmp3 = Mem::shuffle128(il0123, il4567); const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3); v0.data() = avx_cast(_mm256_unpacklo_ps(tmp0, tmp1)); v1.data() = avx_cast(_mm256_unpackhi_ps(tmp0, tmp1)); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2) { using namespace AVX; const m128 il0 = 
_mm_loadu_ps(aliasing_cast(&data[i[0]])); const m128 il1 = _mm_loadu_ps(aliasing_cast(&data[i[1]])); const m128 il2 = _mm_loadu_ps(aliasing_cast(&data[i[2]])); const m128 il3 = _mm_loadu_ps(aliasing_cast(&data[i[3]])); const m128 il4 = _mm_loadu_ps(aliasing_cast(&data[i[4]])); const m128 il5 = _mm_loadu_ps(aliasing_cast(&data[i[5]])); const m128 il6 = _mm_loadu_ps(aliasing_cast(&data[i[6]])); const m128 il7 = _mm_loadu_ps(aliasing_cast(&data[i[7]])); const m256 il04 = concat(il0, il4); const m256 il15 = concat(il1, il5); const m256 il26 = concat(il2, il6); const m256 il37 = concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v0.data() = avx_cast(_mm256_unpacklo_ps(ab0246, ab1357)); v1.data() = avx_cast(_mm256_unpackhi_ps(ab0246, ab1357)); v2.data() = avx_cast(_mm256_unpacklo_ps(cd0246, cd1357)); } static inline void deinterleave(typename V::EntryType const *const data, const Common::SuccessiveEntries<3> &i, V &v0, V &v1, V &v2) { __m256 in0 = _mm256_loadu_ps(aliasing_cast(&data[i[0]] + 0)); __m256 in1 = _mm256_loadu_ps(aliasing_cast(&data[i[0]] + 8)); __m256 in2 = _mm256_loadu_ps(aliasing_cast(&data[i[0]] + 16)); const __m256 aaabffgg = _mm256_permute2f128_ps(in0, in2, 0x20); const __m256 cdddeeef = in1; const __m256 bbccghhh = _mm256_permute2f128_ps(in0, in2, 0x31); const __m256 x0 = _mm256_blend_ps( _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80), bbccghhh, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0); const __m256 x1 = _mm256_blend_ps( _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0), bbccghhh, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0); const __m256 x2 = _mm256_blend_ps( _mm256_blend_ps(aaabffgg, cdddeeef, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0), bbccghhh, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80); v0 = AVX::avx_cast(_mm256_shuffle_ps(x0, x0, 0x6c)); v1 = AVX::avx_cast(_mm256_shuffle_ps(x1, x1, 0xb1)); v2 = AVX::avx_cast(_mm256_shuffle_ps(x2, x2, 0xc6)); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3) { using namespace AVX; const m128 il0 = _mm_loadu_ps(aliasing_cast(&data[i[0]])); const m128 il1 = _mm_loadu_ps(aliasing_cast(&data[i[1]])); const m128 il2 = _mm_loadu_ps(aliasing_cast(&data[i[2]])); const m128 il3 = _mm_loadu_ps(aliasing_cast(&data[i[3]])); const m128 il4 = _mm_loadu_ps(aliasing_cast(&data[i[4]])); const m128 il5 = _mm_loadu_ps(aliasing_cast(&data[i[5]])); const m128 il6 = _mm_loadu_ps(aliasing_cast(&data[i[6]])); const m128 il7 = _mm_loadu_ps(aliasing_cast(&data[i[7]])); const m256 il04 = concat(il0, il4); const m256 il15 = concat(il1, il5); const m256 il26 = concat(il2, il6); const m256 il37 = concat(il3, il7); const m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v0.data() = avx_cast(_mm256_unpacklo_ps(ab0246, ab1357)); v1.data() = avx_cast(_mm256_unpackhi_ps(ab0246, ab1357)); v2.data() = avx_cast(_mm256_unpacklo_ps(cd0246, cd1357)); v3.data() = avx_cast(_mm256_unpackhi_ps(cd0246, cd1357)); } static inline void deinterleave(typename V::EntryType const *const data, const Common::SuccessiveEntries<4> &i, V &v0, V &v1, V &v2, V &v3) { using namespace AVX; const __m256 il01 = _mm256_loadu_ps( aliasing_cast(&data[i[0]])); const __m256 il23 = _mm256_loadu_ps( 
aliasing_cast(&data[i[2]])); const __m256 il45 = _mm256_loadu_ps( aliasing_cast(&data[i[4]])); const __m256 il67 = _mm256_loadu_ps( aliasing_cast(&data[i[6]])); const __m256 il04 = _mm256_permute2f128_ps(il01, il45, 0x20); const __m256 il15 = _mm256_permute2f128_ps(il01, il45, 0x31); const __m256 il26 = _mm256_permute2f128_ps(il23, il67, 0x20); const __m256 il37 = _mm256_permute2f128_ps(il23, il67, 0x31); const __m256 ab0246 = _mm256_unpacklo_ps(il04, il26); const __m256 ab1357 = _mm256_unpacklo_ps(il15, il37); const __m256 cd0246 = _mm256_unpackhi_ps(il04, il26); const __m256 cd1357 = _mm256_unpackhi_ps(il15, il37); v0.data() = avx_cast(_mm256_unpacklo_ps(ab0246, ab1357)); v1.data() = avx_cast(_mm256_unpackhi_ps(ab0246, ab1357)); v2.data() = avx_cast(_mm256_unpacklo_ps(cd0246, cd1357)); v3.data() = avx_cast(_mm256_unpackhi_ps(cd0246, cd1357)); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { v4.gather(data + 4, i); deinterleave(data, i, v0, v1, v2, v3); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5); } static inline void deinterleave(typename V::EntryType const *const data, const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { using namespace AVX; const m256 a = _mm256_loadu_ps(aliasing_cast(&data[i[0]])); const m256 b = _mm256_loadu_ps(aliasing_cast(&data[i[0] + 1 * V::Size])); const m256 c = _mm256_loadu_ps(aliasing_cast(&data[i[0] + 2 * V::Size])); const m256 d = _mm256_loadu_ps(aliasing_cast(&data[i[0] + 3 * V::Size])); const m256 e = _mm256_loadu_ps(aliasing_cast(&data[i[0] + 4 * V::Size])); const m256 f = _mm256_loadu_ps(aliasing_cast(&data[i[0] + 5 * V::Size])); const __m256 tmp2 = Mem::shuffle128(a, d); const __m256 tmp3 = Mem::shuffle128(b, e); const __m256 tmp4 = Mem::shuffle128(a, d); const __m256 tmp5 = Mem::shuffle128(c, f); const __m256 tmp8 = Mem::shuffle128(b, e); const __m256 tmp9 = Mem::shuffle128(c, f); const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3); const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5); const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3); const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9); const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5); const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9); v0.data() = avx_cast(_mm256_unpacklo_ps(tmp0, tmp1)); v1.data() = avx_cast(_mm256_unpackhi_ps(tmp0, tmp1)); v2.data() = avx_cast(_mm256_unpacklo_ps(tmp6, tmp7)); v3.data() = avx_cast(_mm256_unpackhi_ps(tmp6, tmp7)); v4.data() = avx_cast(_mm256_unpacklo_ps(tmp10, tmp11)); v5.data() = avx_cast(_mm256_unpackhi_ps(tmp10, tmp11)); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5, v6); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { deinterleave(data, i, v0, v1, v2, v3); deinterleave(data + 4, i, v4, v5, v6, v7); } }; template struct InterleaveImpl { template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1) { using namespace AVX; const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), 
v1.data()); _mm_storeu_pd(&data[i[0]], lo128(tmp0)); _mm_storeu_pd(&data[i[1]], lo128(tmp1)); _mm_storeu_pd(&data[i[2]], hi128(tmp0)); _mm_storeu_pd(&data[i[3]], hi128(tmp1)); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2) { using namespace AVX; #ifdef Vc_USE_MASKMOV_SCATTER const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data()); const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data()); #if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64)) const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1)); #else const m256i mask = _mm256_set_epi64x(0, -1, -1, -1); #endif _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128(tmp0, tmp2)); _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128(tmp1, tmp3)); _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128(tmp0, tmp2)); _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128(tmp1, tmp3)); #else interleave(data, i, v0, v1); v2.scatter(data + 2, i); #endif } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3) { using namespace AVX; const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data()); const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data()); const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data()); const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data()); _mm_storeu_pd(&data[i[0] ], lo128(tmp0)); _mm_storeu_pd(&data[i[0]+2], lo128(tmp2)); _mm_storeu_pd(&data[i[1] ], lo128(tmp1)); _mm_storeu_pd(&data[i[1]+2], lo128(tmp3)); _mm_storeu_pd(&data[i[2] ], hi128(tmp0)); _mm_storeu_pd(&data[i[2]+2], hi128(tmp2)); _mm_storeu_pd(&data[i[3] ], hi128(tmp1)); _mm_storeu_pd(&data[i[3]+2], hi128(tmp3)); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4) { interleave(data, i, v0, v1, v2, v3); v4.scatter(data + 4, i); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6); } template static inline void interleave(typename V::EntryType *const data, const I &i, const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4, const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7) { interleave(data, i, v0, v1, v2, v3); interleave(data + 4, i, v4, v5, v6, v7); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1) { using namespace Vc::AVX; const m256d ab02 = 
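// Two interleaved doubles are read per index; concatenating the 128-bit loads
// for indexes 0/2 and 1/3 lines the pairs up so that a single unpacklo_pd /
// unpackhi_pd separates the first and second member into v0 and v1.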
concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]])); const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]])); v0.data() = _mm256_unpacklo_pd(ab02, ab13); v1.data() = _mm256_unpackhi_pd(ab02, ab13); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2) { v2.gather(data + 2, i); deinterleave(data, i, v0, v1); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4) { v4.gather(data + 4, i); deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); deinterleave(data + 4, i, v4, v5); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) { v6.gather(data + 6, i); deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); deinterleave(data + 4, i, v4, v5); } template static inline void deinterleave(typename V::EntryType const *const data, const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) { deinterleave(data, i, v0, v1); deinterleave(data + 2, i, v2, v3); deinterleave(data + 4, i, v4, v5); deinterleave(data + 6, i, v6, v7); } }; } } #endif namespace Vc_VERSIONED_NAMESPACE { template class Mask { public: using abi = VectorAbi::Avx; typedef bool EntryType; using value_type = EntryType; using MaskBool = Common::MaskBool; using VectorEntryType = MaskBool; using Vector = AVX2::Vector; using VectorTypeF = AVX::FloatVectorType::Type>; using VectorTypeD = AVX::DoubleVectorType; using VectorTypeI = AVX::IntegerVectorType; private: typedef const VectorTypeF VArg; typedef const VectorTypeD VdArg; typedef const VectorTypeI ViArg; public: static constexpr size_t Size = sizeof(VectorTypeF) / sizeof(T); static constexpr size_t MemoryAlignment = Size; static constexpr std::size_t size() { return Size; } Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType)); private: typedef Common::Storage Storage; public: using VectorType = typename Storage::VectorType; using EntryReference = Vc::Detail::ElementReference; using reference = EntryReference; #if defined Vc_MSVC && defined _WIN32 typedef const Mask &AsArg; #else typedef const Mask AsArg; #endif Vc_INTRINSIC Mask() {} Vc_INTRINSIC Mask(VArg x) : d(AVX::avx_cast(x)) {} Vc_INTRINSIC Mask(VdArg x) : d(AVX::avx_cast(x)) {} Vc_INTRINSIC Mask(ViArg x) : d(AVX::avx_cast(x)) {} Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : d(Detail::zero()) {} Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : d(Detail::allone()) {} Vc_INTRINSIC explicit Mask(bool b) : d(b ? 
Detail::allone() : Detail::zero()) { } Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; } Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; } template Vc_INTRINSIC Mask( U &&rhs, Common::enable_if_mask_converts_implicitly = nullarg) : d(AVX::avx_cast( Detail::mask_cast::Size, Size, VectorTypeF>( rhs.dataI()))) { } #if Vc_IS_VERSION_1 template Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "mask types") Vc_INTRINSIC explicit Mask(U &&rhs, Common::enable_if_mask_converts_explicitly = nullarg); #endif template Vc_INTRINSIC explicit Mask(const bool *mem, Flags f = Flags()) { load(mem, f); } template Vc_INTRINSIC void load(const bool *mem, Flags = Flags()); template Vc_INTRINSIC void store(bool *mem, Flags = Flags()) const; Vc_INTRINSIC Mask &operator=(const Mask &) = default; Vc_INTRINSIC_L Mask &operator=(const std::array &values) Vc_INTRINSIC_R; Vc_INTRINSIC_L operator std::array() const Vc_INTRINSIC_R; Vc_INTRINSIC Vc_PURE bool operator==(const Mask &rhs) const { return Detail::movemask(d.v()) == Detail::movemask(rhs.d.v()); } Vc_INTRINSIC Vc_PURE bool operator!=(const Mask &rhs) const { return !operator==(rhs); } Vc_INTRINSIC Mask operator!() const { #ifdef Vc_GCC return ~dataI(); #else return Detail::andnot_(dataF(), Detail::allone()); #endif } Vc_INTRINSIC Mask &operator&=(const Mask &rhs) { d.v() = AVX::avx_cast(Detail::and_(data(), rhs.data())); return *this; } Vc_INTRINSIC Mask &operator|=(const Mask &rhs) { d.v() = AVX::avx_cast(Detail::or_ (data(), rhs.data())); return *this; } Vc_INTRINSIC Mask &operator^=(const Mask &rhs) { d.v() = AVX::avx_cast(Detail::xor_(data(), rhs.data())); return *this; } Vc_INTRINSIC Vc_PURE Mask operator&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); } Vc_INTRINSIC Vc_PURE Mask operator|(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); } Vc_INTRINSIC Vc_PURE Mask operator^(const Mask &rhs) const { return Detail::xor_(data(), rhs.data()); } Vc_INTRINSIC Vc_PURE Mask operator&&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); } Vc_INTRINSIC Vc_PURE Mask operator||(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); } Vc_INTRINSIC_L bool isNotEmpty() const Vc_INTRINSIC_R; Vc_INTRINSIC_L bool isEmpty() const Vc_INTRINSIC_R; Vc_INTRINSIC_L bool isFull() const Vc_INTRINSIC_R; Vc_INTRINSIC_L bool isMix() const Vc_INTRINSIC_R; Vc_INTRINSIC Vc_PURE int shiftMask() const { return Detail::movemask(dataI()); } Vc_INTRINSIC Vc_PURE int toInt() const { return Detail::mask_to_int(dataI()); } Vc_INTRINSIC VectorType data () const { return d.v(); } Vc_INTRINSIC VectorTypeF dataF() const { return AVX::avx_cast(d.v()); } Vc_INTRINSIC VectorTypeI dataI() const { return AVX::avx_cast(d.v()); } Vc_INTRINSIC VectorTypeD dataD() const { return AVX::avx_cast(d.v()); } private: friend reference; static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept { return m.toInt() & (1 << i); } template static Vc_INTRINSIC void set(Mask &m, int i, U &&v) noexcept(noexcept(MaskBool(std::declval()))) { m.d.set(i, MaskBool(std::forward(v))); } public: Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept { return {*this, int(index)}; } Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept { return get(*this, index); } Vc_INTRINSIC Vc_PURE int count() const { return Detail::popcnt16(toInt()); } Vc_INTRINSIC Vc_PURE int firstOne() const { return _bit_scan_forward(toInt()); } template static Vc_INTRINSIC_L Mask generate(G &&gen) 
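// generate() constructs a mask from a callable evaluated once per entry, and
// shifted() shifts the mask by whole entries (implemented as byte shifts of
// the underlying register); both are declared here and defined out-of-line
// further below.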
Vc_INTRINSIC_R; Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R; private: #ifdef Vc_COMPILE_BENCHMARKS public: #endif Storage d; }; template constexpr size_t Mask::Size; template constexpr size_t Mask::MemoryAlignment; } namespace Vc_VERSIONED_NAMESPACE { template template Vc_INTRINSIC void Mask::store(bool *mem, Flags f) const { Detail::mask_store(dataI(), mem, f); } template template Vc_INTRINSIC void Mask::load(const bool *mem, Flags f) { d.v() = AVX::avx_cast(Detail::mask_load(mem, f)); } #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC Vc_PURE bool AVX2::Mask::get(const AVX2::Mask &m, int index) noexcept { return m.shiftMask() & (1 << 2 * index); } template <> Vc_INTRINSIC Vc_PURE bool AVX2::Mask::get(const AVX2::Mask &m, int index) noexcept { return m.shiftMask() & (1 << 2 * index); } #endif template <> Vc_INTRINSIC Vc_PURE bool AVX2::double_m::operator==(const AVX2::double_m &rhs) const { return Detail::movemask(dataD()) == Detail::movemask(rhs.dataD()); } #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC Vc_PURE bool AVX2::short_m::operator==(const AVX2::short_m &rhs) const { return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); } template <> Vc_INTRINSIC Vc_PURE bool AVX2::ushort_m::operator==(const AVX2::ushort_m &rhs) const { return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); } #endif template Vc_INTRINSIC bool Mask::isFull() const { if (sizeof(T) == 8) { return 0 != Detail::testc(dataD(), Detail::allone()); } else if (sizeof(T) == 4) { return 0 != Detail::testc(dataF(), Detail::allone()); } else { return 0 != Detail::testc(dataI(), Detail::allone()); } } template Vc_INTRINSIC bool Mask::isNotEmpty() const { if (sizeof(T) == 8) { return 0 == Detail::testz(dataD(), dataD()); } else if (sizeof(T) == 4) { return 0 == Detail::testz(dataF(), dataF()); } else { return 0 == Detail::testz(dataI(), dataI()); } } template Vc_INTRINSIC bool Mask::isEmpty() const { if (sizeof(T) == 8) { return 0 != Detail::testz(dataD(), dataD()); } else if (sizeof(T) == 4) { return 0 != Detail::testz(dataF(), dataF()); } else { return 0 != Detail::testz(dataI(), dataI()); } } template Vc_INTRINSIC bool Mask::isMix() const { if (sizeof(T) == 8) { return 0 != Detail::testnzc(dataD(), Detail::allone()); } else if (sizeof(T) == 4) { return 0 != Detail::testnzc(dataF(), Detail::allone()); } else { return 0 != Detail::testnzc(dataI(), Detail::allone()); } } template Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { return _mm256_setr_epi64x( gen(0) ? 0xffffffffffffffffull : 0, gen(1) ? 0xffffffffffffffffull : 0, gen(2) ? 0xffffffffffffffffull : 0, gen(3) ? 0xffffffffffffffffull : 0); } template Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { return _mm256_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0, gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0, gen(4) ? 0xfffffffful : 0, gen(5) ? 0xfffffffful : 0, gen(6) ? 0xfffffffful : 0, gen(7) ? 0xfffffffful : 0); } template Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant) { return _mm256_setr_epi16(gen(0) ? 0xfffful : 0, gen(1) ? 0xfffful : 0, gen(2) ? 0xfffful : 0, gen(3) ? 0xfffful : 0, gen(4) ? 0xfffful : 0, gen(5) ? 0xfffful : 0, gen(6) ? 0xfffful : 0, gen(7) ? 0xfffful : 0, gen(8) ? 0xfffful : 0, gen(9) ? 0xfffful : 0, gen(10) ? 0xfffful : 0, gen(11) ? 0xfffful : 0, gen(12) ? 0xfffful : 0, gen(13) ? 0xfffful : 0, gen(14) ? 0xfffful : 0, gen(15) ? 
0xfffful : 0); } template template Vc_INTRINSIC AVX2::Mask Mask::generate(G &&gen) { return generate_impl>(std::forward(gen), std::integral_constant()); } template Vc_INTRINSIC Vc_PURE AVX2::Mask Mask::shifted(int amount) const { switch (amount * int(sizeof(VectorEntryType))) { case 0: return *this; case 1: return Detail::shifted< 1>(dataI()); case 2: return Detail::shifted< 2>(dataI()); case 3: return Detail::shifted< 3>(dataI()); case 4: return Detail::shifted< 4>(dataI()); case 5: return Detail::shifted< 5>(dataI()); case 6: return Detail::shifted< 6>(dataI()); case 7: return Detail::shifted< 7>(dataI()); case 8: return Detail::shifted< 8>(dataI()); case 9: return Detail::shifted< 9>(dataI()); case 10: return Detail::shifted< 10>(dataI()); case 11: return Detail::shifted< 11>(dataI()); case 12: return Detail::shifted< 12>(dataI()); case 13: return Detail::shifted< 13>(dataI()); case 14: return Detail::shifted< 14>(dataI()); case 15: return Detail::shifted< 15>(dataI()); case 16: return Detail::shifted< 16>(dataI()); case 17: return Detail::shifted< 17>(dataI()); case 18: return Detail::shifted< 18>(dataI()); case 19: return Detail::shifted< 19>(dataI()); case 20: return Detail::shifted< 20>(dataI()); case 21: return Detail::shifted< 21>(dataI()); case 22: return Detail::shifted< 22>(dataI()); case 23: return Detail::shifted< 23>(dataI()); case 24: return Detail::shifted< 24>(dataI()); case 25: return Detail::shifted< 25>(dataI()); case 26: return Detail::shifted< 26>(dataI()); case 27: return Detail::shifted< 27>(dataI()); case 28: return Detail::shifted< 28>(dataI()); case 29: return Detail::shifted< 29>(dataI()); case 30: return Detail::shifted< 30>(dataI()); case 31: return Detail::shifted< 31>(dataI()); case -1: return Detail::shifted< -1>(dataI()); case -2: return Detail::shifted< -2>(dataI()); case -3: return Detail::shifted< -3>(dataI()); case -4: return Detail::shifted< -4>(dataI()); case -5: return Detail::shifted< -5>(dataI()); case -6: return Detail::shifted< -6>(dataI()); case -7: return Detail::shifted< -7>(dataI()); case -8: return Detail::shifted< -8>(dataI()); case -9: return Detail::shifted< -9>(dataI()); case -10: return Detail::shifted<-10>(dataI()); case -11: return Detail::shifted<-11>(dataI()); case -12: return Detail::shifted<-12>(dataI()); case -13: return Detail::shifted<-13>(dataI()); case -14: return Detail::shifted<-14>(dataI()); case -15: return Detail::shifted<-15>(dataI()); case -16: return Detail::shifted<-16>(dataI()); case -17: return Detail::shifted<-17>(dataI()); case -18: return Detail::shifted<-18>(dataI()); case -19: return Detail::shifted<-19>(dataI()); case -20: return Detail::shifted<-20>(dataI()); case -21: return Detail::shifted<-21>(dataI()); case -22: return Detail::shifted<-22>(dataI()); case -23: return Detail::shifted<-23>(dataI()); case -24: return Detail::shifted<-24>(dataI()); case -25: return Detail::shifted<-25>(dataI()); case -26: return Detail::shifted<-26>(dataI()); case -27: return Detail::shifted<-27>(dataI()); case -28: return Detail::shifted<-28>(dataI()); case -29: return Detail::shifted<-29>(dataI()); case -30: return Detail::shifted<-30>(dataI()); case -31: return Detail::shifted<-31>(dataI()); } return Zero(); } } #endif #include #include #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template struct VectorTraits { using mask_type = Vc::Mask; using vector_type = Vc::Vector; using writemasked_vector_type = Common::WriteMaskedVector; using 
intrinsic_type = typename AVX::VectorTypeHelper::Type; }; } #define Vc_CURRENT_CLASS_NAME Vector template class Vector { public: using abi = VectorAbi::Avx; private: using traits_type = Detail::VectorTraits; static_assert( std::is_arithmetic::value, "Vector only accepts arithmetic builtin types as template parameter T."); using WriteMaskedVector = typename traits_type::writemasked_vector_type; public: using VectorType = typename traits_type::intrinsic_type; using vector_type = VectorType; using mask_type = typename traits_type::mask_type; using Mask = mask_type; using MaskType = mask_type; using MaskArg Vc_DEPRECATED_ALIAS("Use MaskArgument instead.") = typename Mask::AsArg; using MaskArgument = typename Mask::AsArg; using reference = Detail::ElementReference; Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType)); using EntryType = T; using value_type = EntryType; typedef EntryType VectorEntryType; static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType); static constexpr size_t MemoryAlignment = alignof(VectorType); using IndexType = fixed_size_simd; using index_type = IndexType; typedef Vector AsArg; typedef VectorType VectorTypeArg; protected: template using V = Vector; typedef AVX::VectorHelper HV; typedef AVX::VectorHelper HT; template static Vc_INTRINSIC VectorType _cast(V v) { return AVX::avx_cast(v); } typedef Common::VectorMemoryUnion StorageType; StorageType d; using WidthT = Common::WidthT; public: public: Vc_INTRINSIC Vector() = default; static constexpr std::size_t size() { return Size; } explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R; explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R; static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); } static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); } static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero() { return Vector(Vc::IndexesFromZero); } template ()(size_t())), value_type>::value>::type> explicit Vector(G &&g) : Vector(generate(std::forward(g))) { } static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {} template Vc_INTRINSIC Vector( V x, typename std::enable_if::value, void *>::type = nullptr) : d(AVX::convert(x.data())) { } #if Vc_IS_VERSION_1 template Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "vector types") Vc_INTRINSIC explicit Vector( V x, typename std::enable_if::value, void *>::type = nullptr) : d(Detail::zeroExtendIfNeeded(AVX::convert(x.data()))) { } template ::value && !std::is_same>::value>> Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between " "vector types") Vc_INTRINSIC_L explicit Vector(U &&x) Vc_INTRINSIC_R; #endif Vc_INTRINSIC explicit Vector(reference a) : Vector(static_cast(a)) {} Vc_INTRINSIC Vector(EntryType a) : d(Detail::avx_broadcast(a)) {} template Vc_INTRINSIC Vector(U a, typename std::enable_if::value && !std::is_same::value, void *>::type = nullptr) : Vector(static_cast(a)) { } explicit Vector(std::initializer_list) { static_assert(std::is_same::value, "A SIMD vector object cannot be initialized from an initializer list " "because the number of entries in the vector is target-dependent."); } explicit Vc_INTRINSIC Vector(const EntryType *mem) { load(mem); } template ::value>> explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags) { load(mem, flags); } template ::value || 
!std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value &&Traits::is_load_store_flag::value>> explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags()) { load(x, flags); } Vc_INTRINSIC void load(const EntryType *mem) { load(mem, DefaultLoadTag()); } template Vc_INTRINSIC enable_if::value, void> load(const EntryType *mem, Flags flags) { load(mem, flags); } private: template struct load_concept : public std::enable_if< (!std::is_integral::value || !std::is_integral::value || sizeof(EntryType) >= sizeof(U)) && std::is_arithmetic::value && Traits::is_load_store_flag::value, void> {}; public: template Vc_INTRINSIC_L typename load_concept::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R; template < typename U, typename Flags = DefaultStoreTag, typename = enable_if::value &&Traits::is_load_store_flag::value>> Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R; template < typename U, typename Flags = DefaultStoreTag, typename = enable_if::value &&Traits::is_load_store_flag::value>> Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R; Vc_INTRINSIC void store(EntryType *mem) const { store(mem, DefaultStoreTag()); } template ::value>> Vc_INTRINSIC void store(EntryType *mem, Flags flags) const { store(mem, flags); } Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const { store(mem, mask, DefaultStoreTag()); } template ::value>> Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const { store(mem, mask, flags); } Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R; Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R; Vc_INTRINSIC_L void setQnan(MaskArgument k) Vc_INTRINSIC_R; #ifndef Vc_CURRENT_CLASS_NAME #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors." 
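// The gather/scatter interface that follows is shared code: the public
// gather()/scatter() overloads forward to gatherImplementation() and
// scatterImplementation(), while the Vc_ASSERT_GATHER_PARAMETER_TYPES_ and
// Vc_ASSERT_SCATTER_PARAMETER_TYPES_ macros static_assert that the memory type
// is convertible to/from EntryType and that the index argument is
// subscriptable and provides at least Size entries.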
#endif private: template inline void gatherImplementation(const Common::GatherArguments &); template inline void gatherImplementation(const Common::GatherArguments &, MaskArgument mask); public: #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that can be converted to the " \ "EntryType of this SIMD vector type."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_ private: template inline void scatterImplementation(MT *mem, IT &&indexes) const; template inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const; public: #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that the EntryType of this " \ "SIMD vector type can be converted to."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, 
the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes)); } template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes), mask); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args) const { scatter(args.address, args.indexes); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args, MaskArgument mask) const { scatter(args.address, args.indexes, mask); } #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_ #if defined Vc_IMPL_AVX2 && !defined Vc_MSVC template ::size(), class = enable_if<(Vector::size() >= size() && sizeof(T) >= 4)>> Vc_INTRINSIC void gatherImplementation( const Common::GatherArguments, Scale> &args) { d.v() = AVX::gather( args.address, simd_cast>(args.indexes) .data()); } template ::size(), class = enable_if<(Vector::size() >= size() && sizeof(T) >= 4)>> Vc_INTRINSIC void gatherImplementation( const Common::GatherArguments, Scale> &args, MaskArgument k) { d.v() = AVX::gather( d.v(), k.data(), args.address, simd_cast>(args.indexes) .data()); } template < class MT, class U, class A, int Scale, class = enable_if<(sizeof(T) == 2 && std::is_integral::value && (sizeof(MT) <= 2) && Vector::size() >= size())>> Vc_INTRINSIC void gatherImplementation( const Common::GatherArguments, Scale> &args) { using AVX2::int_v; const auto idx0 = simd_cast(args.indexes).data(); const auto idx1 = simd_cast(args.indexes).data(); *this = simd_cast(int_v(AVX::gather( aliasing_cast(args.address), idx0)), int_v(AVX::gather( aliasing_cast(args.address), idx1))); if (sizeof(MT) == 1) { if (std::is_signed::value) { using Signed = AVX2::Vector::type>; *this = (simd_cast(*this) << 8) >> 8; } else { *this &= 0xff; } } } template < class MT, class U, class A, int Scale, class = enable_if<(sizeof(T) == 2 && std::is_integral::value && (sizeof(MT) <= 2) && Vector::size() >= size())>> Vc_INTRINSIC void gatherImplementation( const Common::GatherArguments, Scale> &args, MaskArgument k) { using AVX2::int_v; const auto idx0 = simd_cast(args.indexes).data(); const auto idx1 = simd_cast(args.indexes).data(); const auto k0 = simd_cast(k).data(); const auto k1 = simd_cast(k).data(); auto v = simd_cast( int_v(AVX::gather( _mm256_setzero_si256(), k0, aliasing_cast(args.address), idx0)), int_v(AVX::gather( _mm256_setzero_si256(), k1, aliasing_cast(args.address), idx1))); if (sizeof(MT) == 1) { if (std::is_signed::value) { using Signed = AVX2::Vector::type>; v = (simd_cast(v) << 8) >> 8; } else { v &= 0xff; } } assign(v, k); } template Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) && Traits::is_valid_vector_argument::value && !std::is_same::value && Vector::size() >= size()), void> gatherImplementation(const Common::GatherArguments, Scale> &args) { *this = simd_cast(fixed_size_simd(args)); } template Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) && Traits::is_valid_vector_argument::value && !std::is_same::value && Vector::size() >= size()), void> gatherImplementation(const Common::GatherArguments, Scale> &args, MaskArgument k) { assign(simd_cast(fixed_size_simd(args, k)), k); } #endif Vc_ALWAYS_INLINE Vector &operator++() { data() = Detail::add(data(), Detail::one(T()), T()); return *this; } Vc_ALWAYS_INLINE Vector &operator--() { data() = Detail::sub(data(), Detail::one(T()), 
T()); return *this; } Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = Detail::add(data(), Detail::one(T()), T()); return r; } Vc_ALWAYS_INLINE Vector operator--(int) { const Vector r = *this; data() = Detail::sub(data(), Detail::one(T()), T()); return r; } private: friend reference; Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept { return o.d.m(i); } template Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept( noexcept(std::declval() = v)) { return o.d.set(i, v); } public: Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept { static_assert(noexcept(reference{std::declval(), int()}), ""); return {*this, int(index)}; } Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept { return d.m(index); } Vc_INTRINSIC_L Vc_PURE_L Vector operator[](Permutation::ReversedTag) const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC_L Vc_PURE_L Vector operator[](const IndexType &perm) const Vc_INTRINSIC_R Vc_PURE_R; Vc_INTRINSIC Vc_PURE Mask operator!() const { return *this == Zero(); } Vc_ALWAYS_INLINE Vector operator~() const { #ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS static_assert(std::is_integral::value, "bit-complement can only be used with Vectors of integral type"); #endif return Detail::andnot_(data(), Detail::allone()); } Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R; Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; } #define Vc_OP_VEC(op) \ Vc_INTRINSIC Vector &operator op##=(AsArg x); \ Vc_INTRINSIC Vc_PURE Vector operator op(AsArg x) const \ { \ static_assert( \ std::is_integral::value, \ "bitwise-operators can only be used with Vectors of integral type"); \ } Vc_ALL_SHIFTS(Vc_OP_VEC); #undef Vc_OP_VEC Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R; Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask isNegative() const { return Vc::isnegative(*this); } Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) { data() = Detail::blend(data(), v.data(), mask.data()); } template Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2 staticCast() const { return V2(*this); } template Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2 reinterpretCast() const { return AVX::avx_cast(data()); } Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k) { return {*this, k}; } Vc_ALWAYS_INLINE VectorType &data() { return d.v(); } Vc_ALWAYS_INLINE const VectorType &data() const { return d.v(); } template Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R; Vc_INTRINSIC_L std::pair minIndex() const Vc_INTRINSIC_R; Vc_INTRINSIC_L std::pair maxIndex() const Vc_INTRINSIC_R; Vc_ALWAYS_INLINE EntryType min() const { return Detail::min(data(), T()); } Vc_ALWAYS_INLINE EntryType max() const { return Detail::max(data(), T()); } Vc_ALWAYS_INLINE EntryType product() const { return Detail::mul(data(), T()); } Vc_ALWAYS_INLINE EntryType sum() const { return Detail::add(data(), T()); } Vc_ALWAYS_INLINE_L Vector partialSum() const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType min(MaskArgument m) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType max(MaskArgument m) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType product(MaskArgument m) const Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L EntryType 
sum(MaskArgument m) const Vc_ALWAYS_INLINE_R; Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R; Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R; template void callWithValuesSorted(F &&f) { EntryType value = d.m(0); f(value); for (size_t i = 1; i < Size; ++i) { if (d.m(i) != value) { value = d.m(i); f(value); } } } template Vc_INTRINSIC void call(F &&f) const { Common::for_all_vector_entries([&](size_t i) { f(EntryType(d.m(i))); }); } template Vc_INTRINSIC void call(F &&f, const Mask &mask) const { for (size_t i : where(mask)) { f(EntryType(d.m(i))); } } template Vc_INTRINSIC Vector apply(F &&f) const { Vector r; Common::for_all_vector_entries( [&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); }); return r; } template Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const { Vector r(*this); for (size_t i : where(mask)) { r.d.set(i, f(EntryType(r.d.m(i)))); } return r; } template Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) { Common::for_all_vector_entries([&](size_t i) { d.set(i, f(i)); }); } Vc_INTRINSIC void fill(EntryType (&f)()) { Common::for_all_vector_entries([&](size_t i) { d.set(i, f()); }); } template static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R; Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector copySign(AsArg x) const { return Vc::copysign(*this, x); } Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const { Vc::exponent(*this); } Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R; Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R; }; #undef Vc_CURRENT_CLASS_NAME template constexpr size_t Vector::Size; template constexpr size_t Vector::MemoryAlignment; #define Vc_CONDITIONAL_ASSIGN(name_,op_) \ template \ Vc_INTRINSIC enable_if conditional_assign( \ AVX2::Vector &lhs, M &&mask, U &&rhs) \ { \ lhs(mask) op_ rhs; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN( Assign, =); Vc_CONDITIONAL_ASSIGN( PlusAssign, +=); Vc_CONDITIONAL_ASSIGN( MinusAssign, -=); Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=); Vc_CONDITIONAL_ASSIGN( DivideAssign, /=); Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=); Vc_CONDITIONAL_ASSIGN( XorAssign, ^=); Vc_CONDITIONAL_ASSIGN( AndAssign, &=); Vc_CONDITIONAL_ASSIGN( OrAssign, |=); Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=); Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=); #undef Vc_CONDITIONAL_ASSIGN #define Vc_CONDITIONAL_ASSIGN(name_,expr_) \ template \ Vc_INTRINSIC enable_if> conditional_assign( \ AVX2::Vector &lhs, M &&mask) \ { \ return expr_; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++); Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask)); Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--); Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask)); #undef Vc_CONDITIONAL_ASSIGN } #ifndef VC_AVX_LIMITS_H_ #define VC_AVX_LIMITS_H_ namespace std { #define Vc_NUM_LIM(T,_max,_min) \ template <> struct numeric_limits> : public numeric_limits { \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector max() Vc_NOEXCEPT \ { \ return _max; \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector min() Vc_NOEXCEPT \ { \ return _min; \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector lowest() Vc_NOEXCEPT \ { \ return min(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector epsilon() Vc_NOEXCEPT \ { \ 
return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector round_error() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector infinity() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector quiet_NaN() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector signaling_NaN() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector denorm_min() Vc_NOEXCEPT \ { \ return Vc::AVX2::Vector::Zero(); \ } \ } #ifdef Vc_IMPL_AVX2 Vc_NUM_LIM(unsigned short, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>()); Vc_NUM_LIM( short, _mm256_srli_epi16(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi16()); Vc_NUM_LIM( unsigned int, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>()); Vc_NUM_LIM( int, _mm256_srli_epi32(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi32()); #endif #undef Vc_NUM_LIM } #endif #ifndef VC_AVX_CONST_H_ #define VC_AVX_CONST_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace AVX { template struct IndexesFromZeroData; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast(&_IndexesFromZero32[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast(&_IndexesFromZero16[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast(&_IndexesFromZero8[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast(&_IndexesFromZero8[0]); } }; template<> struct IndexesFromZeroData { static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; } }; template struct Const { typedef Vector<_T> V; typedef typename V::EntryType T; typedef typename V::Mask M; static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return V(c_trig::data[0]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return V(c_trig::data[1]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return V(c_trig::data[2]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return V(c_trig::data[3]); } static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return V(c_trig::data[4]); } static Vc_ALWAYS_INLINE Vc_CONST V _16() { return V(c_trig::data[5]); } static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return V(c_trig::data[(12 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return V(c_trig::data[(17 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return V(c_trig::data[22]); } static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return V(c_trig::data[23]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return V(c_trig::data[24]); } static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig::data[8]); } static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return V(c_trig::data[9]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return V(c_trig::data[10]); } static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return V(c_trig::data[11]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) 
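// These Const<T> accessors broadcast entries of the c_trig/c_log constant
// tables (polynomial coefficients and thresholds used by the asin, atan and
// log implementations) into a full vector.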
{ return V(c_trig::data[(28 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig::data[(33 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig::data[(37 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig::data[(43 + i)]); } static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig::data[25]); } static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig::data[26]); } static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log::d(1)).data()); } static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return V(c_log::d(18)); } static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return V(c_log::d(15)); } static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return V(c_log::d(2 + i)); } static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return V(c_log::d(8 + i)); } static Vc_ALWAYS_INLINE Vc_CONST V min() { return V(c_log::d(14)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return V(c_log::d(17)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return V(c_log::d(16)); } static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return V(c_log::d(13)); } static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return V(c_log::d(19)); } static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return V(c_log::d(20)); } static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R; static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R; }; template <> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_ss( reinterpret_cast(&c_general::highMaskFloat)); } template <> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return _mm256_broadcast_sd( reinterpret_cast(&c_general::highMaskDouble)); } template <> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { #ifdef Vc_IMPL_AVX2 #if defined Vc_ICC || defined Vc_MSVC __m256i allone = _mm256_set1_epi64x(~0); #else auto allone = ~__m256i(); #endif return _mm256_castsi256_ps(_mm256_slli_epi32(allone, bits)); #else __m128 tmp = _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits)); return concat(tmp, tmp); #endif } template <> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { #ifdef Vc_IMPL_AVX2 #if defined Vc_ICC || defined Vc_MSVC __m256i allone = _mm256_set1_epi64x(~0); #else auto allone = ~__m256i(); #endif return _mm256_castsi256_pd(_mm256_slli_epi64(allone, bits)); #else __m128d tmp = _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits)); return concat(tmp, tmp); #endif } } namespace AVX2 { using AVX::IndexesFromZeroData; using AVX::Const; } } #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m 
operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); } Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); } Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); } #ifdef Vc_IMPL_AVX2 Vc_INTRINSIC AVX2:: int_m operator==(AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: uint_m operator==(AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); } Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); } Vc_INTRINSIC AVX2:: int_m operator!=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: uint_m operator!=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); } Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); } Vc_INTRINSIC AVX2:: int_m operator>=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: uint_m operator>=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); } Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); } Vc_INTRINSIC AVX2:: int_m operator<=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: uint_m operator<=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); } Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); } Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); } Vc_INTRINSIC AVX2:: int_m operator> (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: uint_m operator> (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); } Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); } Vc_INTRINSIC AVX2:: int_m operator< (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: uint_m operator< (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); } Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); } Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return 
AVX::cmplt_epu16(a.data(), b.data()); } #endif template Vc_INTRINSIC AVX2::Vector operator^(AVX2::Vector a, AVX2::Vector b) { return xor_(a.data(), b.data()); } template Vc_INTRINSIC AVX2::Vector operator&(AVX2::Vector a, AVX2::Vector b) { return and_(a.data(), b.data()); } template Vc_INTRINSIC AVX2::Vector operator|(AVX2::Vector a, AVX2::Vector b) { return or_(a.data(), b.data()); } template Vc_INTRINSIC AVX2::Vector operator+(AVX2::Vector a, AVX2::Vector b) { return add(a.data(), b.data(), T()); } template Vc_INTRINSIC AVX2::Vector operator-(AVX2::Vector a, AVX2::Vector b) { return sub(a.data(), b.data(), T()); } template Vc_INTRINSIC AVX2::Vector operator*(AVX2::Vector a, AVX2::Vector b) { return mul(a.data(), b.data(), T()); } template Vc_INTRINSIC AVX2::Vector operator/(AVX2::Vector a, AVX2::Vector b) { return div(a.data(), b.data(), T()); } Vc_INTRINSIC AVX2::Vector operator/(AVX2::Vector a, AVX2::Vector b) { using namespace AVX; const __m256 lo = _mm256_div_ps(convert(lo128(a.data())), convert(lo128(b.data()))); const __m256 hi = _mm256_div_ps(convert(hi128(a.data())), convert(hi128(b.data()))); const float_v threshold = 32767.f; using Detail::operator>; const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty())) ? convert(lo) : convert(lo); const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty())) ? convert(hi) : convert(hi); return concat(loShort, hiShort); } template Vc_INTRINSIC enable_if::value, AVX2::Vector> operator%( AVX2::Vector a, AVX2::Vector b) { return a - a / b * b; } } template <> template Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3); } template <> template Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); } #ifdef Vc_IMPL_AVX2 template <> template Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); } template <> template Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); } template <> template Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); const auto tmp8 = gen(8); const auto tmp9 = gen(9); const auto tmp10 = gen(10); const auto tmp11 = gen(11); const auto tmp12 = gen(12); const auto tmp13 = gen(13); const auto tmp14 = gen(14); const auto tmp15 = gen(15); return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); } template <> 
template Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen) { const auto tmp0 = gen(0); const auto tmp1 = gen(1); const auto tmp2 = gen(2); const auto tmp3 = gen(3); const auto tmp4 = gen(4); const auto tmp5 = gen(5); const auto tmp6 = gen(6); const auto tmp7 = gen(7); const auto tmp8 = gen(8); const auto tmp9 = gen(9); const auto tmp10 = gen(10); const auto tmp11 = gen(11); const auto tmp12 = gen(12); const auto tmp13 = gen(13); const auto tmp14 = gen(14); const auto tmp15 = gen(15); return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15); } #endif template Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerZero) : d{} {} template <> Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {} template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {} #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC Vector< int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {} template <> Vc_INTRINSIC Vector< uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {} template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {} template <> Vc_INTRINSIC Vector::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {} template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {} template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {} #endif template Vc_ALWAYS_INLINE Vector::Vector( VectorSpecialInitializerIndexesFromZero) : Vector(AVX::IndexesFromZeroData::address(), Vc::Aligned) { } template <> Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerIndexesFromZero) : Vector(AVX::IndexesFromZeroData::address(), Vc::Aligned) { } template <> Vc_ALWAYS_INLINE Vector::Vector(VectorSpecialInitializerIndexesFromZero) : Vector(AVX::IndexesFromZeroData::address(), Vc::Aligned) { } template template Vc_INTRINSIC typename Vector:: #ifndef Vc_MSVC template #endif load_concept::type Vector::load(const SrcT *mem, Flags flags) { Common::handleLoadPrefetches(mem, flags); d.v() = Detail::load(mem, flags); } template Vc_INTRINSIC void Vector::setZero() { data() = Detail::zero(); } template Vc_INTRINSIC void Vector::setZero(const Mask &k) { data() = Detail::andnot_(k.data(), data()); } template Vc_INTRINSIC void Vector::setZeroInverted(const Mask &k) { data() = Detail::and_(k.data(), data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = Detail::allone(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArgument k) { data() = _mm256_or_pd(data(), k.dataD()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = Detail::allone(); } template<> Vc_INTRINSIC void Vector::setQnan(MaskArgument k) { data() = _mm256_or_ps(data(), k.dataF()); } template template Vc_INTRINSIC void Vector::store(U *mem, Flags flags) const { Common::handleStorePrefetches(mem, flags); HV::template store(mem, data()); } template template Vc_INTRINSIC void Vector::store(U *mem, Mask mask, Flags flags) const { Common::handleStorePrefetches(mem, flags); HV::template store(mem, data(), mask.data()); } #ifdef Vc_IMPL_AVX2 template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, 
VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); } template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); } template <> Vc_ALWAYS_INLINE AVX2::Vector Vector::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); } template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); } template <> Vc_ALWAYS_INLINE AVX2::Vector Vector::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); } template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator<<=(AsArg x) { static_assert(std::is_integral::value, "bitwise-operators can only be used with Vectors of integral type"); return *this = *this << x; } template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator>>=(AsArg x) { static_assert(std::is_integral::value, "bitwise-operators can only be used with Vectors of integral type"); return *this = *this >> x; } #endif template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator>>=(int shift) { d.v() = Detail::shiftRight(d.v(), shift, T()); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator>>(int shift) const { return Detail::shiftRight(d.v(), shift, T()); } template Vc_ALWAYS_INLINE AVX2::Vector &Vector::operator<<=(int shift) { d.v() = Detail::shiftLeft(d.v(), shift, T()); return *static_cast *>(this); } template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator<<(int shift) const { return Detail::shiftLeft(d.v(), shift, T()); } Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x) { return AVX::avx_cast<__m256>(AVX::srai_epi32<31>( AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data())))); } Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x) { return Mem::permute(AVX::avx_cast<__m256>(AVX::srai_epi32<31>( AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data()))))); } #define Vc_GATHER_IMPL(V_) \ template <> \ template \ inline void AVX2::V_::gatherImplementation( \ const Common::GatherArguments &args) #define Vc_M(i_) static_cast(args.address[Scale * args.indexes[i_]]) Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); } Vc_GATHER_IMPL(float_v) { d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7)); } #ifdef Vc_IMPL_AVX2 Vc_GATHER_IMPL(int_v) { d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7)); } Vc_GATHER_IMPL(uint_v) { d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7)); } Vc_GATHER_IMPL(short_v) { d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11), Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15)); } Vc_GATHER_IMPL(ushort_v) { d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11), Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15)); } #endif #undef 
Vc_M #undef Vc_GATHER_IMPL template template inline void Vector::gatherImplementation( const Common::GatherArguments &args, MaskArgument mask) { const auto *mem = args.address; const auto indexes = Scale * args.indexes; using Selector = std::integral_constant < Common::GatherScatterImplementation, #ifdef Vc_USE_SET_GATHERS Traits::is_simd_vector::value ? Common::GatherScatterImplementation::SetIndexZero : #endif #ifdef Vc_USE_BSF_GATHERS Common::GatherScatterImplementation::BitScanLoop #elif defined Vc_USE_POPCNT_BSF_GATHERS Common::GatherScatterImplementation::PopcntSwitch #else Common::GatherScatterImplementation::SimpleLoop #endif > ; Common::executeGather(Selector(), *this, mem, indexes, mask); } template template inline void Vector::scatterImplementation(MT *mem, IT &&indexes) const { Common::unrolled_loop([&](std::size_t i) { mem[indexes[i]] = d.m(i); }); } template template inline void Vector::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const { using Selector = std::integral_constant < Common::GatherScatterImplementation, #ifdef Vc_USE_SET_GATHERS Traits::is_simd_vector::value ? Common::GatherScatterImplementation::SetIndexZero : #endif #ifdef Vc_USE_BSF_GATHERS Common::GatherScatterImplementation::BitScanLoop #elif defined Vc_USE_POPCNT_BSF_GATHERS Common::GatherScatterImplementation::PopcntSwitch #else Common::GatherScatterImplementation::SimpleLoop #endif > ; Common::executeScatter(Selector(), *this, mem, std::forward(indexes), mask); } #ifdef Vc_USE_BUILTIN_VECTOR_TYPES template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator-() const { return VectorType(-d.builtin()); } #else template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector Vector::operator-() const { return Detail::negate(d.v(), std::integral_constant()); } #endif template Vc_INTRINSIC std::pair, int> Vector::minIndex() const { AVX2::Vector x = min(); return std::make_pair(x, (*this == x).firstOne()); } template Vc_INTRINSIC std::pair, int> Vector::maxIndex() const { AVX2::Vector x = max(); return std::make_pair(x, (*this == x).firstOne()); } template <> Vc_INTRINSIC std::pair AVX2::float_v::minIndex() const { __m256 x = d.v(); __m256 idx = Vector::IndexesFromZero().data(); __m256 y = Mem::permute128(x); __m256 idy = Mem::permute128(idx); __m256 less = AVX::cmplt_ps(x, y); x = _mm256_blendv_ps(y, x, less); idx = _mm256_blendv_ps(idy, idx, less); y = Reg::permute(x); idy = Reg::permute(idx); less = AVX::cmplt_ps(x, y); x = _mm256_blendv_ps(y, x, less); idx = _mm256_blendv_ps(idy, idx, less); y = Reg::permute(x); idy = Reg::permute(idx); less = AVX::cmplt_ps(x, y); idx = _mm256_blendv_ps(idy, idx, less); const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx)); #ifdef Vc_GNU_ASM __asm__ __volatile__(""); #endif x = _mm256_blendv_ps(y, x, less); return std::make_pair(x, index); } template Vc_ALWAYS_INLINE AVX2::Vector Vector::partialSum() const { AVX2::Vector tmp = *this; if (Size > 1) tmp += tmp.shifted(-1); if (Size > 2) tmp += tmp.shifted(-2); if (Size > 4) tmp += tmp.shifted(-4); if (Size > 8) tmp += tmp.shifted(-8); if (Size > 16) tmp += tmp.shifted(-16); return tmp; } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::min(MaskArgument m) const { AVX2::Vector tmp = std::numeric_limits >::max(); tmp(m) = *this; return tmp.min(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::max(MaskArgument m) const { AVX2::Vector tmp = std::numeric_limits >::min(); tmp(m) = *this; return tmp.max(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::product(MaskArgument 
m) const { AVX2::Vector tmp(Vc::One); tmp(m) = *this; return tmp.product(); } template Vc_ALWAYS_INLINE typename Vector::EntryType Vector::sum(MaskArgument m) const { AVX2::Vector tmp(Vc::Zero); tmp(m) = *this; return tmp.sum(); } namespace Detail { Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v) { using namespace AVX; __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23); __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23); tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f)); tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f)); return _mm256_cvtepi32_ps(concat(tmp0, tmp1)); } Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v) { using namespace AVX; __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52); __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52); tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff)); tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff)); return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1)))); } } Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x) { using Detail::operator>=; Vc_ASSERT((x >= x.Zero()).isFull()); return Detail::exponent(x.data()); } Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x) { using Detail::operator>=; Vc_ASSERT((x >= x.Zero()).isFull()); return Detail::exponent(x.data()); } static Vc_ALWAYS_INLINE __m256i _doRandomStep() { using Detail::operator*; using Detail::operator+; #ifdef Vc_IMPL_AVX2 using AVX2::uint_v; uint_v state0(&Common::RandomState[0]); uint_v state1(&Common::RandomState[uint_v::Size]); (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]); uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(), _mm256_srli_epi32(state1.data(), 16))) .store(&Common::RandomState[0]); return state0.data(); #else using SSE::uint_v; uint_v state0(&Common::RandomState[0]); uint_v state1(&Common::RandomState[uint_v::Size]); uint_v state2(&Common::RandomState[2 * uint_v::Size]); uint_v state3(&Common::RandomState[3 * uint_v::Size]); (state2 * uint_v(0xdeece66du) + uint_v(11)) .store(&Common::RandomState[2 * uint_v::Size]); (state3 * uint_v(0xdeece66du) + uint_v(11)) .store(&Common::RandomState[3 * uint_v::Size]); uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(), _mm_srli_epi32(state2.data(), 16))) .store(&Common::RandomState[0]); uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(), _mm_srli_epi32(state3.data(), 16))) .store(&Common::RandomState[uint_v::Size]); return AVX::concat(state0.data(), state1.data()); #endif } #ifdef Vc_IMPL_AVX2 template Vc_ALWAYS_INLINE AVX2::Vector Vector::Random() { return {_doRandomStep()}; } #endif template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random() { return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random() { const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned, Detail::LoadTag<__m256i, int>()); for (size_t k = 0; k < 8; k += 2) { typedef unsigned long long uint64 Vc_MAY_ALIAS; const uint64 stateX = *aliasing_cast(&Common::RandomState[k]); *aliasing_cast(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11); } return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one()); } template Vc_INTRINSIC AVX2::Vector Vector::shifted(int amount) const { return Detail::shifted(d.v(), amount); } template Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, 
Common::WidthT<__m128>) { return Mem::shuffle(left, right); } template Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>) { return Mem::shuffle128(left, right); } template Vc_INTRINSIC AVX2::Vector Vector::shifted(int amount, Vector shiftIn) const { #ifdef __GNUC__ if (__builtin_constant_p(amount)) { const __m256i a = AVX::avx_cast<__m256i>(d.v()); const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v()); if (amount * 2 == int(Size)) { return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT()); } if (amount * 2 == -int(Size)) { return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT()); } switch (amount) { case 1: return AVX::avx_cast( #ifdef Vc_IMPL_AVX2 _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a, sizeof(EntryType)) #else AVX::concat( _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)), _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType))) #endif ); case 2: return AVX::avx_cast( #ifdef Vc_IMPL_AVX2 _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a, 2 * sizeof(EntryType)) #else AVX::concat( _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)), _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType))) #endif ); case 3: if (6u < Size) { return AVX::avx_cast( #ifdef Vc_IMPL_AVX2 _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a, 3 * sizeof(EntryType)) #else AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 3 * sizeof(EntryType)), _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 3 * sizeof(EntryType))) #endif ); } } } #endif using Detail::operator|; return shifted(amount) | (amount > 0 ? shiftIn.shifted(amount - Size) : shiftIn.shifted(Size + amount)); } template Vc_INTRINSIC AVX2::Vector Vector::rotated(int amount) const { return Detail::rotated(d.v(), amount); } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::sorted() const { return Detail::sorted(*this); } template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const { return Mem::shuffle128(_mm256_unpacklo_pd(data(), x.data()), _mm256_unpackhi_pd(data(), x.data())); } template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const { return Mem::shuffle128(_mm256_unpacklo_pd(data(), x.data()), _mm256_unpackhi_pd(data(), x.data())); } template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const { return Mem::shuffle128(_mm256_unpacklo_ps(data(), x.data()), _mm256_unpackhi_ps(data(), x.data())); } template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const { return Mem::shuffle128(_mm256_unpacklo_ps(data(), x.data()), _mm256_unpackhi_ps(data(), x.data())); } #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), _mm256_unpackhi_epi32(data(), x.data())); } template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), _mm256_unpackhi_epi32(data(), x.data())); } template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), _mm256_unpackhi_epi32(data(), x.data())); } template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi32(data(), x.data()), _mm256_unpackhi_epi32(data(), x.data())); } template <> 
Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), _mm256_unpackhi_epi16(data(), x.data())); } template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), _mm256_unpackhi_epi16(data(), x.data())); } template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), _mm256_unpackhi_epi16(data(), x.data())); } template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const { return Mem::shuffle128(_mm256_unpacklo_epi16(data(), x.data()), _mm256_unpackhi_epi16(data(), x.data())); } #endif template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const { return Mem::permute128(Mem::permute(d.v())); } template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const { return Mem::permute128(Mem::permute(d.v())); } #ifdef Vc_IMPL_AVX2 template <> Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const { return Mem::permute128(Mem::permute(d.v())); } template <> Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const { return Mem::permute128(Mem::permute(d.v())); } template <> Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[]( Permutation::ReversedTag) const { return Mem::permute128(AVX::avx_cast<__m256i>(Mem::shuffle( AVX::avx_cast<__m256d>(Mem::permuteHi(d.v())), AVX::avx_cast<__m256d>(Mem::permuteLo(d.v()))))); } template <> Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[]( Permutation::ReversedTag) const { return Mem::permute128(AVX::avx_cast<__m256i>(Mem::shuffle( AVX::avx_cast<__m256d>(Mem::permuteHi(d.v())), AVX::avx_cast<__m256d>(Mem::permuteLo(d.v()))))); } #endif template <> Vc_INTRINSIC AVX2::float_v Vector::operator[](const IndexType & ) const { return *this; #ifdef Vc_IMPL_AVX2 #else #endif } template Vc_INTRINSIC Vc_PURE Vector Vector::reversed() const { return (*this)[Permutation::Reversed]; } template <> template Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const { constexpr VecPos Inner = static_cast(Index & 0x3); constexpr VecPos Outer = static_cast((Index & 0x4) / 4); return Mem::permute(Mem::permute128(d.v())); } template <> template Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const { constexpr VecPos Inner = static_cast(Index & 0x1); constexpr VecPos Outer = static_cast((Index & 0x2) / 2); return Mem::permute(Mem::permute128(d.v())); } } #ifndef VC_AVX_SIMD_CAST_H_ #define VC_AVX_SIMD_CAST_H_ #ifndef VC_AVX_VECTOR_H_ #error "Vc/avx/vector.h needs to be included before Vc/avx/simd_cast.h" #endif namespace Vc_VERSIONED_NAMESPACE { #define Vc_SIMD_CAST_AVX_1(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ AVX2::from_ x, enable_if::value> = nullarg) #define Vc_SIMD_CAST_AVX_2(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ AVX2::from_ x0, AVX2::from_ x1, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_AVX_3(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_AVX_4(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, AVX2::from_ x3, \ enable_if::value> = nullarg) #define 
Vc_SIMD_CAST_1(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x, enable_if::value> = nullarg) #define Vc_SIMD_CAST_2(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, enable_if::value> = nullarg) #define Vc_SIMD_CAST_3(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, enable_if::value> = nullarg) #define Vc_SIMD_CAST_4(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_5(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_6(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_7(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_8(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7, \ enable_if::value> = nullarg) #define Vc_SIMD_CAST_OFFSET(from_,to_,offset_) \ static_assert(from_::size() >= to_::size() * (offset_ + 1), \ "this offset cannot exist for this type combination"); \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x, \ enable_if<(offset == offset_ && std::is_same::value)> = nullarg) template Vc_INTRINSIC Vc_CONST To simd_cast(From x, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); template Vc_INTRINSIC Vc_CONST To simd_cast( From x0, From x1, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); template Vc_INTRINSIC Vc_CONST To simd_cast( From x0, From x1, From x2, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); template Vc_INTRINSIC Vc_CONST To simd_cast( From x0, From x1, From x2, From x3, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); template Vc_INTRINSIC Vc_CONST To simd_cast( From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)> = nullarg); Vc_SIMD_CAST_AVX_1( float_v, double_v); Vc_SIMD_CAST_AVX_1(double_v, float_v); Vc_SIMD_CAST_AVX_2(double_v, float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1( int_v, double_v); Vc_SIMD_CAST_AVX_1( uint_v, double_v); Vc_SIMD_CAST_AVX_1( short_v, double_v); Vc_SIMD_CAST_AVX_1(ushort_v, double_v); Vc_SIMD_CAST_AVX_1( int_v, float_v); Vc_SIMD_CAST_AVX_1( uint_v, float_v); Vc_SIMD_CAST_AVX_1( short_v, float_v); Vc_SIMD_CAST_AVX_1(ushort_v, float_v); Vc_SIMD_CAST_AVX_1(double_v, int_v); Vc_SIMD_CAST_AVX_1( float_v, int_v); Vc_SIMD_CAST_AVX_1( uint_v, int_v); Vc_SIMD_CAST_AVX_1( short_v, int_v); Vc_SIMD_CAST_AVX_1(ushort_v, int_v); Vc_SIMD_CAST_AVX_2(double_v, int_v); Vc_SIMD_CAST_AVX_1(double_v, uint_v); Vc_SIMD_CAST_AVX_1( float_v, uint_v); Vc_SIMD_CAST_AVX_1( int_v, uint_v); Vc_SIMD_CAST_AVX_1( short_v, uint_v); Vc_SIMD_CAST_AVX_1(ushort_v, uint_v); Vc_SIMD_CAST_AVX_2(double_v, uint_v); Vc_SIMD_CAST_AVX_1(double_v, short_v); Vc_SIMD_CAST_AVX_1( float_v, short_v); Vc_SIMD_CAST_AVX_1( int_v, short_v); Vc_SIMD_CAST_AVX_1( uint_v, short_v); 
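// The Vc_SIMD_CAST_AVX_n / Vc_SIMD_CAST_n macros above only *declare* n-argument
// simd_cast overloads between concrete vector types; the definitions follow further
// below, after the macros are redefined without the defaulted enable_if argument.
// Minimal usage sketch (illustrative only, assuming an AVX-enabled translation unit):
//
//     Vc::AVX2::float_v f = 1.5f;                        // 8 single-precision lanes
//     auto d  = Vc::simd_cast<Vc::AVX2::double_v>(f);    // converts the low 4 lanes
//     auto f2 = Vc::simd_cast<Vc::AVX2::float_v>(d, d);  // 2 x 4 doubles -> 8 floats
//
// A one-argument cast to a wider element type consumes only as many input lanes as fit
// into the destination; the multi-argument overloads concatenate the converted pieces.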
Vc_SIMD_CAST_AVX_1(ushort_v, short_v); Vc_SIMD_CAST_AVX_2(double_v, short_v); Vc_SIMD_CAST_AVX_2( float_v, short_v); Vc_SIMD_CAST_AVX_2( int_v, short_v); Vc_SIMD_CAST_AVX_2( uint_v, short_v); Vc_SIMD_CAST_AVX_3(double_v, short_v); Vc_SIMD_CAST_AVX_4(double_v, short_v); Vc_SIMD_CAST_AVX_1(double_v, ushort_v); Vc_SIMD_CAST_AVX_1( float_v, ushort_v); Vc_SIMD_CAST_AVX_1( int_v, ushort_v); Vc_SIMD_CAST_AVX_1( uint_v, ushort_v); Vc_SIMD_CAST_AVX_1( short_v, ushort_v); Vc_SIMD_CAST_AVX_2(double_v, ushort_v); Vc_SIMD_CAST_AVX_2( float_v, ushort_v); Vc_SIMD_CAST_AVX_2( int_v, ushort_v); Vc_SIMD_CAST_AVX_2( uint_v, ushort_v); Vc_SIMD_CAST_AVX_3(double_v, ushort_v); Vc_SIMD_CAST_AVX_4(double_v, ushort_v); #endif Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v); Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v); Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v); Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v); #endif Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v); Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v); Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v); Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v); Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v); Vc_SIMD_CAST_2(SSE::ushort_v, 
AVX2::ushort_v); #endif Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v); Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v); Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v); Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v); Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v); Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v); Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v); Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v); Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v); #endif Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v); Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v); Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v); Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v); Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v); Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v); Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v); Vc_SIMD_CAST_4(SSE:: int_v, AVX2::ushort_v); Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v); #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v); #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v); #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v); #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v); Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v); #endif Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v); Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v); #endif Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v); Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); #ifdef 
Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value> = nullarg); #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value> = nullarg); #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value> = nullarg); #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value> = nullarg); #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector 
x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value> = nullarg); #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value> = nullarg); #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value> = nullarg); #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value> = nullarg); #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST 
Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, enable_if::value> = nullarg); #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, enable_if::value> = nullarg); #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, enable_if::value> = nullarg); #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, enable_if::value> = nullarg); #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, enable_if::value> = nullarg); #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, 
Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, enable_if::value> = nullarg); #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, enable_if::value> = nullarg); #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, Scalar::Vector x15, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, Scalar::Vector x15, enable_if::value> = nullarg); #endif template Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Vector x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if::value> = nullarg); Vc_SIMD_CAST_AVX_2(double_m, float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_m, int_m); Vc_SIMD_CAST_AVX_2(double_m, uint_m); Vc_SIMD_CAST_AVX_2(double_m, short_m); Vc_SIMD_CAST_AVX_2(double_m, ushort_m); Vc_SIMD_CAST_AVX_2( float_m, short_m); Vc_SIMD_CAST_AVX_2( float_m, ushort_m); Vc_SIMD_CAST_AVX_2( int_m, short_m); Vc_SIMD_CAST_AVX_2( int_m, ushort_m); Vc_SIMD_CAST_AVX_2( uint_m, short_m); Vc_SIMD_CAST_AVX_2( uint_m, ushort_m); #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_4(double_m, short_m); Vc_SIMD_CAST_AVX_4(double_m, ushort_m); #endif Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m); #endif Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m); Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m); 
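// simd_cast is overloaded for Mask types following the same pattern as for Vector
// types. Sketch of widening an SSE mask into an AVX2 mask (illustrative names only,
// assuming an AVX-enabled translation unit):
//
//     Vc::SSE::float_v sv(Vc::IndexesFromZero);          // 4 lanes: 0 1 2 3
//     Vc::SSE::float_m sm = sv > 1.f;                    // 4-entry mask
//     auto am = Vc::simd_cast<Vc::AVX2::float_m>(sm);    // 8-entry mask; sm occupies
//                                                        // the low 4 entries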
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m); Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m); Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m); #endif Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m); Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m); Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m); Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m); Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m); Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m); #endif Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m); Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m); Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m); Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m); Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m); Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m); Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m); Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m); Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m); Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m); #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast( Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, enable_if<(AVX2::is_mask::value && Return::Size >= 4)> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast( Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, enable_if<(AVX2::is_mask::value && Return::Size >= 8)> = nullarg); template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, Scalar::Mask k8, Scalar::Mask k9, Scalar::Mask k10, Scalar::Mask k11, Scalar::Mask k12, Scalar::Mask k13, Scalar::Mask k14, Scalar::Mask k15, enable_if<(AVX2::is_mask::value && Return::Size >= 16)> = nullarg); Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m); 
Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m); #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m); Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m); #endif Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m); Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m); template Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Mask x, enable_if::value> = nullarg); template Vc_INTRINSIC Vc_CONST enable_if< (offset == 0 && ((AVX2::is_vector::value && !Scalar::is_vector::value && Traits::is_simd_vector::value && !Traits::isSimdArray::value) || (AVX2::is_mask::value && !Scalar::is_mask::value && Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value))), Return> simd_cast(const From &x); template Vc_INTRINSIC Vc_CONST Return simd_cast( const From &x, enable_if::value && AVX2::is_vector::value) || (SSE::is_mask::value && AVX2::is_mask::value))> = nullarg); template Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector::value && offset != 0), Return> simd_cast(AVX2::Vector x); template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && sizeof(AVX2::Vector) == 32), Return> simd_cast(AVX2::Vector x); template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && sizeof(AVX2::Vector) == 16), Return> simd_cast(AVX2::Vector x); Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1); Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1); Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1); Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1); template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && sizeof(AVX2::Mask) == 32), Return> simd_cast(AVX2::Mask x); template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && sizeof(AVX2::Mask) == 16), Return> simd_cast(AVX2::Mask x); #undef Vc_SIMD_CAST_AVX_1 #define Vc_SIMD_CAST_AVX_1(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x, \ enable_if::value>) #undef Vc_SIMD_CAST_AVX_2 #define Vc_SIMD_CAST_AVX_2(from_,to_) \ static_assert(AVX2::from_::size() * 2 <= AVX2::to_::size(), \ "this type combination is wrong"); \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, \ enable_if::value>) #undef Vc_SIMD_CAST_AVX_3 #define 
Vc_SIMD_CAST_AVX_3(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \ enable_if::value>) #undef Vc_SIMD_CAST_AVX_4 #define Vc_SIMD_CAST_AVX_4(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \ AVX2::from_ x3, \ enable_if::value>) #undef Vc_SIMD_CAST_1 #define Vc_SIMD_CAST_1(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if::value>) #undef Vc_SIMD_CAST_2 #define Vc_SIMD_CAST_2(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, \ enable_if::value>) #undef Vc_SIMD_CAST_3 #define Vc_SIMD_CAST_3(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, \ enable_if::value>) #undef Vc_SIMD_CAST_4 #define Vc_SIMD_CAST_4(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, \ enable_if::value>) #undef Vc_SIMD_CAST_5 #define Vc_SIMD_CAST_5(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ enable_if::value>) #undef Vc_SIMD_CAST_6 #define Vc_SIMD_CAST_6(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ from_ x5, \ enable_if::value>) #undef Vc_SIMD_CAST_7 #define Vc_SIMD_CAST_7(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ from_ x5, from_ x6, \ enable_if::value>) #undef Vc_SIMD_CAST_8 #define Vc_SIMD_CAST_8(from_,to_) \ template \ Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \ from_ x5, from_ x6, from_ x7, \ enable_if::value>) #undef Vc_SIMD_CAST_OFFSET #define Vc_SIMD_CAST_OFFSET(from_,to_,offset_) \ static_assert(from_::size() >= to_::size() * (offset_ + 1), \ "this offset cannot exist for this type combination"); \ template \ Vc_INTRINSIC Vc_CONST To simd_cast( \ from_ x, enable_if<(offset == offset_ && std::is_same::value)>) template Vc_INTRINSIC Vc_CONST To simd_cast(From x, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x).data(); } template Vc_INTRINSIC Vc_CONST To simd_cast(From x0, From x1, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x0, x1).data(); } template Vc_INTRINSIC Vc_CONST To simd_cast(From x0, From x1, From x2, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x0, x1, x2).data(); } template Vc_INTRINSIC Vc_CONST To simd_cast(From x0, From x1, From x2, From x3, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x0, x1, x2, x3).data(); } template Vc_INTRINSIC Vc_CONST To simd_cast(From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7, enable_if<(AVX2::is_vector::value && SSE::is_vector::value && SSE::Vector::Size == To::Size)>) { return simd_cast>(x0, x1, x2, x3, x4, x5, x6, x7) .data(); } Vc_SIMD_CAST_AVX_1( float_v, double_v) { return _mm256_cvtps_pd(AVX::lo128(x.data())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1( int_v, double_v) { return AVX::convert< int, double>(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1( uint_v, double_v) { return AVX::convert< uint, double>(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1( short_v, double_v) { return AVX::convert< short, double>(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1(ushort_v, double_v) { return 
AVX::convert(AVX::lo128(x.data())); } #endif Vc_SIMD_CAST_AVX_1(double_v, float_v) { return AVX::zeroExtend(_mm256_cvtpd_ps(x.data())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1( int_v, float_v) { return AVX::convert< int, float>(x.data()); } Vc_SIMD_CAST_AVX_1( uint_v, float_v) { return AVX::convert< uint, float>(x.data()); } Vc_SIMD_CAST_AVX_1( short_v, float_v) { return AVX::convert< short, float>(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1(ushort_v, float_v) { return AVX::convert(AVX::lo128(x.data())); } #endif Vc_SIMD_CAST_AVX_2(double_v, float_v) { return AVX::concat(_mm256_cvtpd_ps(x0.data()), _mm256_cvtpd_ps(x1.data())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1(double_v, int_v) { return AVX::zeroExtend(_mm256_cvttpd_epi32(x.data())); } Vc_SIMD_CAST_AVX_1( float_v, int_v) { return _mm256_cvttps_epi32(x.data()); } Vc_SIMD_CAST_AVX_1( uint_v, int_v) { return x.data(); } Vc_SIMD_CAST_AVX_1( short_v, int_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1(ushort_v, int_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_v, int_v) { return AVX::concat(_mm256_cvttpd_epi32(x0.data()), _mm256_cvttpd_epi32(x1.data())); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1(double_v, uint_v) { return AVX::zeroExtend(AVX::convert(x.data())); } Vc_SIMD_CAST_AVX_1( float_v, uint_v) { return _mm256_blendv_epi8( _mm256_cvttps_epi32(x.data()), _mm256_add_epi32( _mm256_cvttps_epi32(_mm256_sub_ps(x.data(), AVX::set2power31_ps())), AVX::set2power31_epu32()), _mm256_castps_si256(AVX::cmpge_ps(x.data(), AVX::set2power31_ps()))); } Vc_SIMD_CAST_AVX_1( int_v, uint_v) { return x.data(); } Vc_SIMD_CAST_AVX_1( short_v, uint_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); } Vc_SIMD_CAST_AVX_1(ushort_v, uint_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_v, uint_v) { return AVX::concat(AVX::convert(x0.data()), AVX::convert(x1.data())); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1(double_v, short_v) { return AVX::zeroExtend(_mm_packs_epi32(_mm256_cvttpd_epi32(x.data()), _mm_setzero_si128())); } Vc_SIMD_CAST_AVX_1( float_v, short_v) { const auto tmp = _mm256_cvttps_epi32(x.data()); return AVX::zeroExtend(_mm_packs_epi32(AVX::lo128(tmp), AVX::hi128(tmp))); } Vc_SIMD_CAST_AVX_1( int_v, short_v) { return AVX::zeroExtend(AVX::convert< int, short>(x.data())); } Vc_SIMD_CAST_AVX_1( uint_v, short_v) { return AVX::zeroExtend(AVX::convert(x.data())); } Vc_SIMD_CAST_AVX_1(ushort_v, short_v) { return x.data(); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_v, short_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); return AVX::zeroExtend(_mm_packs_epi32(tmp0, tmp1)); } Vc_SIMD_CAST_AVX_2( float_v, short_v) { using AVX2::short_v; using AVX2::int_v; return simd_cast(simd_cast(x0), simd_cast(x1)); } Vc_SIMD_CAST_AVX_2( int_v, short_v) { const auto shuf = _mm256_setr_epi8( 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80); auto a = _mm256_shuffle_epi8(x0.data(), shuf); auto b = _mm256_shuffle_epi8(x1.data(), shuf); return Mem::permute4x64(_mm256_unpacklo_epi64(a, b)); } Vc_SIMD_CAST_AVX_2( uint_v, short_v) { const auto shuf = _mm256_setr_epi8( 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 
-0x80); auto a = _mm256_shuffle_epi8(x0.data(), shuf); auto b = _mm256_shuffle_epi8(x1.data(), shuf); return Mem::permute4x64(_mm256_unpacklo_epi64(a, b)); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_3(double_v, short_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, _mm_setzero_si128())); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_4(double_v, short_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); const auto tmp3 = _mm256_cvttpd_epi32(x3.data()); return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, tmp3)); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_1(double_v, ushort_v) { const auto tmp = _mm256_cvttpd_epi32(x.data()); return AVX::zeroExtend(_mm_packus_epi32(tmp, _mm_setzero_si128())); } Vc_SIMD_CAST_AVX_1( float_v, ushort_v) { const auto tmp = _mm256_cvttps_epi32(x.data()); return AVX::zeroExtend(_mm_packus_epi32(AVX::lo128(tmp), AVX::hi128(tmp))); } Vc_SIMD_CAST_AVX_1( int_v, ushort_v) { return AVX::zeroExtend(AVX::convert< int, ushort>(x.data())); } Vc_SIMD_CAST_AVX_1( uint_v, ushort_v) { return AVX::zeroExtend(AVX::convert(x.data())); } Vc_SIMD_CAST_AVX_1( short_v, ushort_v) { return x.data(); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_v, ushort_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); return AVX::zeroExtend(_mm_packus_epi32(tmp0, tmp1)); } Vc_SIMD_CAST_AVX_2( float_v, ushort_v) { using AVX2::ushort_v; using AVX2::int_v; return simd_cast(simd_cast(x0), simd_cast(x1)); } Vc_SIMD_CAST_AVX_2( int_v, ushort_v) { auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); } Vc_SIMD_CAST_AVX_2( uint_v, ushort_v) { auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data()); auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data()); auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1); auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1); return Mem::permute4x64(_mm256_unpacklo_epi16(tmp2, tmp3)); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_3(double_v, ushort_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); return AVX::concat(_mm_packus_epi32(tmp0, tmp1), _mm_packus_epi32(tmp2, _mm_setzero_si128())); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_4(double_v, ushort_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); const auto tmp2 = _mm256_cvttpd_epi32(x2.data()); const auto tmp3 = _mm256_cvttpd_epi32(x3.data()); return AVX::concat(_mm_packus_epi32(tmp0, tmp1), _mm_packus_epi32(tmp2, tmp3)); } #endif Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v) { return _mm256_cvtps_pd(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v) { return _mm256_cvtepi32_pd(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v) { using namespace AvxIntrinsics; return _mm256_add_pd(_mm256_cvtepi32_pd(_mm_sub_epi32(x.data(), _mm_setmin_epi32())), set1_pd(1u << 31)); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v) 
{ return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v) { return AVX::zeroExtend(_mm_cvtepi32_ps(x.data())); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v) { return AVX::convert< short, float>(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v) { return AVX::convert(x.data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v) { return AVX::convert< short, int>(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v) { return AVX::convert< short, uint>(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); } #endif Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v) { return AVX::convert< int, float>(AVX::concat(x0.data(), x1.data())); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v) { return AVX::convert(AVX::concat(x0.data(), x1.data())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } 
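// Note on the N-argument SSE -> AVX2 casts below (rough guide, not exhaustive): several
// 128-bit inputs are merged into one 256-bit result.  Where the element width does not
// change, the two halves are joined with AVX::concat; narrowing combinations first
// simd_cast to a single SSE vector and then AVX::zeroExtend it into the low half of the
// 256-bit register.  Usage sketch:
//   SSE::float_v lo = ..., hi = ...;
//   AVX2::float_v v = simd_cast<AVX2::float_v>(lo, hi);  // == AVX::concat(lo, hi)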
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v) { return simd_cast(simd_cast(x0, x1)); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v) { return simd_cast(simd_cast(x0, x1)); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1).data()); } Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); } #endif Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2).data()); } Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2).data()); } Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2)); } #endif Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2, x3).data()); } Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast(x0, x1, x2, x3).data()); } Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v) { return 
simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: int_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3)); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4)); } Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4)); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5)); } Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5)); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6)); } Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6)); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6, x7)); } Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v) { return simd_cast(simd_cast(x0, x1), simd_cast(x2, x3), simd_cast(x4, x5), simd_cast(x6, x7)); } #endif Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v) { return AVX::lo128(x.data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v) { return AVX::lo128(x.data()); } #endif Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v) { return AVX::convert(x.data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v) { return SSE::convert(AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v) { return SSE::convert(AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v) { return SSE::convert(AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v) { return 
SSE::convert(AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v) { return AVX::convert(x.data()); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v) { return simd_cast(simd_cast(x)); } #endif Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); return _mm_packs_epi32(tmp0, tmp1); } Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v) { const auto tmp0 = _mm256_cvttpd_epi32(x0.data()); const auto tmp1 = _mm256_cvttpd_epi32(x1.data()); return _mm_packus_epi32(tmp0, tmp1); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return AVX::zeroExtend(_mm_setr_pd(x.data(), 0.)); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return AVX::zeroExtend(_mm_setr_ps(x.data(), 0.f, 0.f, 0.f)); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return _mm256_setr_epi32(x.data(), 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return _mm256_setr_epi32(uint(x.data()), 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x, enable_if::value>) { return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return AVX::zeroExtend(_mm_setr_pd(x0.data(), x1.data())); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f)); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } 
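// The overloads that follow continue the Scalar -> AVX2 casts: each one takes N scalar
// values, writes them into the low lanes with the matching _mm256_setr_pd / _mm256_setr_ps /
// _mm256_setr_epi32 / _mm256_setr_epi16 intrinsic and zero-fills the remaining lanes.
// Sketch: for Scalar::float_v a, b, c,
//   simd_cast<AVX2::float_v>(a, b, c)   // yields (a, b, c, 0, 0, 0, 0, 0)
// Overloads are provided for every argument count up to the destination vector's width
// (16 for short_v/ushort_v).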
#endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), 0)); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), x3.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), x3.data())); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), 
uint(x4.data()), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), uint(x4.data()), uint(x5.data()), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), uint(x4.data()), uint(x5.data()), uint(x6.data()), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, 
Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0); } #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data()); } #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), uint(x3.data()), uint(x4.data()), uint(x5.data()), uint(x6.data()), uint(x7.data())); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0, 0); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, enable_if::value>) { return 
_mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0, 0, 0, 0, 0); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), 0, 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), 0, 0, 0, 0, 0); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), 0, 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), 0, 0, 0, 0); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), 0, 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), 0, 0, 0); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), 
x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), 0, 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), 0, 0); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), 0); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), 0); } #endif #ifdef Vc_IMPL_AVX2 template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, Scalar::Vector x15, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), x15.data()); } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Vector x0, Scalar::Vector x1, Scalar::Vector x2, Scalar::Vector x3, Scalar::Vector x4, Scalar::Vector x5, Scalar::Vector x6, Scalar::Vector x7, Scalar::Vector x8, Scalar::Vector x9, Scalar::Vector x10, Scalar::Vector x11, Scalar::Vector x12, Scalar::Vector x13, Scalar::Vector x14, Scalar::Vector x15, enable_if::value>) { return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), x10.data(), x11.data(), x12.data(), x13.data(), x14.data(), x15.data()); } #endif template Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Vector x, enable_if::value>) { return static_cast(x[0]); } template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if::value>) { return {Detail::mask_cast::Size, Return::Size, typename Return::VectorTypeF>(k.dataI())}; } Vc_SIMD_CAST_AVX_2(double_m, float_m) { return AVX::concat(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_2(double_m, int_m) { return 
Mem::permute4x64(_mm256_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2(double_m, uint_m) { return Mem::permute4x64(_mm256_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2(double_m, short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); } Vc_SIMD_CAST_AVX_2(double_m, ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); } Vc_SIMD_CAST_AVX_2( float_m, short_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( float_m, ushort_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( int_m, short_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( int_m, ushort_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( uint_m, short_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_AVX_2( uint_m, ushort_m) { return Mem::permute4x64(_mm256_packs_epi16(x0.dataI(), x1.dataI())); } #endif #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_AVX_4(double_m, short_m) { using namespace AVX; const auto tmp = _mm256_packs_epi32( _mm256_packs_epi32(x0.dataI(), x1.dataI()) , _mm256_packs_epi32(x2.dataI(), x3.dataI()) ); return concat(_mm_unpacklo_epi32(lo128(tmp), hi128(tmp)), _mm_unpackhi_epi32(lo128(tmp), hi128(tmp))); } Vc_SIMD_CAST_AVX_4(double_m, ushort_m) { return simd_cast(x0, x1, x2, x3).data(); } #endif Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(simd_cast(x).data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } #endif Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m) { return 
AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); } Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast(x).data()); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m) { const auto v = Mem::permute4x64(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); } #endif Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); } Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m) { return 
AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); } Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); } Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); } #endif Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); } Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); } Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); } #endif template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k, enable_if::value>) { Return r{false}; r[0] = k.data(); return r; } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, enable_if::value>) { Return r{false}; r[0] = k0.data(); r[1] = k1.data(); return r; } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, enable_if<(AVX2::is_mask::value && Return::Size >= 4)>) { Return r{false}; r[0] = k0.data(); r[1] = k1.data(); r[2] = k2.data(); r[3] = k3.data(); return r; } 
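// Scalar::Mask -> AVX2 mask casts: these overloads start from an all-false Return mask and
// copy each scalar mask into one lane (r[i] = ki.data()); variants taking 8 and 16 scalar
// masks follow.  After them come the AVX2 -> SSE mask casts, which shrink a 256-bit mask to
// 128 bit, mostly by packing the two 128-bit halves with _mm_packs_epi32 / _mm_packs_epi16.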
template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, enable_if<(AVX2::is_mask::value && Return::Size >= 8)>) { Return r{false}; r[0] = k0.data(); r[1] = k1.data(); r[2] = k2.data(); r[3] = k3.data(); r[4] = k4.data(); r[5] = k5.data(); r[6] = k6.data(); r[7] = k7.data(); return r; } template Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask k0, Scalar::Mask k1, Scalar::Mask k2, Scalar::Mask k3, Scalar::Mask k4, Scalar::Mask k5, Scalar::Mask k6, Scalar::Mask k7, Scalar::Mask k8, Scalar::Mask k9, Scalar::Mask k10, Scalar::Mask k11, Scalar::Mask k12, Scalar::Mask k13, Scalar::Mask k14, Scalar::Mask k15, enable_if<(AVX2::is_mask::value && Return::Size >= 16)>) { Return r{false}; r[0] = k0.data(); r[1] = k1.data(); r[2] = k2.data(); r[3] = k3.data(); r[4] = k4.data(); r[5] = k5.data(); r[6] = k6.data(); r[7] = k7.data(); r[8] = k8.data(); r[9] = k9.data(); r[10] = k10.data(); r[11] = k11.data(); r[12] = k12.data(); r[13] = k13.data(); r[14] = k14.data(); r[15] = k15.data(); return r; } Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); } Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m) { return _mm_unpacklo_ps(AVX::lo128(x.data()), AVX::lo128(x.data())); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m) { return AVX::lo128(x.data()); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } #ifdef Vc_IMPL_AVX2 Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); } Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); 
} Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m) { return simd_cast(SSE::short_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m) { return simd_cast(SSE::ushort_m(AVX::lo128(x.data()))); } #endif Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); } Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); } template Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Mask x, enable_if::value>) { return static_cast(x[0]); } template Vc_INTRINSIC Vc_CONST enable_if< (offset == 0 && ((AVX2::is_vector::value && !Scalar::is_vector::value && Traits::is_simd_vector::value && !Traits::isSimdArray::value) || (AVX2::is_mask::value && !Scalar::is_mask::value && Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value))), Return> simd_cast(const From &x) { return simd_cast(x); } template Vc_INTRINSIC Vc_CONST Return simd_cast(const From &x, enable_if::value && AVX2::is_vector::value) || (SSE::is_mask::value && AVX2::is_mask::value))>) { return simd_cast(x); } template Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector::value && offset != 0), Return> simd_cast(AVX2::Vector x) { using V = AVX2::Vector; constexpr int shift = sizeof(T) * offset * Return::Size; static_assert(shift > 0 && shift < sizeof(x), ""); if (shift < 16) { return simd_cast(V{AVX::avx_cast( _mm_srli_si128(AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))}); } else if (shift == 16) { return simd_cast(V{Mem::permute128(x.data())}); } else { #ifdef Vc_MSVC #pragma warning(push) #pragma warning(disable : 4556) #endif return simd_cast(V{AVX::avx_cast( _mm_srli_si128(AVX::avx_cast<__m128i>(AVX::hi128(x.data())), shift - 16))}); #ifdef Vc_MSVC #pragma warning(pop) #endif } } template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && sizeof(AVX2::Vector) == 32), Return> simd_cast(AVX2::Vector x) { using V = AVX2::Vector; constexpr int shift = sizeof(V) / V::Size * offset * Return::Size; static_assert(shift > 0, ""); static_assert(shift < sizeof(V), ""); using SseVector = SSE::Vector; if (shift == 16) { return simd_cast(SseVector{AVX::hi128(x.data())}); } using Intrin = 
typename SseVector::VectorType; return simd_cast(SseVector{AVX::avx_cast( _mm_alignr_epi8(AVX::avx_cast<__m128i>(AVX::hi128(x.data())), AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))}); } template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector::value && sizeof(AVX2::Vector) == 16), Return> simd_cast(AVX2::Vector x) { using V = AVX2::Vector; constexpr int shift = sizeof(V) / V::Size * offset * Return::Size; static_assert(shift > 0, ""); static_assert(shift < sizeof(V), ""); using SseVector = SSE::Vector; return simd_cast(SseVector{_mm_srli_si128(x.data(), shift)}); } Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1) { return simd_cast(simd_cast(x)); } Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1) { return simd_cast(simd_cast(x)); } template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if<(AVX2::is_mask::value && offset == 1 && AVX2::Mask::Size == Return::Size * 2)> = nullarg) { const auto tmp = AVX::hi128(k.dataI()); return AVX::concat(_mm_unpacklo_epi8(tmp, tmp), _mm_unpackhi_epi8(tmp, tmp)); } template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if<(AVX2::is_mask::value && offset == 1 && AVX2::Mask::Size == Return::Size * 4)> = nullarg) { auto tmp = AVX::lo128(k.dataI()); tmp = _mm_unpackhi_epi8(tmp, tmp); return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp)); } template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if<(AVX2::is_mask::value && offset == 2 && AVX2::Mask::Size == Return::Size * 4)> = nullarg) { auto tmp = AVX::hi128(k.dataI()); tmp = _mm_unpacklo_epi8(tmp, tmp); return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp)); } template Vc_INTRINSIC Vc_CONST Return simd_cast(const AVX2::Mask &k, enable_if<(AVX2::is_mask::value && offset == 3 && AVX2::Mask::Size == Return::Size * 4)> = nullarg) { auto tmp = AVX::hi128(k.dataI()); tmp = _mm_unpackhi_epi8(tmp, tmp); return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp)); } Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); } template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && sizeof(AVX2::Mask) == 32), Return> simd_cast(AVX2::Mask x) { using M = AVX2::Mask; constexpr int shift = sizeof(M) / M::Size * offset * Return::Size; static_assert(shift > 0, ""); static_assert(shift < sizeof(M), ""); using SseVector = SSE::Mask>; if (shift == 16) { return simd_cast(SseVector{AVX::hi128(x.data())}); } using Intrin = typename SseVector::VectorType; return simd_cast(SseVector{AVX::avx_cast( _mm_alignr_epi8(AVX::hi128(x.dataI()), AVX::lo128(x.dataI()), shift))}); } template Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask::value && sizeof(AVX2::Mask) == 16), Return> simd_cast(AVX2::Mask x) { return simd_cast(simd_cast>(x)); } #undef Vc_SIMD_CAST_AVX_1 #undef Vc_SIMD_CAST_AVX_2 #undef Vc_SIMD_CAST_AVX_3 #undef Vc_SIMD_CAST_AVX_4 #undef Vc_SIMD_CAST_1 #undef Vc_SIMD_CAST_2 #undef Vc_SIMD_CAST_3 #undef Vc_SIMD_CAST_4 #undef Vc_SIMD_CAST_5 #undef Vc_SIMD_CAST_6 #undef Vc_SIMD_CAST_7 #undef Vc_SIMD_CAST_8 #undef Vc_SIMD_CAST_OFFSET } #endif #endif #endif namespace Vc_VERSIONED_NAMESPACE { using double_v = Vector; using 
float_v = Vector<float>;
using int_v = Vector<int>;
using uint_v = Vector<uint>;
using short_v = Vector<short>;
using ushort_v = Vector<ushort>;
using llong_v = Vector<llong>;
using ullong_v = Vector<ullong>;
using long_v = Vector<long>;
using ulong_v = Vector<ulong>;
using schar_v = Vector<schar>;
using uchar_v = Vector<uchar>;
using double_m = Mask<double>;
using float_m = Mask<float>;
using llong_m = Mask<llong>;
using ullong_m = Mask<ullong>;
using long_m = Mask<long>;
using ulong_m = Mask<ulong>;
using int_m = Mask<int>;
using uint_m = Mask<uint>;
using short_m = Mask<short>;
using ushort_m = Mask<ushort>;
using schar_m = Mask<schar>;
using uchar_m = Mask<uchar>;
typedef Vector<int_least64_t> int_least64_v;
typedef Vector<uint_least64_t> uint_least64_v;
typedef Vector<int_least32_t> int_least32_v;
typedef Vector<uint_least32_t> uint_least32_v;
typedef Vector<int_least16_t> int_least16_v;
typedef Vector<uint_least16_t> uint_least16_v;
typedef Vector<int_least8_t> int_least8_v;
typedef Vector<uint_least8_t> uint_least8_v;
typedef Mask<int_least64_t> int_least64_m;
typedef Mask<uint_least64_t> uint_least64_m;
typedef Mask<int_least32_t> int_least32_m;
typedef Mask<uint_least32_t> uint_least32_m;
typedef Mask<int_least16_t> int_least16_m;
typedef Mask<uint_least16_t> uint_least16_m;
typedef Mask<int_least8_t> int_least8_m;
typedef Mask<uint_least8_t> uint_least8_m;
typedef Vector<int_fast64_t> int_fast64_v;
typedef Vector<uint_fast64_t> uint_fast64_v;
typedef Vector<int_fast32_t> int_fast32_v;
typedef Vector<uint_fast32_t> uint_fast32_v;
typedef Vector<int_fast16_t> int_fast16_v;
typedef Vector<uint_fast16_t> uint_fast16_v;
typedef Vector<int_fast8_t> int_fast8_v;
typedef Vector<uint_fast8_t> uint_fast8_v;
typedef Mask<int_fast64_t> int_fast64_m;
typedef Mask<uint_fast64_t> uint_fast64_m;
typedef Mask<int_fast32_t> int_fast32_m;
typedef Mask<uint_fast32_t> uint_fast32_m;
typedef Mask<int_fast16_t> int_fast16_m;
typedef Mask<uint_fast16_t> uint_fast16_m;
typedef Mask<int_fast8_t> int_fast8_m;
typedef Mask<uint_fast8_t> uint_fast8_m;
#if defined INT64_MAX && defined UINT64_MAX
typedef Vector<int64_t> int64_v;
typedef Vector<uint64_t> uint64_v;
typedef Mask<int64_t> int64_m;
typedef Mask<uint64_t> uint64_m;
#endif
#if defined INT32_MAX && defined UINT32_MAX
typedef Vector<int32_t> int32_v;
typedef Vector<uint32_t> uint32_v;
typedef Mask<int32_t> int32_m;
typedef Mask<uint32_t> uint32_m;
#endif
#if defined INT16_MAX && defined UINT16_MAX
typedef Vector<int16_t> int16_v;
typedef Vector<uint16_t> uint16_v;
typedef Mask<int16_t> int16_m;
typedef Mask<uint16_t> uint16_m;
#endif
#if defined INT8_MAX && defined UINT8_MAX
typedef Vector<int8_t> int8_v;
typedef Vector<uint8_t> uint8_v;
typedef Mask<int8_t> int8_m;
typedef Mask<uint8_t> uint8_m;
#endif
namespace {
static_assert(double_v::Size == Vc_DOUBLE_V_SIZE, "Vc_DOUBLE_V_SIZE macro defined to an incorrect value");
static_assert(float_v::Size == Vc_FLOAT_V_SIZE , "Vc_FLOAT_V_SIZE macro defined to an incorrect value ");
static_assert(int_v::Size == Vc_INT_V_SIZE , "Vc_INT_V_SIZE macro defined to an incorrect value ");
static_assert(uint_v::Size == Vc_UINT_V_SIZE , "Vc_UINT_V_SIZE macro defined to an incorrect value ");
static_assert(short_v::Size == Vc_SHORT_V_SIZE , "Vc_SHORT_V_SIZE macro defined to an incorrect value ");
static_assert(ushort_v::Size == Vc_USHORT_V_SIZE, "Vc_USHORT_V_SIZE macro defined to an incorrect value");
}
}
#ifndef COMMON_OPERATORS_H_
#define COMMON_OPERATORS_H_
#ifndef VC_COMMON_SIMDARRAY_H_
#define VC_COMMON_SIMDARRAY_H_
#include #include
#ifndef VC_COMMON_SIMDARRAYHELPER_H_
#define VC_COMMON_SIMDARRAYHELPER_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
static constexpr struct private_init_t {} private_init = {};
}
namespace Common
{
namespace Operations
{
struct tag {};
#define Vc_DEFINE_OPERATION(name_)                                                       \
    struct name_ : public tag {                                                          \
        template <typename V, typename... Args>                                          \
        Vc_INTRINSIC void operator()(V &v, Args &&...
args) \ { \ v.name_(std::forward(args)...); \ } \ } Vc_DEFINE_OPERATION(gather); Vc_DEFINE_OPERATION(scatter); Vc_DEFINE_OPERATION(load); Vc_DEFINE_OPERATION(store); Vc_DEFINE_OPERATION(setZero); Vc_DEFINE_OPERATION(setZeroInverted); Vc_DEFINE_OPERATION(assign); #undef Vc_DEFINE_OPERATION #define Vc_DEFINE_OPERATION(name_,code_) \ struct name_ : public tag { \ template Vc_INTRINSIC void operator()(V &v) { code_; } \ } Vc_DEFINE_OPERATION(increment, ++(v)); Vc_DEFINE_OPERATION(decrement, --(v)); Vc_DEFINE_OPERATION(random, v = V::Random()); #undef Vc_DEFINE_OPERATION #define Vc_DEFINE_OPERATION_FORWARD(name_) \ struct Forward_##name_ : public tag \ { \ template ()...))> \ Vc_INTRINSIC void operator()(decltype(name_(std::declval()...)) &v, \ Args &&... args) \ { \ v = name_(std::forward(args)...); \ } \ template ()...))> \ Vc_INTRINSIC void operator()(std::nullptr_t, Args && ... args) \ { \ name_(std::forward(args)...); \ } \ } Vc_DEFINE_OPERATION_FORWARD(abs); Vc_DEFINE_OPERATION_FORWARD(asin); Vc_DEFINE_OPERATION_FORWARD(atan); Vc_DEFINE_OPERATION_FORWARD(atan2); Vc_DEFINE_OPERATION_FORWARD(cos); Vc_DEFINE_OPERATION_FORWARD(ceil); Vc_DEFINE_OPERATION_FORWARD(copysign); Vc_DEFINE_OPERATION_FORWARD(exp); Vc_DEFINE_OPERATION_FORWARD(exponent); Vc_DEFINE_OPERATION_FORWARD(fma); Vc_DEFINE_OPERATION_FORWARD(floor); Vc_DEFINE_OPERATION_FORWARD(frexp); Vc_DEFINE_OPERATION_FORWARD(isfinite); Vc_DEFINE_OPERATION_FORWARD(isinf); Vc_DEFINE_OPERATION_FORWARD(isnan); Vc_DEFINE_OPERATION_FORWARD(isnegative); Vc_DEFINE_OPERATION_FORWARD(ldexp); Vc_DEFINE_OPERATION_FORWARD(log); Vc_DEFINE_OPERATION_FORWARD(log10); Vc_DEFINE_OPERATION_FORWARD(log2); Vc_DEFINE_OPERATION_FORWARD(reciprocal); Vc_DEFINE_OPERATION_FORWARD(round); Vc_DEFINE_OPERATION_FORWARD(rsqrt); Vc_DEFINE_OPERATION_FORWARD(sin); Vc_DEFINE_OPERATION_FORWARD(sincos); Vc_DEFINE_OPERATION_FORWARD(sqrt); Vc_DEFINE_OPERATION_FORWARD(trunc); Vc_DEFINE_OPERATION_FORWARD(min); Vc_DEFINE_OPERATION_FORWARD(max); #undef Vc_DEFINE_OPERATION_FORWARD template using is_operation = std::is_base_of; } template struct Segment { static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report."); using type = T_; using type_decayed = typename std::decay::type; static constexpr std::size_t Pieces = Pieces_; static constexpr std::size_t Index = Index_; using fixed_size_type = fixed_size_simd::value, typename type_decayed::EntryType, float>, type_decayed::Size / Pieces>; type data; static constexpr std::size_t EntryOffset = Index * type_decayed::Size / Pieces; decltype(std::declval()[0]) operator[](size_t i) const { return data[i + EntryOffset]; } fixed_size_type to_fixed_size() const { return simd_cast(data); } }; template struct Segment { static_assert(Index_ < Pieces_, "You found a bug in Vc. 
Please report."); using type = T_ *; using type_decayed = typename std::decay::type; static constexpr size_t Pieces = Pieces_; static constexpr size_t Index = Index_; using fixed_size_type = fixed_size_simd< typename std::conditional::value, typename type_decayed::VectorEntryType, float>::type, type_decayed::Size / Pieces> *; type data; static constexpr std::size_t EntryOffset = Index * type_decayed::size() / Pieces; fixed_size_type to_fixed_size() const { return reinterpret_cast< #ifdef Vc_GCC typename std::remove_pointer::type #else MayAlias::type> #endif *>(data) + Index; } }; template struct AddOffset { constexpr AddOffset() = default; }; template class Split { template > static Vc_INTRINSIC auto loImpl(const SimdArray &x) -> decltype(internal_data0(x)) { return internal_data0(x); } template > static Vc_INTRINSIC auto hiImpl(const SimdArray &x) -> decltype(internal_data1(x)) { return internal_data1(x); } template > static Vc_INTRINSIC auto loImpl(SimdArray *x) -> decltype(&internal_data0(*x)) { return &internal_data0(*x); } template > static Vc_INTRINSIC auto hiImpl(SimdArray *x) -> decltype(&internal_data1(*x)) { return &internal_data1(*x); } template static Vc_INTRINSIC Segment loImpl(const SimdArray &x) { return {internal_data(x)}; } template static Vc_INTRINSIC Segment hiImpl(const SimdArray &x) { return {internal_data(x)}; } template static Vc_INTRINSIC Segment loImpl(SimdArray *x) { return {&internal_data(*x)}; } template static Vc_INTRINSIC Segment hiImpl(SimdArray *x) { return {&internal_data(*x)}; } template static Vc_INTRINSIC auto loImpl(const SimdMaskArray &x) -> decltype(internal_data0(x)) { return internal_data0(x); } template static Vc_INTRINSIC auto hiImpl(const SimdMaskArray &x) -> decltype(internal_data1(x)) { return internal_data1(x); } template static Vc_INTRINSIC Segment::mask_type, 2, 0> loImpl( const SimdMaskArray &x) { return {internal_data(x)}; } template static Vc_INTRINSIC Segment::mask_type, 2, 1> hiImpl( const SimdMaskArray &x) { return {internal_data(x)}; } #ifdef Vc_IMPL_AVX template static Vc_INTRINSIC SSE::Vector loImpl(Vector &&x) { return simd_cast, 0>(x); } template static Vc_INTRINSIC SSE::Vector hiImpl(Vector &&x) { return simd_cast, 1>(x); } template static Vc_INTRINSIC SSE::Mask loImpl(Mask &&x) { return simd_cast, 0>(x); } template static Vc_INTRINSIC SSE::Mask hiImpl(Mask &&x) { return simd_cast, 1>(x); } #endif template static constexpr bool is_vector_or_mask(){ return (Traits::is_simd_vector::value && !Traits::isSimdArray::value) || (Traits::is_simd_mask::value && !Traits::isSimdMaskArray::value); } template static Vc_INTRINSIC Segment loImpl(V &&x, enable_if()> = nullarg) { return {std::forward(x)}; } template static Vc_INTRINSIC Segment hiImpl(V &&x, enable_if()> = nullarg) { return {std::forward(x)}; } template static Vc_INTRINSIC const T *loImpl(const std::vector &x) { return x.data(); } template static Vc_INTRINSIC const T *hiImpl(const std::vector &x) { return x.data() + secondOffset; } template static Vc_INTRINSIC Segment loImpl( const Segment &x) { return {x.data}; } template static Vc_INTRINSIC Segment hiImpl( const Segment &x) { return {x.data}; } template ()))> static std::true_type have_lo_impl(int); template static std::false_type have_lo_impl(float); template static constexpr bool have_lo_impl() { return decltype(have_lo_impl(1))::value; } template ()))> static std::true_type have_hi_impl(int); template static std::false_type have_hi_impl(float); template static constexpr bool have_hi_impl() { return 
decltype(have_hi_impl(1))::value; } public: template static Vc_INTRINSIC const U *lo(Operations::gather, const U *ptr) { return ptr; } template static Vc_INTRINSIC const U *hi(Operations::gather, const U *ptr) { return ptr + secondOffset; } template ::value>> static Vc_ALWAYS_INLINE decltype(loImpl(std::declval())) lo(Operations::gather, U &&x) { return loImpl(std::forward(x)); } template ::value>> static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval())) hi(Operations::gather, U &&x) { return hiImpl(std::forward(x)); } template static Vc_INTRINSIC const U *lo(Operations::scatter, const U *ptr) { return ptr; } template static Vc_INTRINSIC const U *hi(Operations::scatter, const U *ptr) { return ptr + secondOffset; } template static Vc_ALWAYS_INLINE decltype(loImpl(std::declval())) lo(U &&x) { return loImpl(std::forward(x)); } template static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval())) hi(U &&x) { return hiImpl(std::forward(x)); } template static Vc_ALWAYS_INLINE enable_if(), U> lo(U &&x) { return std::forward(x); } template static Vc_ALWAYS_INLINE enable_if(), U> hi(U &&x) { return std::forward(x); } }; template static Vc_INTRINSIC const V &actual_value(Op, const SimdArray &x) { return internal_data(x); } template static Vc_INTRINSIC V *actual_value(Op, SimdArray *x) { return &internal_data(*x); } template static Vc_INTRINSIC typename Segment::fixed_size_type actual_value( Op, Segment &&seg) { return seg.to_fixed_size(); } template static Vc_INTRINSIC const typename V::Mask &actual_value(Op, const SimdMaskArray &x) { return internal_data(x); } template static Vc_INTRINSIC typename V::Mask *actual_value(Op, SimdMaskArray *x) { return &internal_data(*x); } template Vc_INTRINSIC decltype(actual_value(std::declval(), std::declval())) conditionalUnpack(std::true_type, Op op, Arg &&arg) { return actual_value(op, std::forward(arg)); } template Vc_INTRINSIC Arg conditionalUnpack(std::false_type, Op, Arg &&arg) { return std::forward(arg); } template struct selectorType : public std::integral_constant { }; template Vc_INTRINSIC decltype(std::declval()(std::declval(), conditionalUnpack(selectorType(), std::declval(), std::declval())...)) unpackArgumentsAutoImpl(int, index_sequence, Op op, R &&r, Args &&... args) { op(std::forward(r), conditionalUnpack(selectorType(), op, std::forward(args))...); } template Vc_INTRINSIC enable_if<(I <= (size_t(1) << sizeof...(Args))), void> unpackArgumentsAutoImpl( float, index_sequence is, Op op, R &&r, Args &&... args) { static_assert( I < (1 << sizeof...(Args)) - (std::is_same::value ? 1 : 0), "Vc or compiler bug. Please report. Failed to find a combination of " "actual_value(arg) transformations that allows calling Op."); unpackArgumentsAutoImpl(int(), is, op, std::forward(r), std::forward(args)...); } #ifdef Vc_ICC template struct IccWorkaround { using type = void; }; template struct IccWorkaround<2, Ts...> { using type = typename std::remove_pointer>::type>::type>::type; }; #endif template Vc_INTRINSIC void unpackArgumentsAuto(Op op, R &&r, Args &&... args) { #ifdef Vc_ICC const int recursionStart = Traits::isSimdArray< typename IccWorkaround::type>::value && (std::is_same::value || std::is_same::value) ? 
2 : 0; #else const int recursionStart = 0; #endif unpackArgumentsAutoImpl( int(), make_index_sequence(), op, std::forward(r), std::forward(args)...); } } } #endif #ifndef VC_COMMON_SIMDMASKARRAY_H_ #define VC_COMMON_SIMDMASKARRAY_H_ #include #include namespace Vc_VERSIONED_NAMESPACE { template class SimdMaskArray { public: using VectorType = VectorType_; using vector_type = VectorType; using mask_type = typename vector_type::Mask; using storage_type = mask_type; friend storage_type &internal_data(SimdMaskArray &m) { return m.data; } friend const storage_type &internal_data(const SimdMaskArray &m) { return m.data; } static constexpr std::size_t size() { return N; } static constexpr std::size_t Size = size(); static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment; static_assert(Size == vector_type::Size, "size mismatch"); using vectorentry_type = typename mask_type::VectorEntryType; using value_type = typename mask_type::EntryType; using Mask = mask_type; using VectorEntryType = vectorentry_type; using EntryType = value_type; using EntryReference = Vc::Detail::ElementReference; using reference = EntryReference; using Vector = fixed_size_simd; Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type)); SimdMaskArray() = default; SimdMaskArray(const SimdMaskArray &) = default; SimdMaskArray(SimdMaskArray &&) = default; SimdMaskArray &operator=(const SimdMaskArray &) = default; SimdMaskArray &operator=(SimdMaskArray &&) = default; Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one) : data(one) {} Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero) : data(zero) {} Vc_INTRINSIC explicit SimdMaskArray(bool b) : data(b) {} Vc_INTRINSIC static SimdMaskArray Zero() { return {private_init, storage_type::Zero()}; } Vc_INTRINSIC static SimdMaskArray One() { return {private_init, storage_type::One()}; } template > Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray &x) Vc_INTRINSIC_R; template V::Size && N <= 2 * V::Size)>, class = U> Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray &x) Vc_INTRINSIC_R; template 2 * V::Size && N <= 4 * V::Size)>, class = U, class = U> Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray &x) Vc_INTRINSIC_R; template Vc_INTRINSIC_L SimdMaskArray( Common::Segment &&x, enable_if::value == Size * Pieces> = nullarg) Vc_INTRINSIC_R; template ::value && !Traits::isSimdMaskArray::value && Traits::simd_vector_size::value == Size)>> Vc_INTRINSIC_L SimdMaskArray(M k) Vc_INTRINSIC_R; template ::Size == N && !detail::is_fixed_size_abi::value>> operator Vc::Mask() const { return simd_cast>(data); } operator fixed_size_simd_mask &() { return static_cast &>(*this); } operator const fixed_size_simd_mask &() const { return static_cast &>(*this); } template Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags()) : data(mem, f) { } Vc_INTRINSIC void load(const bool *mem) { data.load(mem); } template Vc_INTRINSIC void load(const bool *mem, Flags f) { data.load(mem, f); } Vc_INTRINSIC void store(bool *mem) const { data.store(mem); } template Vc_INTRINSIC void store(bool *mem, Flags f) const { data.store(mem, f); } Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &rhs) const { return data == rhs.data; } Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &rhs) const { return data != rhs.data; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator!() const { return {private_init, !data}; } Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs) { data &= rhs.data; return *this; } Vc_INTRINSIC SimdMaskArray 
&operator|=(const SimdMaskArray &rhs) { data |= rhs.data; return *this; } Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs) { data ^= rhs.data; return *this; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator&( const SimdMaskArray &rhs) const { return {private_init, data & rhs.data}; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator|( const SimdMaskArray &rhs) const { return {private_init, data | rhs.data}; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator^( const SimdMaskArray &rhs) const { return {private_init, data ^ rhs.data}; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator&&( const SimdMaskArray &rhs) const { return {private_init, data && rhs.data}; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator||( const SimdMaskArray &rhs) const { return {private_init, data || rhs.data}; } Vc_INTRINSIC Vc_PURE bool isFull() const { return data.isFull(); } Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data.isNotEmpty(); } Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data.isEmpty(); } Vc_INTRINSIC Vc_PURE bool isMix() const { return data.isMix(); } Vc_INTRINSIC Vc_PURE int shiftMask() const { return data.shiftMask(); } Vc_INTRINSIC Vc_PURE int toInt() const { return data.toInt(); } private: friend reference; static Vc_INTRINSIC value_type get(const storage_type &k, int i) noexcept { return k[i]; } template static Vc_INTRINSIC void set(storage_type &k, int i, U &&v) noexcept( noexcept(std::declval()[0] = std::declval())) { k[i] = std::forward(v); } public: Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept { return {data, int(index)}; } Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept { return data[index]; } Vc_INTRINSIC Vc_PURE int count() const { return data.count(); } Vc_INTRINSIC Vc_PURE int firstOne() const { return data.firstOne(); } template static Vc_INTRINSIC fixed_size_simd_mask generate(const G &gen) { return {private_init, mask_type::generate(gen)}; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask shifted(int amount) const { return {private_init, data.shifted(amount)}; } template static Vc_INTRINSIC fixed_size_simd_mask fromOperation(Op op, Args &&... args) { fixed_size_simd_mask r; Common::unpackArgumentsAuto(op, r.data, std::forward(args)...); return r; } Vc_INTRINSIC SimdMaskArray(private_init_t, mask_type &&x) : data(std::move(x)) {} private: alignas(static_cast( Common::BoundedAlignment::value * sizeof(VectorType_) / VectorType_::size()>::value)) storage_type data; }; template constexpr std::size_t SimdMaskArray::Size; template constexpr std::size_t SimdMaskArray::MemoryAlignment; template class SimdMaskArray { static constexpr std::size_t N0 = Common::left_size(); using Split = Common::Split; public: using storage_type0 = fixed_size_simd_mask; using storage_type1 = fixed_size_simd_mask; static_assert(storage_type0::size() == N0, ""); using vector_type = fixed_size_simd; friend storage_type0 &internal_data0(SimdMaskArray &m) { return m.data0; } friend storage_type1 &internal_data1(SimdMaskArray &m) { return m.data1; } friend const storage_type0 &internal_data0(const SimdMaskArray &m) { return m.data0; } friend const storage_type1 &internal_data1(const SimdMaskArray &m) { return m.data1; } using mask_type = SimdMaskArray; static constexpr std::size_t size() { return N; } static constexpr std::size_t Size = size(); static constexpr std::size_t MemoryAlignment = storage_type0::MemoryAlignment > storage_type1::MemoryAlignment ? 
storage_type0::MemoryAlignment : storage_type1::MemoryAlignment; static_assert(Size == vector_type::Size, "size mismatch"); using vectorentry_type = typename storage_type0::VectorEntryType; using value_type = typename storage_type0::EntryType; using MaskType = mask_type; using VectorEntryType = vectorentry_type; using EntryType = value_type; using EntryReference = Vc::Detail::ElementReference; using reference = EntryReference; using Vector = fixed_size_simd; Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type)); SimdMaskArray() = default; SimdMaskArray(const SimdMaskArray &) = default; SimdMaskArray(SimdMaskArray &&) = default; SimdMaskArray &operator=(const SimdMaskArray &) = default; SimdMaskArray &operator=(SimdMaskArray &&) = default; template Vc_INTRINSIC SimdMaskArray(const SimdMaskArray &rhs) : data0(Split::lo(rhs)), data1(Split::hi(rhs)) { } template Vc_INTRINSIC SimdMaskArray( Common::Segment &&rhs, enable_if::value == Size * Pieces> = nullarg) : data0(Split::lo(rhs)), data1(Split::hi(rhs)) { } template ::value && !Traits::isSimdMaskArray::value && Traits::simd_vector_size::value == Size)>> Vc_INTRINSIC SimdMaskArray(M k) : data0(Split::lo(k)), data1(Split::hi(k)) { } template ::Size == N && !detail::is_fixed_size_abi::value>> operator Vc::Mask() const { return simd_cast>(data0, data1); } Vc_INTRINSIC operator fixed_size_simd_mask &() { return static_cast &>(*this); } Vc_INTRINSIC operator const fixed_size_simd_mask &() const { return static_cast &>(*this); } Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one) : data0(one), data1(one) { } Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero) : data0(zero), data1(zero) { } Vc_INTRINSIC explicit SimdMaskArray(bool b) : data0(b), data1(b) {} Vc_INTRINSIC static fixed_size_simd_mask Zero() { return {storage_type0::Zero(), storage_type1::Zero()}; } Vc_INTRINSIC static fixed_size_simd_mask One() { return {storage_type0::One(), storage_type1::One()}; } template Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags()) : data0(mem, f), data1(mem + storage_type0::size(), f) { } Vc_INTRINSIC void load(const bool *mem) { data0.load(mem); data1.load(mem + storage_type0::size()); } template Vc_INTRINSIC void load(const bool *mem, Flags f) { data0.load(mem, f); data1.load(mem + storage_type0::size(), f); } Vc_INTRINSIC void store(bool *mem) const { data0.store(mem); data1.store(mem + storage_type0::size()); } template Vc_INTRINSIC void store(bool *mem, Flags f) const { data0.store(mem, f); data1.store(mem + storage_type0::size(), f); } Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &mask) const { return data0 == mask.data0 && data1 == mask.data1; } Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &mask) const { return data0 != mask.data0 || data1 != mask.data1; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator!() const { return {!data0, !data1}; } Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs) { data0 &= rhs.data0; data1 &= rhs.data1; return *this; } Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs) { data0 |= rhs.data0; data1 |= rhs.data1; return *this; } Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs) { data0 ^= rhs.data0; data1 ^= rhs.data1; return *this; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator&( const SimdMaskArray &rhs) const { return {data0 & rhs.data0, data1 & rhs.data1}; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator|( const SimdMaskArray &rhs) const { return {data0 | rhs.data0, data1 | rhs.data1}; } 
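// Usage sketch (illustrative only; assumes the usual <Vc/Vc> entry point and a
// C++11 build): the mask operators in this specialization forward to the two
// pieces data0/data1 and recombine the results, so an N-wide SimdMaskArray is
// used like a single N-wide mask:
//
//   Vc::SimdMaskArray<float, 8> a(true), b(false);
//   auto c = a ^ b;           // element-wise XOR, still 8 entries wide
//   bool all = c.isFull();    // true only if every entry is set
//   int bits = c.toInt();     // packs the entries into the low bits of an int
//   c[3] = false;             // element access through the reference proxy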
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator^( const SimdMaskArray &rhs) const { return {data0 ^ rhs.data0, data1 ^ rhs.data1}; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator&&( const SimdMaskArray &rhs) const { return {data0 && rhs.data0, data1 && rhs.data1}; } Vc_INTRINSIC Vc_PURE fixed_size_simd_mask operator||( const SimdMaskArray &rhs) const { return {data0 || rhs.data0, data1 || rhs.data1}; } Vc_INTRINSIC Vc_PURE bool isFull() const { return data0.isFull() && data1.isFull(); } Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data0.isNotEmpty() || data1.isNotEmpty(); } Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data0.isEmpty() && data1.isEmpty(); } Vc_INTRINSIC Vc_PURE bool isMix() const { return !isFull() && !isEmpty(); } Vc_INTRINSIC Vc_PURE int toInt() const { return data0.toInt() | (data1.toInt() << data0.size()); } private: friend reference; static Vc_INTRINSIC value_type get(const SimdMaskArray &o, int i) noexcept { if (i < int(o.data0.size())) { return o.data0[i]; } else { return o.data1[i - o.data0.size()]; } } template static Vc_INTRINSIC void set(SimdMaskArray &o, int i, U &&v) noexcept( noexcept(std::declval()[0] = std::declval()) && noexcept(std::declval()[0] = std::declval())) { if (i < int(o.data0.size())) { o.data0[i] = std::forward(v); } else { o.data1[i - o.data0.size()] = std::forward(v); } } public: Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept { return {*this, int(index)}; } Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept { return get(*this, index); } Vc_INTRINSIC Vc_PURE int count() const { return data0.count() + data1.count(); } Vc_INTRINSIC Vc_PURE int firstOne() const { if (data0.isEmpty()) { return data1.firstOne() + storage_type0::size(); } return data0.firstOne(); } template static Vc_INTRINSIC fixed_size_simd_mask generate(const G &gen) { return {storage_type0::generate(gen), storage_type1::generate([&](std::size_t i) { return gen(i + N0); })}; } inline Vc_PURE fixed_size_simd_mask shifted(int amount) const { if (Vc_IS_UNLIKELY(amount == 0)) { return *this; } return generate([&](unsigned i) { const unsigned j = i + amount; return j < size() ? get(*this, j) : false; }); } template static Vc_INTRINSIC fixed_size_simd_mask fromOperation(Op op, Args &&... 
args) { fixed_size_simd_mask r = { storage_type0::fromOperation(op, Split::lo(args)...), storage_type1::fromOperation(op, Split::hi(std::forward(args))...)}; return r; } Vc_INTRINSIC SimdMaskArray(storage_type0 &&x, storage_type1 &&y) : data0(std::move(x)), data1(std::move(y)) { } private: alignas(static_cast( Common::BoundedAlignment::value * sizeof(V) / V::size()>::value)) storage_type0 data0; storage_type1 data1; }; template constexpr std::size_t SimdMaskArray::Size; template constexpr std::size_t SimdMaskArray::MemoryAlignment; } #ifndef VC_COMMON_SIMD_CAST_CALLER_TCC_ #define VC_COMMON_SIMD_CAST_CALLER_TCC_ namespace Vc_VERSIONED_NAMESPACE { template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray( const SimdMaskArray &x) : data(simd_cast(internal_data(x))) { } template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray( const SimdMaskArray &x) : data(simd_cast(internal_data(internal_data0(x)), internal_data(internal_data1(x)))) { } template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray( const SimdMaskArray &x) : data(simd_cast(internal_data(internal_data0(internal_data0(x))), internal_data(internal_data1(internal_data0(x))), internal_data(internal_data0(internal_data1(x))), internal_data(internal_data1(internal_data1(x))))) { } template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray( Common::Segment &&x, enable_if::value == Size * Pieces>) : data(simd_cast(x.data)) { } template template Vc_INTRINSIC SimdMaskArray::SimdMaskArray(M k) : data(simd_cast(k)) { } } #endif #endif #ifndef VC_COMMON_INTERLEAVE_H_ #define VC_COMMON_INTERLEAVE_H_ namespace Vc_VERSIONED_NAMESPACE { template ::value>> std::pair interleave(const V &a, const V &b) { return {a.interleaveLow(b), a.interleaveHigh(b)}; } } #endif namespace Vc_VERSIONED_NAMESPACE { namespace Common { template struct select_best_vector_type_impl; template struct select_best_vector_type_impl { using type = T; }; template struct select_best_vector_type_impl { using type = typename std::conditional< (N < T::Size), typename select_best_vector_type_impl::type, T>::type; }; template struct select_best_vector_type : select_best_vector_type_impl, #elif defined Vc_IMPL_AVX Vc::AVX::Vector, #endif #ifdef Vc_IMPL_SSE Vc::SSE::Vector, #endif Vc::Scalar::Vector> { }; } namespace internal { template T Vc_INTRINSIC Vc_PURE product_helper_(const T &l, const T &r) { return l * r; } template T Vc_INTRINSIC Vc_PURE sum_helper_(const T &l, const T &r) { return l + r; } } template inline fixed_size_simd min(const SimdArray &x, const SimdArray &y); template inline fixed_size_simd max(const SimdArray &x, const SimdArray &y); #define Vc_CURRENT_CLASS_NAME SimdArray template class SimdArray { static_assert(std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value, "SimdArray may only be used with T = { double, float, int32_t, uint32_t, " "int16_t, uint16_t }"); static_assert( std::is_same::type>::value && VectorType_::size() == N, "ERROR: leave the third and fourth template parameters with their defaults. 
They " "are implementation details."); public: static constexpr bool is_atomic = true; using VectorType = VectorType_; using vector_type = VectorType; using storage_type = vector_type; using vectorentry_type = typename vector_type::VectorEntryType; using value_type = T; using mask_type = fixed_size_simd_mask; using index_type = fixed_size_simd; static constexpr std::size_t size() { return N; } using Mask = mask_type; using MaskType = Mask; using MaskArgument = const MaskType &; using VectorEntryType = vectorentry_type; using EntryType = value_type; using IndexType = index_type; using AsArg = const SimdArray &; using reference = Detail::ElementReference; static constexpr std::size_t Size = size(); static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment; Vc_INTRINSIC SimdArray() = default; Vc_INTRINSIC SimdArray(const SimdArray &) = default; Vc_INTRINSIC SimdArray(SimdArray &&) = default; Vc_INTRINSIC SimdArray &operator=(const SimdArray &) = default; Vc_INTRINSIC SimdArray(const value_type &a) : data(a) {} Vc_INTRINSIC SimdArray(value_type &a) : data(a) {} Vc_INTRINSIC SimdArray(value_type &&a) : data(a) {} template < typename U, typename = enable_if::value && !std::is_same::value>> Vc_INTRINSIC SimdArray(U a) : SimdArray(static_cast(a)) { } template > Vc_INTRINSIC SimdArray(const SimdArray &x) : data(simd_cast(internal_data(x))) { } template V::Size && N <= 2 * V::Size)>, class = U> Vc_INTRINSIC SimdArray(const SimdArray &x) : data(simd_cast(internal_data(internal_data0(x)), internal_data(internal_data1(x)))) { } template 2 * V::Size && N <= 4 * V::Size)>, class = U, class = U> Vc_INTRINSIC SimdArray(const SimdArray &x) : data(simd_cast(internal_data(internal_data0(internal_data0(x))), internal_data(internal_data1(internal_data0(x))), internal_data(internal_data0(internal_data1(x))), internal_data(internal_data1(internal_data1(x))))) { } template Vc_INTRINSIC SimdArray(Common::Segment &&x) : data(simd_cast(x.data)) { } Vc_INTRINSIC SimdArray(const std::initializer_list &init) : data(init.begin(), Vc::Unaligned) { Vc_ASSERT(init.size() == size()); } template < typename V, typename = enable_if::value && !Traits::isSimdArray::value>> Vc_INTRINSIC SimdArray(const V &x) : data(simd_cast(x)) { } template ::value && Vector::Size == N && !std::is_same>::value>> Vc_INTRINSIC operator Vector() const { return simd_cast>(data); } operator fixed_size_simd &() { return static_cast &>(*this); } operator const fixed_size_simd &() const { return static_cast &>(*this); } #ifndef Vc_CURRENT_CLASS_NAME #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors." 
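// Usage sketch (illustrative only; assumes the usual <Vc/Vc> entry point): the
// gather interface declared below takes a pointer whose element type is convertible
// to EntryType plus any index argument that provides operator[] and holds at least
// Size entries, as the static_asserts in the macro spell out:
//
//   float data[16] = {};
//   int idx[4] = {0, 4, 8, 12};
//   Vc::SimdArray<float, 4> v(data, idx);        // gathering constructor
//   Vc::SimdArray<float, 4>::MaskType m(true);
//   v.gather(data, idx, m);                      // overwrite only where m is set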
#endif private: template inline void gatherImplementation(const Common::GatherArguments &); template inline void gatherImplementation(const Common::GatherArguments &, MaskArgument mask); public: #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that can be converted to the " \ "EntryType of this SIMD vector type."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_ private: template inline void scatterImplementation(MT *mem, IT &&indexes) const; template inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const; public: #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that the EntryType of this " \ "SIMD vector type can be converted to."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, 
the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes)); } template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes), mask); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args) const { scatter(args.address, args.indexes); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args, MaskArgument mask) const { scatter(args.address, args.indexes, mask); } #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_ explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerZero) : data() {} explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerOne o) : data(o) {} explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerIndexesFromZero i) : data(i) { } template explicit Vc_INTRINSIC SimdArray( Common::AddOffset) : data(Vc::IndexesFromZero) { data += value_type(Offset); } Vc_INTRINSIC void setZero() { data.setZero(); } Vc_INTRINSIC void setZero(mask_type k) { data.setZero(internal_data(k)); } Vc_INTRINSIC void setZeroInverted() { data.setZeroInverted(); } Vc_INTRINSIC void setZeroInverted(mask_type k) { data.setZeroInverted(internal_data(k)); } Vc_INTRINSIC void setQnan() { data.setQnan(); } Vc_INTRINSIC void setQnan(mask_type m) { data.setQnan(internal_data(m)); } template static Vc_INTRINSIC fixed_size_simd fromOperation(Op op, Args &&... args) { fixed_size_simd r; Common::unpackArgumentsAuto(op, r.data, std::forward(args)...); return r; } template static Vc_INTRINSIC void callOperation(Op op, Args &&... args) { Common::unpackArgumentsAuto(op, nullptr, std::forward(args)...); } static Vc_INTRINSIC fixed_size_simd Zero() { return SimdArray(Vc::Zero); } static Vc_INTRINSIC fixed_size_simd One() { return SimdArray(Vc::One); } static Vc_INTRINSIC fixed_size_simd IndexesFromZero() { return SimdArray(Vc::IndexesFromZero); } static Vc_INTRINSIC fixed_size_simd Random() { return fromOperation(Common::Operations::random()); } template ::value && Traits::is_load_store_flag::value>> explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = {}) : data(mem, f) { } template Vc_INTRINSIC void load(Args &&... args) { data.load(std::forward(args)...); } template Vc_INTRINSIC void store(Args &&... 
args) const { data.store(std::forward(args)...); } Vc_INTRINSIC mask_type operator!() const { return {private_init, !data}; } Vc_INTRINSIC fixed_size_simd operator-() const { return {private_init, -data}; } Vc_INTRINSIC fixed_size_simd operator+() const { return *this; } Vc_INTRINSIC fixed_size_simd operator~() const { return {private_init, ~data}; } template ::value && std::is_integral::value>> Vc_INTRINSIC Vc_CONST fixed_size_simd operator<<(U x) const { return {private_init, data << x}; } template ::value && std::is_integral::value>> Vc_INTRINSIC fixed_size_simd &operator<<=(U x) { data <<= x; return *this; } template ::value && std::is_integral::value>> Vc_INTRINSIC Vc_CONST fixed_size_simd operator>>(U x) const { return {private_init, data >> x}; } template ::value && std::is_integral::value>> Vc_INTRINSIC fixed_size_simd &operator>>=(U x) { data >>= x; return *this; } #define Vc_BINARY_OPERATOR_(op) \ Vc_INTRINSIC fixed_size_simd &operator op##=(const SimdArray &rhs) \ { \ data op## = rhs.data; \ return *this; \ } Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_); Vc_ALL_BINARY(Vc_BINARY_OPERATOR_); Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_); #undef Vc_BINARY_OPERATOR_ Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const { return {private_init, isnegative(data)}; } private: friend reference; Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept { return o.data[i]; } template Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept( noexcept(std::declval() = v)) { o.data[i] = v; } public: Vc_INTRINSIC reference operator[](size_t i) noexcept { static_assert(noexcept(reference{std::declval(), int()}), ""); return {*this, int(i)}; } Vc_INTRINSIC value_type operator[](size_t i) const noexcept { return get(*this, int(i)); } Vc_INTRINSIC Common::WriteMaskedVector operator()(const mask_type &k) { return {*this, k}; } Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k) { data.assign(v.data, internal_data(k)); } #define Vc_REDUCTION_FUNCTION_(name_) \ Vc_INTRINSIC Vc_PURE value_type name_() const { return data.name_(); } \ Vc_INTRINSIC Vc_PURE value_type name_(mask_type mask) const \ { \ return data.name_(internal_data(mask)); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_REDUCTION_FUNCTION_(min); Vc_REDUCTION_FUNCTION_(max); Vc_REDUCTION_FUNCTION_(product); Vc_REDUCTION_FUNCTION_(sum); #undef Vc_REDUCTION_FUNCTION_ Vc_INTRINSIC Vc_PURE fixed_size_simd partialSum() const { return {private_init, data.partialSum()}; } template Vc_INTRINSIC fixed_size_simd apply(F &&f) const { return {private_init, data.apply(std::forward(f))}; } template Vc_INTRINSIC fixed_size_simd apply(F &&f, const mask_type &k) const { return {private_init, data.apply(std::forward(f), k)}; } Vc_INTRINSIC fixed_size_simd shifted(int amount) const { return {private_init, data.shifted(amount)}; } template Vc_INTRINSIC fixed_size_simd shifted(int amount, const SimdArray &shiftIn) const { return {private_init, data.shifted(amount, simd_cast(shiftIn))}; } Vc_INTRINSIC fixed_size_simd rotated(int amount) const { return {private_init, data.rotated(amount)}; } Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC fixed_size_simd exponent() const { return {private_init, exponent(data)}; } Vc_INTRINSIC fixed_size_simd interleaveLow(SimdArray x) const { return {private_init, data.interleaveLow(x.data)}; } Vc_INTRINSIC fixed_size_simd interleaveHigh(SimdArray x) const { return {private_init, data.interleaveHigh(x.data)}; } Vc_INTRINSIC fixed_size_simd reversed() const { return {private_init, 
data.reversed()}; } Vc_INTRINSIC fixed_size_simd sorted() const { return {private_init, data.sorted()}; } template ()(std::size_t())), class = enable_if::value>> Vc_INTRINSIC SimdArray(const G &gen) : data(gen) { } template static Vc_INTRINSIC fixed_size_simd generate(const G &gen) { return {private_init, VectorType::generate(gen)}; } Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC fixed_size_simd copySign(const SimdArray &x) const { return {private_init, Vc::copysign(data, x.data)}; } friend VectorType &internal_data<>(SimdArray &x); friend const VectorType &internal_data<>(const SimdArray &x); Vc_INTRINSIC SimdArray(private_init_t, VectorType &&x) : data(std::move(x)) {} Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type)); private: alignas(static_cast( Common::BoundedAlignment::value * sizeof(VectorType_) / VectorType_::size()>::value)) storage_type data; }; template constexpr std::size_t SimdArray::Size; template constexpr std::size_t SimdArray::MemoryAlignment; template #ifndef Vc_MSVC Vc_INTRINSIC #endif VectorType &internal_data(SimdArray &x) { return x.data; } template #ifndef Vc_MSVC Vc_INTRINSIC #endif const VectorType &internal_data(const SimdArray &x) { return x.data; } template Vc_INTRINSIC T unwrap(const T &x) { return x; } template Vc_INTRINSIC V unwrap(const SimdArray &x) { return internal_data(x); } template Vc_INTRINSIC auto unwrap(const Common::Segment &x) -> decltype(x.to_fixed_size()) { return unwrap(x.to_fixed_size()); } template template Vc_INTRINSIC void SimdArray::gatherImplementation( const Common::GatherArguments &args) { data.gather(Common::make_gather(args.address, unwrap(args.indexes))); } template template Vc_INTRINSIC void SimdArray::gatherImplementation( const Common::GatherArguments &args, MaskArgument mask) { data.gather(Common::make_gather(args.address, unwrap(args.indexes)), mask); } template template inline void SimdArray::scatterImplementation(MT *mem, IT &&indexes) const { data.scatter(mem, unwrap(std::forward(indexes))); } template template inline void SimdArray::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const { data.scatter(mem, unwrap(std::forward(indexes)), mask); } template class SimdArray { static_assert(std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value, "SimdArray may only be used with T = { double, float, int32_t, uint32_t, int16_t, uint16_t }"); static_assert( std::is_same::type>::value && V::size() == Wt, "ERROR: leave the third and fourth template parameters with their defaults. They " "are implementation details."); static_assert( std::is_same::value || (N % V::size() == 0), "SimdArray<(un)signed short, N> on MIC only works correctly for N = k * " "MIC::(u)short_v::size(), i.e. 
k * 16."); using my_traits = SimdArrayTraits; static constexpr std::size_t N0 = my_traits::N0; static constexpr std::size_t N1 = my_traits::N1; using Split = Common::Split; template using CArray = U[K]; public: static constexpr bool is_atomic = false; using storage_type0 = typename my_traits::storage_type0; using storage_type1 = typename my_traits::storage_type1; static_assert(storage_type0::size() == N0, ""); using vector_type = V; using vectorentry_type = typename storage_type0::vectorentry_type; typedef vectorentry_type alias_type Vc_MAY_ALIAS; using value_type = T; using mask_type = fixed_size_simd_mask; using index_type = fixed_size_simd; static constexpr std::size_t size() { return N; } using Mask = mask_type; using MaskType = Mask; using MaskArgument = const MaskType &; using VectorEntryType = vectorentry_type; using EntryType = value_type; using IndexType = index_type; using AsArg = const SimdArray &; using reference = Detail::ElementReference; static constexpr std::size_t MemoryAlignment = storage_type0::MemoryAlignment > storage_type1::MemoryAlignment ? storage_type0::MemoryAlignment : storage_type1::MemoryAlignment; static Vc_INTRINSIC fixed_size_simd Zero() { return SimdArray(Vc::Zero); } static Vc_INTRINSIC fixed_size_simd One() { return SimdArray(Vc::One); } static Vc_INTRINSIC fixed_size_simd IndexesFromZero() { return SimdArray(Vc::IndexesFromZero); } static Vc_INTRINSIC fixed_size_simd Random() { return fromOperation(Common::Operations::random()); } template ()(std::size_t())), class = enable_if::value>> Vc_INTRINSIC SimdArray(const G &gen) : data0(gen), data1([&](std::size_t i) { return gen(i + storage_type0::size()); }) { } template static Vc_INTRINSIC fixed_size_simd generate(const G &gen) { auto tmp = storage_type0::generate(gen); return {std::move(tmp), storage_type1::generate([&](std::size_t i) { return gen(i + N0); })}; } SimdArray() = default; Vc_INTRINSIC SimdArray(value_type a) : data0(a), data1(a) {} template < typename U, typename = enable_if::value && !std::is_same::value>> SimdArray(U a) : SimdArray(static_cast(a)) { } SimdArray(const SimdArray &) = default; SimdArray(SimdArray &&) = default; SimdArray &operator=(const SimdArray &) = default; template ::value && Traits::is_load_store_flag::value>> explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = {}) : data0(mem, f), data1(mem + storage_type0::size(), f) { } #ifndef Vc_MSVC template ::value && Traits::is_load_store_flag::value>> explicit Vc_INTRINSIC SimdArray(CArray &mem, Flags f = {}) : data0(&mem[0], f), data1(&mem[storage_type0::size()], f) { } template ::value && Traits::is_load_store_flag::value>> explicit Vc_INTRINSIC SimdArray(const CArray &mem, Flags f = {}) : data0(&mem[0], f), data1(&mem[storage_type0::size()], f) { } #endif Vc_INTRINSIC SimdArray(const std::initializer_list &init) : data0(init.begin(), Vc::Unaligned) , data1(init.begin() + storage_type0::size(), Vc::Unaligned) { Vc_ASSERT(init.size() == size()); } #ifndef Vc_CURRENT_CLASS_NAME #error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors." 
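// Usage sketch (illustrative only; assumes the usual <Vc/Vc> entry point): this
// specialization splits an N-wide array into two pieces data0/data1 (sized N0 and
// N1) and forwards loads, stores and arithmetic to both, so it is used exactly
// like the single-vector case:
//
//   alignas(Vc::SimdArray<float, 16>::MemoryAlignment) float mem[16] = {};
//   Vc::SimdArray<float, 16> a(mem, Vc::Aligned);             // aligned load
//   auto b = Vc::SimdArray<float, 16>::generate(
//       [](std::size_t i) { return float(i); });              // 0, 1, ..., 15
//   a += b;                                                    // element-wise add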
#endif private: template inline void gatherImplementation(const Common::GatherArguments &); template inline void gatherImplementation(const Common::GatherArguments &, MaskArgument mask); public: #define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that can be converted to the " \ "EntryType of this SIMD vector type."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template ::value>> Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes))); } template ::value>> Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation( Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args); } template Vc_INTRINSIC void gather(const Common::GatherArguments &args, MaskArgument mask) { Vc_ASSERT_GATHER_PARAMETER_TYPES_; gatherImplementation(args, mask); } #undef Vc_ASSERT_GATHER_PARAMETER_TYPES_ private: template inline void scatterImplementation(MT *mem, IT &&indexes) const; template inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const; public: #define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \ static_assert( \ std::is_convertible::value, \ "The memory pointer needs to point to a type that the EntryType of this " \ "SIMD vector type can be converted to."); \ static_assert( \ Vc::Traits::has_subscript_operator::value, \ "The indexes argument must be a type that implements the subscript operator."); \ static_assert( \ !Traits::is_simd_vector::value || \ Traits::simd_vector_size::value >= Size, \ "If you use a SIMD vector for the indexes parameter, the index vector must " \ "have at least as many entries as this SIMD vector."); \ static_assert( \ !std::is_array::value || \ (std::rank::value == 1 && \ (std::extent::value == 0 || std::extent::value >= Size)), \ "If you use a simple array for the indexes parameter, 
the array must have " \ "at least as many entries as this SIMD vector.") template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes)); } template ::value>> Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const { Vc_ASSERT_SCATTER_PARAMETER_TYPES_; scatterImplementation(mem, std::forward(indexes), mask); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args) const { scatter(args.address, args.indexes); } template Vc_INTRINSIC void scatter(const Common::ScatterArguments &args, MaskArgument mask) const { scatter(args.address, args.indexes, mask); } #undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_ explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerZero) : data0(), data1() {} explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerOne o) : data0(o), data1(o) {} explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerIndexesFromZero i) : data0(i) , data1(Common::AddOffset()) { } template explicit Vc_INTRINSIC SimdArray( Common::AddOffset i) : data0(i) , data1(Common::AddOffset()) { } template ::value && Traits::simd_vector_size::value == N && !(std::is_convertible, T>::value && Traits::isSimdArray::value))>> Vc_INTRINSIC explicit SimdArray(W &&x) : data0(Split::lo(x)), data1(Split::hi(x)) { } template ::value && Traits::simd_vector_size::value == N && std::is_convertible, T>::value)>, class = W> Vc_INTRINSIC SimdArray(W &&x) : data0(Split::lo(x)), data1(Split::hi(x)) { } template Vc_INTRINSIC SimdArray(Common::Segment &&x) : data0(Common::Segment{x.data}) , data1(Common::Segment{x.data}) { } template ::value && Vector::Size == N && !std::is_same>::value>> operator Vector() const { auto r = simd_cast>(data0, data1); return r; } Vc_INTRINSIC operator fixed_size_simd &() { return static_cast &>(*this); } Vc_INTRINSIC operator const fixed_size_simd &() const { return static_cast &>(*this); } Vc_INTRINSIC void setZero() { data0.setZero(); data1.setZero(); } Vc_INTRINSIC void setZero(const mask_type &k) { data0.setZero(Split::lo(k)); data1.setZero(Split::hi(k)); } Vc_INTRINSIC void setZeroInverted() { data0.setZeroInverted(); data1.setZeroInverted(); } Vc_INTRINSIC void setZeroInverted(const mask_type &k) { data0.setZeroInverted(Split::lo(k)); data1.setZeroInverted(Split::hi(k)); } Vc_INTRINSIC void setQnan() { data0.setQnan(); data1.setQnan(); } Vc_INTRINSIC void setQnan(const mask_type &m) { data0.setQnan(Split::lo(m)); data1.setQnan(Split::hi(m)); } template static Vc_INTRINSIC fixed_size_simd fromOperation(Op op, Args &&... args) { fixed_size_simd r = { storage_type0::fromOperation(op, Split::lo(args)...), storage_type1::fromOperation(op, Split::hi(std::forward(args))...)}; return r; } template static Vc_INTRINSIC void callOperation(Op op, Args &&... args) { storage_type0::callOperation(op, Split::lo(args)...); storage_type1::callOperation(op, Split::hi(std::forward(args))...); } template Vc_INTRINSIC void load(const U *mem, Args &&... args) { data0.load(mem, Split::lo(args)...); data1.load(mem + storage_type0::size(), Split::hi(std::forward(args))...); } template Vc_INTRINSIC void store(U *mem, Args &&... 
args) const { data0.store(mem, Split::lo(args)...); data1.store(mem + storage_type0::size(), Split::hi(std::forward(args))...); } Vc_INTRINSIC mask_type operator!() const { return {!data0, !data1}; } Vc_INTRINSIC fixed_size_simd operator-() const { return {-data0, -data1}; } Vc_INTRINSIC fixed_size_simd operator+() const { return *this; } Vc_INTRINSIC fixed_size_simd operator~() const { return {~data0, ~data1}; } template ::value && std::is_integral::value>> Vc_INTRINSIC Vc_CONST fixed_size_simd operator<<(U x) const { return {data0 << x, data1 << x}; } template ::value && std::is_integral::value>> Vc_INTRINSIC fixed_size_simd &operator<<=(U x) { data0 <<= x; data1 <<= x; return *this; } template ::value && std::is_integral::value>> Vc_INTRINSIC Vc_CONST fixed_size_simd operator>>(U x) const { return {data0 >> x, data1 >> x}; } template ::value && std::is_integral::value>> Vc_INTRINSIC fixed_size_simd &operator>>=(U x) { data0 >>= x; data1 >>= x; return *this; } #define Vc_BINARY_OPERATOR_(op) \ Vc_INTRINSIC fixed_size_simd &operator op##=(const SimdArray &rhs) \ { \ data0 op## = rhs.data0; \ data1 op## = rhs.data1; \ return *this; \ } Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_); Vc_ALL_BINARY(Vc_BINARY_OPERATOR_); Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_); #undef Vc_BINARY_OPERATOR_ private: friend reference; Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept { return reinterpret_cast(&o)[i]; } template Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept( noexcept(std::declval() = v)) { reinterpret_cast(&o)[i] = v; } public: Vc_INTRINSIC reference operator[](size_t i) noexcept { static_assert(noexcept(reference{std::declval(), int()}), ""); return {*this, int(i)}; } Vc_INTRINSIC value_type operator[](size_t index) const noexcept { return get(*this, int(index)); } Vc_INTRINSIC Common::WriteMaskedVector operator()( const mask_type &mask) { return {*this, mask}; } Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k) { data0.assign(v.data0, internal_data0(k)); data1.assign(v.data1, internal_data1(k)); } #define Vc_REDUCTION_FUNCTION_(name_,binary_fun_,scalar_fun_) \ private: \ template \ Vc_INTRINSIC enable_if::value && \ storage_type0::Size == storage_type1::Size, \ value_type> name_##_impl() const \ { \ return binary_fun_(data0, data1).name_(); \ } \ \ template \ Vc_INTRINSIC enable_if::value && \ storage_type0::Size != storage_type1::Size, \ value_type> name_##_impl() const \ { \ return scalar_fun_(data0.name_(), data1.name_()); \ } \ \ public: \ \ Vc_INTRINSIC value_type name_() const { return name_##_impl(); } \ \ Vc_INTRINSIC value_type name_(const mask_type &mask) const \ { \ if (Vc_IS_UNLIKELY(Split::lo(mask).isEmpty())) { \ return data1.name_(Split::hi(mask)); \ } else if (Vc_IS_UNLIKELY(Split::hi(mask).isEmpty())) { \ return data0.name_(Split::lo(mask)); \ } else { \ return scalar_fun_(data0.name_(Split::lo(mask)), \ data1.name_(Split::hi(mask))); \ } \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_REDUCTION_FUNCTION_(min, Vc::min, std::min); Vc_REDUCTION_FUNCTION_(max, Vc::max, std::max); Vc_REDUCTION_FUNCTION_(product, internal::product_helper_, internal::product_helper_); Vc_REDUCTION_FUNCTION_(sum, internal::sum_helper_, internal::sum_helper_); #undef Vc_REDUCTION_FUNCTION_ Vc_INTRINSIC Vc_PURE fixed_size_simd partialSum() const { auto ps0 = data0.partialSum(); auto tmp = data1; tmp[0] += ps0[data0.size() - 1]; return {std::move(ps0), tmp.partialSum()}; } template inline fixed_size_simd apply(F &&f) const { return {data0.apply(f), data1.apply(f)}; } 
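// Usage sketch (illustrative only; assumes the usual <Vc/Vc> entry point): the
// reductions generated above (min/max/product/sum) reduce both pieces and combine
// the partial results, while apply() maps a callable over every element:
//
//   auto v = Vc::SimdArray<float, 16>::IndexesFromZero();     // 0, 1, ..., 15
//   float s = v.sum();                                        // 120
//   auto w = v.apply([](float x) { return 2.f * x; });        // 0, 2, ..., 30
//   float m = v.max(w > v);                                   // masked reduction: 15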
template inline fixed_size_simd apply(F &&f, const mask_type &k) const { return {data0.apply(f, Split::lo(k)), data1.apply(f, Split::hi(k))}; } inline fixed_size_simd shifted(int amount) const { constexpr int SSize = Size; constexpr int SSize0 = storage_type0::Size; constexpr int SSize1 = storage_type1::Size; if (amount == 0) { return *this; } if (amount < 0) { if (amount > -SSize0) { return {data0.shifted(amount), data1.shifted(amount, data0)}; } if (amount == -SSize0) { return {storage_type0(0), simd_cast(data0)}; } if (amount < -SSize0) { return {storage_type0(0), simd_cast(data0.shifted( amount + SSize0))}; } return Zero(); } else { if (amount >= SSize) { return Zero(); } else if (amount >= SSize0) { return { simd_cast(data1).shifted(amount - SSize0), storage_type1(0)}; } else if (amount >= SSize1) { return {data0.shifted(amount, data1), storage_type1(0)}; } else { return {data0.shifted(amount, data1), data1.shifted(amount)}; } } } template inline enable_if< !(std::is_same::value && N == NN), fixed_size_simd> shifted(int amount, const SimdArray &shiftIn) const { constexpr int SSize = Size; if (amount < 0) { return fixed_size_simd([&](int i) -> value_type { i += amount; if (i >= 0) { return operator[](i); } else if (i >= -SSize) { return shiftIn[i + SSize]; } return 0; }); } return fixed_size_simd([&](int i) -> value_type { i += amount; if (i < SSize) { return operator[](i); } else if (i < 2 * SSize) { return shiftIn[i - SSize]; } return 0; }); } private: template struct bisectable_shift : public std::integral_constant::value && N == NN> { }; public: template inline fixed_size_simd shifted( enable_if::value, int> amount, const SimdArray &shiftIn) const { constexpr int SSize = Size; if (amount < 0) { if (amount > -static_cast(storage_type0::Size)) { return {data0.shifted(amount, internal_data1(shiftIn)), data1.shifted(amount, data0)}; } if (amount == -static_cast(storage_type0::Size)) { return {storage_type0(internal_data1(shiftIn)), storage_type1(data0)}; } if (amount > -SSize) { return { internal_data1(shiftIn) .shifted(amount + static_cast(storage_type0::Size), internal_data0(shiftIn)), data0.shifted(amount + static_cast(storage_type0::Size), internal_data1(shiftIn))}; } if (amount == -SSize) { return shiftIn; } if (amount > -2 * SSize) { return shiftIn.shifted(amount + SSize); } } if (amount == 0) { return *this; } if (amount < static_cast(storage_type0::Size)) { return {data0.shifted(amount, data1), data1.shifted(amount, internal_data0(shiftIn))}; } if (amount == static_cast(storage_type0::Size)) { return {storage_type0(data1), storage_type1(internal_data0(shiftIn))}; } if (amount < SSize) { return {data1.shifted(amount - static_cast(storage_type0::Size), internal_data0(shiftIn)), internal_data0(shiftIn) .shifted(amount - static_cast(storage_type0::Size), internal_data1(shiftIn))}; } if (amount == SSize) { return shiftIn; } if (amount < 2 * SSize) { return shiftIn.shifted(amount - SSize); } return Zero(); } Vc_INTRINSIC fixed_size_simd rotated(int amount) const { amount %= int(size()); if (amount == 0) { return *this; } else if (amount < 0) { amount += size(); } #ifdef Vc_MSVC alignas(MemoryAlignment) T tmp[N + data0.size()]; data0.store(&tmp[0], Vc::Aligned); data1.store(&tmp[data0.size()], Vc::Aligned); data0.store(&tmp[N], Vc::Unaligned); fixed_size_simd r; r.data0.load(&tmp[amount], Vc::Unaligned); r.data1.load(&tmp[(amount + data0.size()) % size()], Vc::Unaligned); return r; #else auto &&d0cvtd = simd_cast(data0); auto &&d1cvtd = simd_cast(data1); constexpr int size0 = 
storage_type0::size(); constexpr int size1 = storage_type1::size(); if (amount == size0 && std::is_same::value) { return {std::move(d1cvtd), std::move(d0cvtd)}; } else if (amount < size1) { return {data0.shifted(amount, d1cvtd), data1.shifted(amount, d0cvtd)}; } else if (amount == size1) { return {data0.shifted(amount, d1cvtd), std::move(d0cvtd)}; } else if (int(size()) - amount < size1) { return {data0.shifted(amount - int(size()), d1cvtd.shifted(size1 - size0)), data1.shifted(amount - int(size()), data0.shifted(size0 - size1))}; } else if (int(size()) - amount == size1) { return {data0.shifted(-size1, d1cvtd.shifted(size1 - size0)), simd_cast(data0.shifted(size0 - size1))}; } else if (amount <= size0) { return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0), simd_cast(data0.shifted(amount - size1))}; } else { return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0), simd_cast(data0.shifted(amount - size1, d1cvtd))}; } return *this; #endif } Vc_INTRINSIC fixed_size_simd interleaveLow(const SimdArray &x) const { return {data0.interleaveLow(x.data0), simd_cast(data0.interleaveHigh(x.data0))}; } Vc_INTRINSIC fixed_size_simd interleaveHigh(const SimdArray &x) const { return interleaveHighImpl( x, std::integral_constant()); } private: Vc_INTRINSIC fixed_size_simd interleaveHighImpl(const SimdArray &x, std::true_type) const { return {data1.interleaveLow(x.data1), data1.interleaveHigh(x.data1)}; } inline fixed_size_simd interleaveHighImpl(const SimdArray &x, std::false_type) const { return {data0.interleaveHigh(x.data0) .shifted(storage_type1::Size, simd_cast(data1.interleaveLow(x.data1))), data1.interleaveHigh(x.data1)}; } public: inline fixed_size_simd reversed() const { if (std::is_same::value) { return {simd_cast(data1).reversed(), simd_cast(data0).reversed()}; } else { #ifdef Vc_MSVC alignas(MemoryAlignment) T tmp[N]; data1.reversed().store(&tmp[0], Vc::Aligned); data0.reversed().store(&tmp[data1.size()], Vc::Unaligned); return fixed_size_simd{&tmp[0], Vc::Aligned}; #else return {data0.shifted(storage_type1::Size, data1).reversed(), simd_cast(data0.reversed().shifted( storage_type0::Size - storage_type1::Size))}; #endif } } inline fixed_size_simd sorted() const { return sortedImpl( std::integral_constant()); } Vc_INTRINSIC fixed_size_simd sortedImpl(std::true_type) const { #ifdef Vc_DEBUG_SORTED std::cerr << "-- " << data0 << data1 << '\n'; #endif const auto a = data0.sorted(); const auto b = data1.sorted().reversed(); const auto lo = Vc::min(a, b); const auto hi = Vc::max(a, b); return {lo.sorted(), hi.sorted()}; } Vc_INTRINSIC fixed_size_simd sortedImpl(std::false_type) const { using SortableArray = fixed_size_simd::value>; auto sortable = simd_cast(*this); for (std::size_t i = Size; i < SortableArray::Size; ++i) { using limits = std::numeric_limits; if (limits::has_infinity) { sortable[i] = limits::infinity(); } else { sortable[i] = std::numeric_limits::max(); } } return simd_cast>(sortable.sorted()); } static constexpr std::size_t Size = size(); Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC fixed_size_simd exponent() const { return {exponent(data0), exponent(data1)}; } Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const { return {isnegative(data0), isnegative(data1)}; } Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC fixed_size_simd copySign(const SimdArray &x) const { return {Vc::copysign(data0, x.data0), Vc::copysign(data1, x.data1)}; } friend storage_type0 &internal_data0<>(SimdArray &x); friend storage_type1 
&internal_data1<>(SimdArray &x); friend const storage_type0 &internal_data0<>(const SimdArray &x); friend const storage_type1 &internal_data1<>(const SimdArray &x); Vc_INTRINSIC SimdArray(storage_type0 &&x, storage_type1 &&y) : data0(std::move(x)), data1(std::move(y)) { } Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type0)); private: alignas(static_cast( Common::BoundedAlignment::value * sizeof(V) / V::size()>::value)) storage_type0 data0; storage_type1 data1; }; #undef Vc_CURRENT_CLASS_NAME template constexpr std::size_t SimdArray::Size; template constexpr std::size_t SimdArray::MemoryAlignment; template template inline void SimdArray::gatherImplementation( const Common::GatherArguments &args) { data0.gather(Common::make_gather( args.address, Split::lo(Common::Operations::gather(), args.indexes))); data1.gather(Common::make_gather( args.address, Split::hi(Common::Operations::gather(), args.indexes))); } template template inline void SimdArray::gatherImplementation( const Common::GatherArguments &args, MaskArgument mask) { data0.gather(Common::make_gather( args.address, Split::lo(Common::Operations::gather(), args.indexes)), Split::lo(mask)); data1.gather(Common::make_gather( args.address, Split::hi(Common::Operations::gather(), args.indexes)), Split::hi(mask)); } template template inline void SimdArray::scatterImplementation(MT *mem, IT &&indexes) const { data0.scatter(mem, Split::lo(Common::Operations::gather(), indexes)); data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward(indexes))); } template template inline void SimdArray::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const { data0.scatter(mem, Split::lo(Common::Operations::gather(), indexes), Split::lo(mask)); data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward(indexes)), Split::hi(mask)); } template #ifndef Vc_MSVC Vc_INTRINSIC #endif typename SimdArrayTraits::storage_type0 &internal_data0( SimdArray &x) { return x.data0; } template #ifndef Vc_MSVC Vc_INTRINSIC #endif typename SimdArrayTraits::storage_type1 &internal_data1( SimdArray &x) { return x.data1; } template #ifndef Vc_MSVC Vc_INTRINSIC #endif const typename SimdArrayTraits::storage_type0 &internal_data0( const SimdArray &x) { return x.data0; } template #ifndef Vc_MSVC Vc_INTRINSIC #endif const typename SimdArrayTraits::storage_type1 &internal_data1( const SimdArray &x) { return x.data1; } #if defined Vc_MSVC && defined Vc_IMPL_SSE && !defined Vc_IMPL_AVX template <> Vc_INTRINSIC SimdArray::SimdArray(fixed_size_simd &&x, fixed_size_simd &&y) : data0(x), data1(0) { data1 = y; } #endif namespace Detail { #define Vc_FIXED_OP(op) \ template ::is_atomic>::type> \ fixed_size_simd operator op(const fixed_size_simd &a, \ const fixed_size_simd &b) \ { \ return {private_init, internal_data(a) op internal_data(b)}; \ } \ template ::is_atomic>::type, \ class = T> \ fixed_size_simd operator op(const fixed_size_simd &a, \ const fixed_size_simd &b) \ { \ return {internal_data0(a) op internal_data0(b), \ internal_data1(a) op internal_data1(b)}; \ } Vc_ALL_ARITHMETICS(Vc_FIXED_OP); Vc_ALL_BINARY(Vc_FIXED_OP); Vc_ALL_SHIFTS(Vc_FIXED_OP); #undef Vc_FIXED_OP #define Vc_FIXED_OP(op) \ template ::is_atomic>::type> \ fixed_size_simd_mask operator op(const fixed_size_simd &a, \ const fixed_size_simd &b) \ { \ return {private_init, internal_data(a) op internal_data(b)}; \ } \ template ::is_atomic>::type, \ class = T> \ fixed_size_simd_mask operator op(const fixed_size_simd &a, \ const fixed_size_simd &b) \ { \ return {internal_data0(a) 
op internal_data0(b), \ internal_data1(a) op internal_data1(b)}; \ } Vc_ALL_COMPARES(Vc_FIXED_OP); #undef Vc_FIXED_OP } namespace result_vector_type_internal { template using remove_cvref = typename std::remove_cv::type>::type; template using is_integer_larger_than_int = std::integral_constant< bool, std::is_integral::value &&(sizeof(T) > sizeof(int) || std::is_same::value || std::is_same::value)>; template < typename L, typename R, std::size_t N = Traits::isSimdArray::value ? Traits::simd_vector_size::value : Traits::simd_vector_size::value, bool = (Traits::isSimdArray::value || Traits::isSimdArray::value) && !(Traits::is_fixed_size_simd::value && Traits::is_fixed_size_simd::value) && ((std::is_arithmetic>::value && !is_integer_larger_than_int>::value) || (std::is_arithmetic>::value && !is_integer_larger_than_int>::value) || Traits::simd_vector_size::value == Traits::simd_vector_size::value)> struct evaluate; template struct evaluate { private: using LScalar = Traits::entry_type_of; using RScalar = Traits::entry_type_of; template using conditional = typename std::conditional::type; public: using type = fixed_size_simd< conditional<(std::is_integral::value &&std::is_integral::value && sizeof(LScalar) < sizeof(int) && sizeof(RScalar) < sizeof(int)), conditional<(sizeof(LScalar) == sizeof(RScalar)), conditional::value, LScalar, RScalar>, conditional<(sizeof(LScalar) > sizeof(RScalar)), LScalar, RScalar>>, decltype(std::declval() + std::declval())>, N>; }; } template using result_vector_type = typename result_vector_type_internal::evaluate::type; #define Vc_BINARY_OPERATORS_(op_) \ \ template \ Vc_INTRINSIC result_vector_type operator op_(L &&lhs, R &&rhs) \ { \ using Return = result_vector_type; \ return Vc::Detail::operator op_( \ static_cast(std::forward(lhs)), \ static_cast(std::forward(rhs))); \ } Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATORS_); Vc_ALL_BINARY(Vc_BINARY_OPERATORS_); #undef Vc_BINARY_OPERATORS_ #define Vc_BINARY_OPERATORS_(op_) \ \ template \ Vc_INTRINSIC typename result_vector_type::mask_type operator op_(L &&lhs, \ R &&rhs) \ { \ using Promote = result_vector_type; \ return Promote(std::forward(lhs)) op_ Promote(std::forward(rhs)); \ } Vc_ALL_COMPARES(Vc_BINARY_OPERATORS_); #undef Vc_BINARY_OPERATORS_ #define Vc_FORWARD_UNARY_OPERATOR(name_) \ \ template \ inline fixed_size_simd name_(const SimdArray &x) \ { \ return fixed_size_simd::fromOperation( \ Common::Operations::Forward_##name_(), x); \ } \ template \ fixed_size_simd name_(const fixed_size_simd &x) \ { \ return fixed_size_simd::fromOperation( \ Common::Operations::Forward_##name_(), x); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_FORWARD_UNARY_BOOL_OPERATOR(name_) \ \ template \ inline fixed_size_simd_mask name_(const SimdArray &x) \ { \ return fixed_size_simd_mask::fromOperation( \ Common::Operations::Forward_##name_(), x); \ } \ template \ fixed_size_simd_mask name_(const fixed_size_simd &x) \ { \ return fixed_size_simd_mask::fromOperation( \ Common::Operations::Forward_##name_(), x); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON #define Vc_FORWARD_BINARY_OPERATOR(name_) \ \ template \ inline fixed_size_simd name_(const SimdArray &x, \ const SimdArray &y) \ { \ return fixed_size_simd::fromOperation( \ Common::Operations::Forward_##name_(), x, y); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_FORWARD_UNARY_OPERATOR(abs); Vc_FORWARD_UNARY_OPERATOR(asin); Vc_FORWARD_UNARY_OPERATOR(atan); Vc_FORWARD_BINARY_OPERATOR(atan2); Vc_FORWARD_UNARY_OPERATOR(ceil); Vc_FORWARD_BINARY_OPERATOR(copysign); Vc_FORWARD_UNARY_OPERATOR(cos); 
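// Note (added comment, not part of the original header): the Vc_FORWARD_*_OPERATOR
// invocations in this block generate the free math functions for SimdArray (abs, sqrt,
// sin, cos, min, max, ...) by forwarding to the same-named operation on each stored
// native vector via fromOperation(). A minimal usage sketch with hypothetical values:
//
//   Vc::SimdArray<double, 7> x(4.0);
//   auto r = Vc::sqrt(x) + Vc::abs(x);   // each call expands to per-chunk Vc math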
Vc_FORWARD_UNARY_OPERATOR(exp); Vc_FORWARD_UNARY_OPERATOR(exponent); Vc_FORWARD_UNARY_OPERATOR(floor); template inline SimdArray fma(const SimdArray &a, const SimdArray &b, const SimdArray &c) { return SimdArray::fromOperation(Common::Operations::Forward_fma(), a, b, c); } Vc_FORWARD_UNARY_BOOL_OPERATOR(isfinite); Vc_FORWARD_UNARY_BOOL_OPERATOR(isinf); Vc_FORWARD_UNARY_BOOL_OPERATOR(isnan); Vc_FORWARD_UNARY_BOOL_OPERATOR(isnegative); template inline SimdArray frexp(const SimdArray &x, SimdArray *e) { return SimdArray::fromOperation(Common::Operations::Forward_frexp(), x, e); } template inline SimdArray ldexp(const SimdArray &x, const SimdArray &e) { return SimdArray::fromOperation(Common::Operations::Forward_ldexp(), x, e); } Vc_FORWARD_UNARY_OPERATOR(log); Vc_FORWARD_UNARY_OPERATOR(log10); Vc_FORWARD_UNARY_OPERATOR(log2); Vc_FORWARD_UNARY_OPERATOR(reciprocal); Vc_FORWARD_UNARY_OPERATOR(round); Vc_FORWARD_UNARY_OPERATOR(rsqrt); Vc_FORWARD_UNARY_OPERATOR(sin); template void sincos(const SimdArray &x, SimdArray *sin, SimdArray *cos) { SimdArray::callOperation(Common::Operations::Forward_sincos(), x, sin, cos); } Vc_FORWARD_UNARY_OPERATOR(sqrt); Vc_FORWARD_UNARY_OPERATOR(trunc); Vc_FORWARD_BINARY_OPERATOR(min); Vc_FORWARD_BINARY_OPERATOR(max); #undef Vc_FORWARD_UNARY_OPERATOR #undef Vc_FORWARD_UNARY_BOOL_OPERATOR #undef Vc_FORWARD_BINARY_OPERATOR #ifdef Vc_MSVC #define Vc_DUMMY_ARG0 , int = 0 #define Vc_DUMMY_ARG1 , long = 0 #define Vc_DUMMY_ARG2 , short = 0 #define Vc_DUMMY_ARG3 , char = '0' #define Vc_DUMMY_ARG4 , unsigned = 0u #define Vc_DUMMY_ARG5 , unsigned short = 0u #else #define Vc_DUMMY_ARG0 #define Vc_DUMMY_ARG1 #define Vc_DUMMY_ARG2 #define Vc_DUMMY_ARG3 #define Vc_DUMMY_ARG4 #define Vc_DUMMY_ARG5 #endif template Vc_INTRINSIC Vc_CONST enable_if simd_cast_impl_smaller_input(const From &... xs, const T &last) { Return r = simd_cast(xs...); for (size_t i = 0; i < N; ++i) { r[i + N * sizeof...(From)] = static_cast(last[i]); } return r; } template Vc_INTRINSIC Vc_CONST Return simd_cast_impl_smaller_input(const T &last) { Return r = Return(); for (size_t i = 0; i < N; ++i) { r[i] = static_cast(last[i]); } return r; } template Vc_INTRINSIC Vc_CONST enable_if simd_cast_impl_larger_input( const From &... xs, const T &last) { Return r = simd_cast(xs...); for (size_t i = N * sizeof...(From); i < Return::Size; ++i) { r[i] = static_cast(last[i - N * sizeof...(From)]); } return r; } template Vc_INTRINSIC Vc_CONST Return simd_cast_impl_larger_input(const T &last) { Return r = Return(); for (size_t i = 0; i < Return::size(); ++i) { r[i] = static_cast(last[i]); } return r; } template Vc_INTRINSIC_L Vc_CONST_L Return simd_cast_without_last(const From &... xs, const T &) Vc_INTRINSIC_R Vc_CONST_R; template struct are_all_types_equal; template struct are_all_types_equal : public std::integral_constant { }; template struct are_all_types_equal : public std::integral_constant< bool, std::is_same::value && are_all_types_equal::value> { }; template Vc_INTRINSIC Vc_CONST Return simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b); template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && offset == 0), Return> simd_cast_with_offset(const From &x, const Froms &... 
xs); template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0), Return> simd_cast_with_offset(const From &x); template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && ((Traits::isSimdArray::value && !Traits::isAtomicSimdArray::value) || (Traits::isSimdMaskArray::value && !Traits::isAtomicSimdMaskArray::value))), Return> simd_cast_with_offset(const From &x); template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && ((Traits::isSimdArray::value && Traits::isAtomicSimdArray::value) || (Traits::isSimdMaskArray::value && Traits::isAtomicSimdMaskArray::value))), Return> simd_cast_with_offset(const From &x); template Vc_INTRINSIC Vc_CONST enable_if< (are_all_types_equal::value && From::Size <= offset), Return> simd_cast_with_offset(const From &, const Froms &... xs) { return simd_cast_with_offset(xs...); } template Vc_INTRINSIC Vc_CONST enable_if<(From::Size <= offset), Return> simd_cast_with_offset( const From &) { return Return(0); } template struct first_type_of_impl { using type = T; }; template using first_type_of = typename first_type_of_impl::type; template Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x); template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && sizeof...(Froms) * first_type_of::Size < Return::Size), Return> simd_cast_drop_arguments(Froms... xs, first_type_of x); template Vc_INTRINSIC Vc_CONST enable_if< (are_all_types_equal::value && (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0), Return> simd_cast_drop_arguments(Froms... xs, From x, From); template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && From::Size >= Return::Size), Return> simd_cast_drop_arguments(From x, From); namespace { #ifdef Vc_DEBUG_SIMD_CAST void debugDoNothing(const std::initializer_list &) {} template inline void vc_debug_(const char *prefix, const char *suffix, const T0 &arg0, const Ts &... args) { std::cerr << prefix << arg0; debugDoNothing({&(std::cerr << ", " << args)...}); std::cerr << suffix; } #else template Vc_INTRINSIC void vc_debug_(const char *, const char *, const T0 &, const Ts &...) { } #endif } template struct is_less : public std::integral_constant { }; template struct is_power_of_2 : public std::integral_constant { }; #define Vc_SIMDARRAY_CASTS(SimdArrayType_,NativeType_) \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (Traits::isAtomic##SimdArrayType_::value && \ is_less::Size * sizeof...(Froms), Return::Size>::value && \ are_all_types_equal, Froms...>::value && \ !detail::is_fixed_size_abi::value), \ Return> \ simd_cast(NativeType_ x, Froms... xs) \ { \ vc_debug_("simd_cast{1}(", ")\n", x, xs...); \ return {private_init, simd_cast(x, xs...)}; \ } \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (Traits::isAtomic##SimdArrayType_::value && \ !is_less::Size * sizeof...(Froms), Return::Size>::value && \ are_all_types_equal, Froms...>::value && \ !detail::is_fixed_size_abi::value), \ Return> \ simd_cast(NativeType_ x, Froms... xs) \ { \ vc_debug_("simd_cast{2}(", ")\n", x, xs...); \ return {simd_cast_without_last, Froms...>(x, xs...)}; \ } \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(Traits::is##SimdArrayType_::value && \ !Traits::isAtomic##SimdArrayType_::value && \ is_less(), \ NativeType_::Size *(1 + sizeof...(Froms))>::value && \ are_all_types_equal, Froms...>::value && \ !detail::is_fixed_size_abi::value), \ Return> \ simd_cast(NativeType_ x, Froms... 
xs) \ { \ vc_debug_("simd_cast{3}(", ")\n", x, xs...); \ using R0 = typename Return::storage_type0; \ using R1 = typename Return::storage_type1; \ return {simd_cast_drop_arguments(x, xs...), \ simd_cast_with_offset(x, xs...)}; \ } \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(Traits::is##SimdArrayType_::value && \ !Traits::isAtomic##SimdArrayType_::value && \ !is_less(), \ NativeType_::Size *(1 + sizeof...(Froms))>::value && \ are_all_types_equal, Froms...>::value && \ !detail::is_fixed_size_abi::value), \ Return> \ simd_cast(NativeType_ x, Froms... xs) \ { \ vc_debug_("simd_cast{4}(", ")\n", x, xs...); \ using R0 = typename Return::storage_type0; \ using R1 = typename Return::storage_type1; \ return {simd_cast(x, xs...), R1(0)}; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector); Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask); #undef Vc_SIMDARRAY_CASTS #define Vc_SIMDARRAY_CASTS(SimdArrayType_,NativeType_) \ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if::value, Return> \ simd_cast(NativeType_ x Vc_DUMMY_ARG0) \ { \ vc_debug_("simd_cast{offset, atomic}(", ")\n", offset, x); \ return {private_init, simd_cast(x)}; \ } \ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(Traits::is##SimdArrayType_::value && \ !Traits::isAtomic##SimdArrayType_::value && \ Return::Size * offset + Common::left_size() < \ NativeType_::Size), \ Return> \ simd_cast(NativeType_ x Vc_DUMMY_ARG1) \ { \ vc_debug_("simd_cast{offset, split Return}(", ")\n", offset, x); \ using R0 = typename Return::storage_type0; \ constexpr int entries_offset = offset * Return::Size; \ constexpr int entries_offset_right = entries_offset + R0::Size; \ return { \ simd_cast_with_offset(x), \ simd_cast_with_offset( \ x)}; \ } \ \ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(Traits::is##SimdArrayType_::value && \ !Traits::isAtomic##SimdArrayType_::value && \ Return::Size * offset + Common::left_size() >= \ NativeType_::Size), \ Return> \ simd_cast(NativeType_ x Vc_DUMMY_ARG2) \ { \ vc_debug_("simd_cast{offset, R1::Zero}(", ")\n", offset, x); \ using R0 = typename Return::storage_type0; \ using R1 = typename Return::storage_type1; \ constexpr int entries_offset = offset * Return::Size; \ return {simd_cast_with_offset(x), R1(0)}; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector); Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask); #undef Vc_SIMDARRAY_CASTS #define Vc_SIMDARRAY_CASTS(SimdArrayType_) \ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(are_all_types_equal, From...>::value && \ (sizeof...(From) == 0 || N * sizeof...(From) < Return::Size) && \ !std::is_same>::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{indivisible}(", ")\n", x0, xs...); \ return simd_cast(internal_data(x0), internal_data(xs)...); \ } \ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(are_all_types_equal, From...>::value && \ (sizeof...(From) > 0 && (N * sizeof...(From) >= Return::Size)) && \ !std::is_same>::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{indivisible2}(", ")\n", x0, xs...); \ return simd_cast_without_last::storage_type, \ typename From::storage_type...>( \ internal_data(x0), internal_data(xs)...); \ } \ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && are_all_types_equal, From...>::value && \ !std::is_same>::value && \ is_less::value && is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... 
xs) \ { \ vc_debug_("simd_cast{bisectable}(", ")\n", x0, xs...); \ return simd_cast_interleaved_argument_order< \ Return, typename SimdArrayType_::storage_type0, \ typename From::storage_type0...>(internal_data0(x0), internal_data0(xs)..., \ internal_data1(x0), internal_data1(xs)...); \ } \ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && are_all_types_equal, From...>::value && \ !is_less::value && is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{bisectable2}(", ")\n", x0, xs...); \ return simd_cast_without_last, From...>( \ x0, xs...); \ } \ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && are_all_types_equal, From...>::value && \ N * (1 + sizeof...(From)) <= Return::Size && !is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{remaining}(", ")\n", x0, xs...); \ return simd_cast_impl_smaller_input, \ From...>(x0, xs...); \ } \ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && are_all_types_equal, From...>::value && \ N * (1 + sizeof...(From)) > Return::Size && !is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x0, const From &... xs) \ { \ vc_debug_("simd_cast{remaining2}(", ")\n", x0, xs...); \ return simd_cast_impl_larger_input, \ From...>(x0, xs...); \ } \ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(N != M && N >= 2 * Return::Size && is_power_of_2::value), Return> \ simd_cast(const SimdArrayType_ &x) \ { \ vc_debug_("simd_cast{single bisectable}(", ")\n", x); \ return simd_cast(internal_data0(x)); \ } \ template \ Vc_INTRINSIC Vc_CONST enable_if<(N != M && N > Return::Size && \ N < 2 * Return::Size && is_power_of_2::value), \ Return> \ simd_cast(const SimdArrayType_ &x) \ { \ vc_debug_("simd_cast{single bisectable2}(", ")\n", x); \ return simd_cast(internal_data0(x), internal_data1(x)); \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_SIMDARRAY_CASTS(SimdArray); Vc_SIMDARRAY_CASTS(SimdMaskArray); #undef Vc_SIMDARRAY_CASTS template >::value>> Vc_INTRINSIC Return simd_cast(const fixed_size_simd &x, const Ts &... xs) { return simd_cast(static_cast &>(x), static_cast &>(xs)...); } template >::value>> Vc_INTRINSIC Return simd_cast(const fixed_size_simd_mask &x, const Ts &... 
xs) { return simd_cast(static_cast &>(x), static_cast &>(xs)...); } #define Vc_SIMDARRAY_CASTS(SimdArrayType_) \ \ template \ Vc_INTRINSIC Vc_CONST enable_if<(offset == 0), Return> simd_cast( \ const SimdArrayType_ &x Vc_DUMMY_ARG0) \ { \ vc_debug_("simd_cast{offset == 0}(", ")\n", offset, x); \ return simd_cast(x); \ } \ \ template \ Vc_INTRINSIC Vc_CONST enable_if<(offset != 0), Return> simd_cast( \ const SimdArrayType_ &x Vc_DUMMY_ARG1) \ { \ vc_debug_("simd_cast{offset, forward}(", ")\n", offset, x); \ return simd_cast(internal_data(x)); \ } \ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(N != M && offset * Return::Size >= Common::left_size() && \ offset != 0 && Common::left_size() % Return::Size == 0), \ Return> \ simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG2) \ { \ vc_debug_("simd_cast{offset, right}(", ")\n", offset, x); \ return simd_cast() / Return::Size>( \ internal_data1(x)); \ } \ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(N != M && offset * Return::Size >= Common::left_size() && \ offset != 0 && Common::left_size() % Return::Size != 0), \ Return> \ simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG3) \ { \ vc_debug_("simd_cast{offset, right, nofit}(", ")\n", offset, x); \ return simd_cast_with_offset()>( \ internal_data1(x)); \ } \ \ template \ Vc_INTRINSIC Vc_CONST enable_if< \ (N != M && \ offset != 0 && (offset + 1) * Return::Size <= Common::left_size()), \ Return> \ simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG4) \ { \ vc_debug_("simd_cast{offset, left}(", ")\n", offset, x); \ return simd_cast(internal_data0(x)); \ } \ \ template \ Vc_INTRINSIC Vc_CONST \ enable_if<(N != M && (offset * Return::Size < Common::left_size()) && \ offset != 0 && (offset + 1) * Return::Size > Common::left_size()), \ Return> \ simd_cast(const SimdArrayType_ &x Vc_DUMMY_ARG5) \ { \ vc_debug_("simd_cast{offset, copy scalars}(", ")\n", offset, x); \ using R = typename Return::EntryType; \ Return r = Return(0); \ for (std::size_t i = offset * Return::Size; \ i < std::min(N, (offset + 1) * Return::Size); ++i) { \ r[i - offset * Return::Size] = static_cast(x[i]); \ } \ return r; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_SIMDARRAY_CASTS(SimdArray); Vc_SIMDARRAY_CASTS(SimdMaskArray); #undef Vc_SIMDARRAY_CASTS template Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x) { return simd_cast(x); } template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && sizeof...(Froms) * first_type_of::Size < Return::Size), Return> simd_cast_drop_arguments(Froms... xs, first_type_of x) { return simd_cast(xs..., x); } template Vc_INTRINSIC Vc_CONST enable_if< (are_all_types_equal::value && (1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0), Return> simd_cast_drop_arguments(Froms... 
xs, From x, From) { return simd_cast_drop_arguments(xs..., x); } template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && From::Size >= Return::Size), Return> simd_cast_drop_arguments(From x, From) { return simd_cast_drop_arguments(x); } template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0), Return> simd_cast_with_offset(const From &x) { return simd_cast(x); } template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && ((Traits::isSimdArray::value && !Traits::isAtomicSimdArray::value) || (Traits::isSimdMaskArray::value && !Traits::isAtomicSimdMaskArray::value))), Return> simd_cast_with_offset(const From &x) { using R0 = typename Return::storage_type0; using R1 = typename Return::storage_type1; return {simd_cast_with_offset(x), simd_cast_with_offset(x)}; } template Vc_INTRINSIC Vc_CONST enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 && ((Traits::isSimdArray::value && Traits::isAtomicSimdArray::value) || (Traits::isSimdMaskArray::value && Traits::isAtomicSimdMaskArray::value))), Return> simd_cast_with_offset(const From &x) { return simd_cast(x.shifted(offset % Return::Size)); } template Vc_INTRINSIC Vc_CONST enable_if<(are_all_types_equal::value && offset == 0), Return> simd_cast_with_offset(const From &x, const Froms &... xs) { return simd_cast(x, xs...); } template Vc_INTRINSIC Vc_CONST Return simd_cast_without_last(const From &... xs, const T &) { return simd_cast(xs...); } #ifdef Vc_MSVC template Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0, const T0 &) { return a0; } template Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &, const T0 &b0) { return b0; } #endif template Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0, const Ts &..., const T0 &, const Ts &...) { return a0; } template Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &, const Ts &..., const T0 &b0, const Ts &...) { return b0; } template Vc_INTRINSIC Vc_CONST enable_if<(I > 1), T0> extract_interleaved(const T0 &, const Ts &... a, const T0 &, const Ts &... b) { return extract_interleaved(a..., b...); } template Vc_INTRINSIC Vc_CONST Return simd_cast_interleaved_argument_order_1(index_sequence, const Ts &... a, const Ts &... b) { return simd_cast(extract_interleaved(a..., b...)...); } template Vc_INTRINSIC Vc_CONST Return simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... 
b) { using seq = make_index_sequence; return simd_cast_interleaved_argument_order_1(seq(), a..., b...); } #define Vc_CONDITIONAL_ASSIGN(name_,op_) \ template \ Vc_INTRINSIC enable_if conditional_assign( \ SimdArray &lhs, M &&mask, U &&rhs) \ { \ lhs(mask) op_ rhs; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN( Assign, =); Vc_CONDITIONAL_ASSIGN( PlusAssign, +=); Vc_CONDITIONAL_ASSIGN( MinusAssign, -=); Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=); Vc_CONDITIONAL_ASSIGN( DivideAssign, /=); Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=); Vc_CONDITIONAL_ASSIGN( XorAssign, ^=); Vc_CONDITIONAL_ASSIGN( AndAssign, &=); Vc_CONDITIONAL_ASSIGN( OrAssign, |=); Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=); Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=); #undef Vc_CONDITIONAL_ASSIGN #define Vc_CONDITIONAL_ASSIGN(name_,expr_) \ template \ Vc_INTRINSIC enable_if> \ conditional_assign(SimdArray &lhs, M &&mask) \ { \ return expr_; \ } \ Vc_NOTHING_EXPECTING_SEMICOLON Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++); Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask)); Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--); Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask)); #undef Vc_CONDITIONAL_ASSIGN namespace Common { template inline void transpose_impl( TransposeTag<4, 4>, SimdArray *Vc_RESTRICT r[], const TransposeProxy, SimdArray, SimdArray, SimdArray> &proxy) { V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]), &internal_data(*r[2]), &internal_data(*r[3])}; transpose_impl(TransposeTag<4, 4>(), &r2[0], TransposeProxy{internal_data(std::get<0>(proxy.in)), internal_data(std::get<1>(proxy.in)), internal_data(std::get<2>(proxy.in)), internal_data(std::get<3>(proxy.in))}); } template inline void transpose_impl( TransposeTag<2, 4>, SimdArray *Vc_RESTRICT r[], const TransposeProxy, SimdArray, SimdArray, SimdArray> &proxy) { auto &lo = *r[0]; auto &hi = *r[1]; internal_data0(internal_data0(lo)) = internal_data0(std::get<0>(proxy.in)); internal_data1(internal_data0(lo)) = internal_data0(std::get<1>(proxy.in)); internal_data0(internal_data1(lo)) = internal_data0(std::get<2>(proxy.in)); internal_data1(internal_data1(lo)) = internal_data0(std::get<3>(proxy.in)); internal_data0(internal_data0(hi)) = internal_data1(std::get<0>(proxy.in)); internal_data1(internal_data0(hi)) = internal_data1(std::get<1>(proxy.in)); internal_data0(internal_data1(hi)) = internal_data1(std::get<2>(proxy.in)); internal_data1(internal_data1(hi)) = internal_data1(std::get<3>(proxy.in)); } template inline void transpose_impl( TransposeTag<4, 4>, SimdArray *Vc_RESTRICT r[], const TransposeProxy, SimdArray, SimdArray, SimdArray> &proxy) { V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]), &internal_data(*r[2]), &internal_data(*r[3])}; transpose_impl(TransposeTag<4, 4>(), &r2[0], TransposeProxy{internal_data(std::get<0>(proxy.in)), internal_data(std::get<1>(proxy.in)), internal_data(std::get<2>(proxy.in)), internal_data(std::get<3>(proxy.in))}); } template inline void transpose_impl( TransposeTag<4, 4>, SimdArray *Vc_RESTRICT r[], const TransposeProxy, SimdArray, SimdArray, SimdArray> &proxy) { SimdArray *Vc_RESTRICT r0[4 / 2] = {r[0], r[1]}; SimdArray *Vc_RESTRICT r1[4 / 2] = {r[2], r[3]}; using H = SimdArray; transpose_impl(TransposeTag<2, 4>(), &r0[0], TransposeProxy{internal_data0(std::get<0>(proxy.in)), internal_data0(std::get<1>(proxy.in)), internal_data0(std::get<2>(proxy.in)), internal_data0(std::get<3>(proxy.in))}); transpose_impl(TransposeTag<2, 4>(), &r1[0], 
TransposeProxy{internal_data1(std::get<0>(proxy.in)), internal_data1(std::get<1>(proxy.in)), internal_data1(std::get<2>(proxy.in)), internal_data1(std::get<3>(proxy.in))}); } } namespace Detail { template struct InterleaveImpl, N, VSizeof> { template static Vc_INTRINSIC void interleave(T *const data, const I &i, const VV &... vv) { InterleaveImpl::interleave(data, i, internal_data(vv)...); } template static Vc_INTRINSIC void deinterleave(T const *const data, const I &i, VV &... vv) { InterleaveImpl::deinterleave(data, i, internal_data(vv)...); } }; } } namespace std { template struct numeric_limits> : public numeric_limits { private: using R = Vc::SimdArray; public: static Vc_ALWAYS_INLINE Vc_CONST R max() noexcept { return numeric_limits::max(); } static Vc_ALWAYS_INLINE Vc_CONST R min() noexcept { return numeric_limits::min(); } static Vc_ALWAYS_INLINE Vc_CONST R lowest() noexcept { return numeric_limits::lowest(); } static Vc_ALWAYS_INLINE Vc_CONST R epsilon() noexcept { return numeric_limits::epsilon(); } static Vc_ALWAYS_INLINE Vc_CONST R round_error() noexcept { return numeric_limits::round_error(); } static Vc_ALWAYS_INLINE Vc_CONST R infinity() noexcept { return numeric_limits::infinity(); } static Vc_ALWAYS_INLINE Vc_CONST R quiet_NaN() noexcept { return numeric_limits::quiet_NaN(); } static Vc_ALWAYS_INLINE Vc_CONST R signaling_NaN() noexcept { return numeric_limits::signaling_NaN(); } static Vc_ALWAYS_INLINE Vc_CONST R denorm_min() noexcept { return numeric_limits::denorm_min(); } }; } #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template enable_if::value, U> is_convertible_to_any_vector(Vector); template T is_convertible_to_any_vector(Vector); template ::value, bool = std::is_integral::value> struct FundamentalReturnType; template using fundamental_return_t = typename FundamentalReturnType::type; template struct FundamentalReturnType { using type = typename std::conditional< std::is_arithmetic::value, typename std::conditional<(sizeof(T) < sizeof(U)), U, T>::type, T>::type; }; template struct FundamentalReturnType { using type = typename std::conditional< std::is_arithmetic::value, U, T>::type; }; template struct FundamentalReturnType { using type = T; }; template struct my_make_signed : public std::make_signed { }; template <> struct my_make_signed { using type = bool; }; template struct higher_conversion_rank { template using fix_sign = typename std::conditional<(std::is_unsigned::value || std::is_unsigned::value), typename std::make_unsigned::type, A>::type; using T = typename my_make_signed::type; using U = typename my_make_signed::type; template using c = typename std::conditional::value || std::is_same::value, Test, Otherwise>::type; using type = fix_sign>>>>>; }; template struct FundamentalReturnType { template using c = typename std::conditional::type; using type = c<(sizeof(T) > sizeof(U)), T, c<(sizeof(T) < sizeof(U)), U, typename higher_conversion_rank::type>>; }; template struct ReturnTypeImpl { }; template struct ReturnTypeImpl, Vector, Uq, void> { using type = Vc::Vector, Abi>; }; template struct ReturnTypeImpl, int, Uq, void> { using type = Vc::Vector; }; template struct ReturnTypeImpl, uint, Uq, void> { using type = Vc::Vector< typename std::conditional::value, std::make_unsigned, std::enable_if>::type::type, Abi>; }; template struct ReturnTypeImpl< Vector, U, Uq, enable_if::value && !std::is_same::value && !std::is_same::value && Traits::is_valid_vector_argument>::value, void>> { using type = Vc::Vector, Abi>; }; template struct ReturnTypeImpl< 
Vector, U, Uq, enable_if::value && !Traits::is_simd_vector::value && Traits::is_valid_vector_argument(std::declval()))>::value, void>> { using type = Vc::Vector( std::declval()))>, Abi>; }; template > using ReturnType = typename ReturnTypeImpl::type; template struct is_a_type : public std::true_type { }; #ifdef Vc_ENABLE_FLOAT_BIT_OPERATORS #define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) true #else #define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) \ Detail::is_a_type() \ op_ std::declval())>::value #endif } #define Vc_GENERIC_OPERATOR(op_) \ template , U>> \ Vc_ALWAYS_INLINE enable_if, R>::value && \ std::is_convertible::value, \ R> \ operator op_(Vector x, U &&y) \ { \ return Detail::operator op_(R(x), R(std::forward(y))); \ } \ template , U>> \ Vc_ALWAYS_INLINE enable_if::value && \ std::is_convertible, R>::value && \ std::is_convertible::value, \ R> \ operator op_(U &&x, Vector y) \ { \ return Detail::operator op_(R(std::forward(x)), R(y)); \ } \ template , U>> \ Vc_ALWAYS_INLINE enable_if, R>::value && \ std::is_convertible::value, \ Vector &> \ operator op_##=(Vector &x, U &&y) \ { \ x = Detail::operator op_(R(x), R(std::forward(y))); \ return x; \ } #define Vc_LOGICAL_OPERATOR(op_) \ template \ Vc_ALWAYS_INLINE typename Vector::Mask operator op_(Vector x, \ Vector y) \ { \ return !!x op_ !!y; \ } \ template \ Vc_ALWAYS_INLINE \ enable_if, Vector>::value && \ std::is_convertible, Vector>::value, \ typename Detail::ReturnType, Vector>::Mask> \ operator op_(Vector x, Vector y) \ { \ return !!x op_ !!y; \ } \ template \ Vc_ALWAYS_INLINE enable_if())>::value, \ typename Vector::Mask> \ operator op_(Vector x, U &&y) \ { \ using M = typename Vector::Mask; \ return !!x op_ M(!!std::forward(y)); \ } \ template \ Vc_ALWAYS_INLINE enable_if())>::value, \ typename Vector::Mask> \ operator op_(U &&x, Vector y) \ { \ using M = typename Vector::Mask; \ return M(!!std::forward(x)) op_ !!y; \ } #define Vc_COMPARE_OPERATOR(op_) \ template , U>> \ Vc_ALWAYS_INLINE enable_if, R>::value && \ std::is_convertible::value, \ typename R::Mask> \ operator op_(Vector x, U &&y) \ { \ return Detail::operator op_(R(x), R(std::forward(y))); \ } \ template , U>> \ Vc_ALWAYS_INLINE \ enable_if>::value && \ std::is_convertible, R>::value && \ std::is_convertible::value, \ typename R::Mask> \ operator op_(U &&x, Vector y) \ { \ return Detail::operator op_(R(std::forward(x)), R(y)); \ } Vc_ALL_LOGICAL (Vc_LOGICAL_OPERATOR); Vc_ALL_BINARY (Vc_GENERIC_OPERATOR); Vc_ALL_ARITHMETICS(Vc_GENERIC_OPERATOR); Vc_ALL_COMPARES (Vc_COMPARE_OPERATOR); #undef Vc_LOGICAL_OPERATOR #undef Vc_GENERIC_OPERATOR #undef Vc_COMPARE_OPERATOR #undef Vc_INVALID_OPERATOR } #endif #ifndef VC_COMMON_ALIGNEDBASE_H_ #define VC_COMMON_ALIGNEDBASE_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template constexpr T max(T a) { return a; } template constexpr T max(T a, T b, Ts... rest) { return a > b ? max(a, rest...) 
: max(b, rest...); } } namespace Common { template Vc_INTRINSIC void *aligned_malloc(std::size_t); Vc_ALWAYS_INLINE void free(void *); } template struct alignas(Alignment) AlignedBase { Vc_FREE_STORE_OPERATORS_ALIGNED(Alignment); }; using VectorAlignedBase = AlignedBase< Detail::max(alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector), alignof(Vector))>; template using VectorAlignedBaseT = AlignedBase; using MemoryAlignedBase = AlignedBase< Detail::max(Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment, Vector::MemoryAlignment)>; template using MemoryAlignedBaseT = AlignedBase; } #endif namespace Vc_VERSIONED_NAMESPACE { constexpr std::size_t VectorAlignment = alignof(VectorAlignedBase); constexpr std::size_t MemoryAlignment = alignof(MemoryAlignedBase); } #define Vc_VECTOR_DECLARED_ 1 #ifndef VC_SCALAR_DEINTERLEAVE_H_ #define VC_SCALAR_DEINTERLEAVE_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template Vc_ALWAYS_INLINE void deinterleave(Scalar::Vector &a, Scalar::Vector &b, const M *mem, A) { a = mem[0]; b = mem[1]; } Vc_ALWAYS_INLINE void prefetchForOneRead(const void *, VectorAbi::Scalar) {} Vc_ALWAYS_INLINE void prefetchForModify(const void *, VectorAbi::Scalar) {} Vc_ALWAYS_INLINE void prefetchClose(const void *, VectorAbi::Scalar) {} Vc_ALWAYS_INLINE void prefetchMid(const void *, VectorAbi::Scalar) {} Vc_ALWAYS_INLINE void prefetchFar(const void *, VectorAbi::Scalar) {} } } #endif #ifndef VC_SCALAR_MATH_H_ #define VC_SCALAR_MATH_H_ #include namespace Vc_VERSIONED_NAMESPACE { Vc_INTRINSIC Scalar::float_v copysign(Scalar::float_v mag, Scalar::float_v sign) { union { float f; unsigned int i; } value, s; value.f = mag.data(); s.f = sign.data(); value.i = (s.i & 0x80000000u) | (value.i & 0x7fffffffu); return Scalar::float_v{value.f}; } Vc_INTRINSIC Vc_CONST Scalar::double_v copysign(Scalar::double_v mag, Scalar::double_v sign) { union { double f; unsigned long long i; } value, s; value.f = mag.data(); s.f = sign.data(); value.i = (s.i & 0x8000000000000000ull) | (value.i & 0x7fffffffffffffffull); return Scalar::double_v{value.f}; } #define Vc_MINMAX(V) \ static Vc_ALWAYS_INLINE Scalar::V min(const Scalar::V &x, const Scalar::V &y) \ { \ return Scalar::V(std::min(x.data(), y.data())); \ } \ static Vc_ALWAYS_INLINE Scalar::V max(const Scalar::V &x, const Scalar::V &y) \ { \ return Scalar::V(std::max(x.data(), y.data())); \ } Vc_ALL_VECTOR_TYPES(Vc_MINMAX); #undef Vc_MINMAX template static Vc_ALWAYS_INLINE Scalar::Vector sqrt (const Scalar::Vector &x) { return Scalar::Vector(std::sqrt(x.data())); } template static Vc_ALWAYS_INLINE Scalar::Vector rsqrt(const Scalar::Vector &x) { const typename Vector::EntryType one = 1; return Scalar::Vector(one / std::sqrt(x.data())); } template ::value || std::is_same::value || std::is_same::value>> Vc_ALWAYS_INLINE Vc_PURE Scalar::Vector abs(Scalar::Vector x) { return std::abs(x.data()); } Vc_ALWAYS_INLINE Vc_PURE Scalar::Vector abs(Scalar::Vector x) { return std::abs(static_cast(x.data())); } template static Vc_ALWAYS_INLINE void sincos(const Scalar::Vector &x, Scalar::Vector *sin, Scalar::Vector *cos) { #if defined(_WIN32) || defined(__APPLE__) sin->data() = std::sin(x.data()); cos->data() = 
std::cos(x.data());
#elif Vc_HAS_BUILTIN(__builtin_sincosf) || defined Vc_GCC
    __builtin_sincosf(x.data(), &sin->data(), &cos->data());
#else
    sincosf(x.data(), &sin->data(), &cos->data());
#endif
}
template <>
Vc_ALWAYS_INLINE void sincos(const Scalar::Vector<double> &x, Scalar::Vector<double> *sin,
                             Scalar::Vector<double> *cos)
{
#if defined(_WIN32) || defined(__APPLE__)
    sin->data() = std::sin(x.data());
    cos->data() = std::cos(x.data());
#elif Vc_HAS_BUILTIN(__builtin_sincos) || defined Vc_GCC
    __builtin_sincos(x.data(), &sin->data(), &cos->data());
#else
    ::sincos(x.data(), &sin->data(), &cos->data());
#endif
}
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> sin(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::sin(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> asin(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::asin(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> cos(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::cos(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::log(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log10(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::log10(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log2(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::log2(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> exp(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::exp(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> atan(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::atan(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> atan2(const Scalar::Vector<T> &x, const Scalar::Vector<T> &y) { return Scalar::Vector<T>(std::atan2(x.data(), y.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> trunc(const Scalar::Vector<T> &x) { return std::trunc(x.data()); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> floor(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::floor(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> ceil(const Scalar::Vector<T> &x) { return Scalar::Vector<T>(std::ceil(x.data())); }
template <typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> round(const Scalar::Vector<T> &x) { return x; }
namespace {
template <typename T> bool _realIsEvenHalf(T x)
{
    const T two = 2;
    const T half = 0.5;
    const T f = std::floor(x * half) * two;
    return (x - f) == half;
}
}  // unnamed namespace
template <> Vc_ALWAYS_INLINE Scalar::Vector<float> round(const Scalar::Vector<float> &x)
{
    return Scalar::float_v(std::floor(x.data() + 0.5f) - (_realIsEvenHalf(x.data()) ? 1.f : 0.f));
}
template <> Vc_ALWAYS_INLINE Scalar::Vector<double> round(const Scalar::Vector<double> &x)
{
    return Scalar::double_v(std::floor(x.data() + 0.5) - (_realIsEvenHalf(x.data()) ? 1. : 0.
)); } template static Vc_ALWAYS_INLINE Scalar::Vector reciprocal(const Scalar::Vector &x) { const typename Vector::EntryType one = 1; return Scalar::Vector(one / x.data()); } #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif template static Vc_ALWAYS_INLINE typename Vector::Mask isfinite(const Scalar::Vector &x) { return typename Vector::Mask( #ifdef _MSC_VER !!_finite(x.data()) #elif defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1500 ::isfinite(x.data()) #else std::isfinite(x.data()) #endif ); } template Vc_ALWAYS_INLINE typename Vector::Mask isinf(const Scalar::Vector &x) { return typename Vector::Mask(std::isinf(x.data())); } template static Vc_ALWAYS_INLINE typename Vector::Mask isnan(const Scalar::Vector &x) { return typename Vector::Mask( #ifdef _MSC_VER !!_isnan(x.data()) #elif defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1500 ::isnan(x.data()) #else std::isnan(x.data()) #endif ); } Vc_ALWAYS_INLINE Scalar::Vector frexp(Scalar::Vector x, SimdArray, 1> *e) { return Scalar::float_v(std::frexp(x.data(), &internal_data(*e).data())); } Vc_ALWAYS_INLINE Scalar::Vector frexp(Scalar::Vector x, SimdArray, 1> *e) { return Scalar::double_v(std::frexp(x.data(), &internal_data(*e).data())); } Vc_ALWAYS_INLINE Scalar::Vector ldexp(Scalar::Vector x, const SimdArray, 1> &e) { return Scalar::float_v(std::ldexp(x.data(), internal_data(e).data())); } Vc_ALWAYS_INLINE Scalar::Vector ldexp(Scalar::Vector x, const SimdArray, 1> &e) { return Scalar::double_v(std::ldexp(x.data(), internal_data(e).data())); } template Vc_ALWAYS_INLINE Vector fma(Vector a, Vector b, Vector c) { if (std::is_integral::value) { return a * b + c; } else { return std::fma(a.data(), b.data(), c.data()); } } } #endif #ifndef Vc_SCALAR_SIMD_CAST_CALLER_TCC_ #define Vc_SCALAR_SIMD_CAST_CALLER_TCC_ namespace Vc_VERSIONED_NAMESPACE { #if Vc_IS_VERSION_1 template template Vc_INTRINSIC Mask::Mask( U &&rhs, Common::enable_if_mask_converts_explicitly) : Mask(simd_cast(std::forward(rhs))) { } #endif } #endif #if defined(Vc_IMPL_SSE) #ifndef VC_SSE_DEINTERLEAVE_H_ #define VC_SSE_DEINTERLEAVE_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template inline void deinterleave(SSE::float_v &, SSE::float_v &, const float *, A); template inline void deinterleave(SSE::float_v &, SSE::float_v &, const short *, A); template inline void deinterleave(SSE::float_v &, SSE::float_v &, const ushort *, A); template inline void deinterleave(SSE::double_v &, SSE::double_v &, const double *, A); template inline void deinterleave(SSE::int_v &, SSE::int_v &, const int *, A); template inline void deinterleave(SSE::int_v &, SSE::int_v &, const short *, A); template inline void deinterleave(SSE::uint_v &, SSE::uint_v &, const uint *, A); template inline void deinterleave(SSE::uint_v &, SSE::uint_v &, const ushort *, A); template inline void deinterleave(SSE::short_v &, SSE::short_v &, const short *, A); template inline void deinterleave(SSE::ushort_v &, SSE::ushort_v &, const ushort *, A); Vc_ALWAYS_INLINE_L void prefetchForOneRead(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L void prefetchForModify(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L void prefetchClose(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L void prefetchMid(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE_L void prefetchFar(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R; } } namespace Vc_VERSIONED_NAMESPACE { namespace SSE { inline void deinterleave(Vector 
<float> &a, Vector<float> &b)
{
    const __m128 tmp0 = _mm_unpacklo_ps(a.data(), b.data());
    const __m128 tmp1 = _mm_unpackhi_ps(a.data(), b.data());
    a.data() = _mm_unpacklo_ps(tmp0, tmp1);
    b.data() = _mm_unpackhi_ps(tmp0, tmp1);
}
inline void deinterleave(Vector<float> &a, Vector<float> &b, Vector<short>::AsArg tmp)
{
    a.data() = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16));
    b.data() = _mm_cvtepi32_ps(_mm_srai_epi32(tmp.data(), 16));
}
inline void deinterleave(Vector<float> &a, Vector<float> &b, Vector<unsigned short>::AsArg tmp)
{
    a.data() = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16));
    b.data() = _mm_cvtepi32_ps(_mm_srli_epi32(tmp.data(), 16));
}
inline void deinterleave(Vector<double> &a, Vector<double> &b)
{
    __m128d tmp = _mm_unpacklo_pd(a.data(), b.data());
    b.data() = _mm_unpackhi_pd(a.data(), b.data());
    a.data() = tmp;
}
inline void deinterleave(Vector<int> &a, Vector<int> &b)
{
    const __m128i tmp0 = _mm_unpacklo_epi32(a.data(), b.data());
    const __m128i tmp1 = _mm_unpackhi_epi32(a.data(), b.data());
    a.data() = _mm_unpacklo_epi32(tmp0, tmp1);
    b.data() = _mm_unpackhi_epi32(tmp0, tmp1);
}
inline void deinterleave(Vector<unsigned int> &a, Vector<unsigned int> &b)
{
    const __m128i tmp0 = _mm_unpacklo_epi32(a.data(), b.data());
    const __m128i tmp1 = _mm_unpackhi_epi32(a.data(), b.data());
    a.data() = _mm_unpacklo_epi32(tmp0, tmp1);
    b.data() = _mm_unpackhi_epi32(tmp0, tmp1);
}
inline void deinterleave(Vector<short> &a, Vector<short> &b)
{
    __m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data());
    __m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data());
    __m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
    __m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
    a.data() = _mm_unpacklo_epi16(tmp2, tmp3);
    b.data() = _mm_unpackhi_epi16(tmp2, tmp3);
}
inline void deinterleave(Vector<unsigned short> &a, Vector<unsigned short> &b)
{
    __m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data());
    __m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data());
    __m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
    __m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
    a.data() = _mm_unpacklo_epi16(tmp2, tmp3);
    b.data() = _mm_unpackhi_epi16(tmp2, tmp3);
}
inline void deinterleave(Vector<int> &a, Vector<int> &b, Vector<short>::AsArg tmp)
{
    a.data() = _mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16);
    b.data() = _mm_srai_epi32(tmp.data(), 16);
}
inline void deinterleave(Vector<unsigned int> &a, Vector<unsigned int> &b, Vector<unsigned short>::AsArg tmp)
{
    a.data() = _mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16);
    b.data() = _mm_srli_epi32(tmp.data(), 16);
}
}  // namespace SSE
}  // namespace Vc_VERSIONED_NAMESPACE
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename A> inline void deinterleave(SSE::float_v &a, SSE::float_v &b, const float *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::float_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template <typename A> inline void deinterleave(SSE::float_v &a, SSE::float_v &b, const short *m, A align)
{
    SSE::short_v tmp(m, align);
    Vc::SSE::deinterleave(a, b, tmp);
}
template <typename A> inline void deinterleave(SSE::float_v &a, SSE::float_v &b, const unsigned short *m, A align)
{
    SSE::ushort_v tmp(m, align);
    Vc::SSE::deinterleave(a, b, tmp);
}
template <typename A> inline void deinterleave(SSE::double_v &a, SSE::double_v &b, const double *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::double_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template <typename A> inline void deinterleave(SSE::int_v &a, SSE::int_v &b, const int *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::int_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template <typename A> inline void deinterleave(SSE::int_v &a, SSE::int_v &b, const short *m, A align)
{
    SSE::short_v tmp(m, align);
    Vc::SSE::deinterleave(a, b, tmp);
}
template <typename A> inline void deinterleave(SSE::uint_v &a, SSE::uint_v &b, const unsigned int *m, A align)
{
    a.load(m,
align);
    b.load(m + SSE::uint_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template <typename A> inline void deinterleave(SSE::uint_v &a, SSE::uint_v &b, const unsigned short *m, A align)
{
    SSE::ushort_v tmp(m, align);
    Vc::SSE::deinterleave(a, b, tmp);
}
template <typename A> inline void deinterleave(SSE::short_v &a, SSE::short_v &b, const short *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::short_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
template <typename A> inline void deinterleave(SSE::ushort_v &a, SSE::ushort_v &b, const unsigned short *m, A align)
{
    a.load(m, align);
    b.load(m + SSE::ushort_v::Size, align);
    Vc::SSE::deinterleave(a, b);
}
}  // namespace Detail
}  // namespace Vc_VERSIONED_NAMESPACE
#ifndef VC_SSE_PREFETCHES_TCC_
#define VC_SSE_PREFETCHES_TCC_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Sse)
{
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_NTA);
}
Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Sse)
{
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
}
Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Sse)
{
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T1);
}
Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Sse)
{
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T2);
}
Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Sse)
{
#ifdef __3dNOW__
    _m_prefetchw(const_cast<void *>(addr));
#else
    _mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
#endif
}
}  // namespace Detail
}  // namespace Vc_VERSIONED_NAMESPACE
#endif
#endif
#ifndef VC_SSE_MATH_H_
#define VC_SSE_MATH_H_
#ifndef VC_SSE_CONST_H_
#define VC_SSE_CONST_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
template <typename T> struct Const
{
    typedef Vector<T> V;
    typedef Mask<T> M;
    enum Constants { Stride = 16 / sizeof(T) };
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4()         { return load(&c_trig<T>::data[0 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi()      { return load(&c_trig<T>::data[1 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1()    { return load(&c_trig<T>::data[2 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2()    { return load(&c_trig<T>::data[3 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _1_16()         { return load(&c_trig<T>::data[4 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _16()           { return load(&c_trig<T>::data[5 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i)    { return load(&c_trig<T>::data[(12 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i)    { return load(&c_trig<T>::data[(17 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi()    { return load(&c_trig<T>::data[22 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo()    { return load(&c_trig<T>::data[23 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem()     { return load(&c_trig<T>::data[24 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return load(&c_trig<T>::data[8 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _4_pi()         { return load(&c_trig<T>::data[9 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi_2()         { return load(&c_trig<T>::data[10 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V _pi()           { return load(&c_trig<T>::data[11 * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return load(&c_trig<T>::data[(28 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return load(&c_trig<T>::data[(33 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return load(&c_trig<T>::data[(37 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return load(&c_trig<T>::data[(43 + i) * Stride]); }
    static Vc_ALWAYS_INLINE Vc_CONST V
smallAsinInput() { return load(&c_trig::data[25 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return load(&c_trig::data[26 * Stride]); } static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(load(c_log::d(1)).data()); } static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return load(c_log::d(18)); } static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return load(c_log::d(15)); } static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return load(c_log::d(2 + i)); } static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return load(c_log::d(8 + i)); } static Vc_ALWAYS_INLINE Vc_CONST V min() { return load(c_log::d(14)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return load(c_log::d(17)); } static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return load(c_log::d(16)); } static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return load(c_log::d(13)); } static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return load(c_log::d(19)); } static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return load(c_log::d(20)); } static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R; static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R; private: static Vc_ALWAYS_INLINE_L Vc_CONST_L V load(const T *mem) Vc_ALWAYS_INLINE_R Vc_CONST_R; }; template Vc_ALWAYS_INLINE Vc_CONST Vector Const::load(const T *mem) { return V(mem); } template <> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return Vector(reinterpret_cast(&c_general::highMaskFloat)); } template <> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask() { return Vector( reinterpret_cast(&c_general::highMaskDouble)); } template <> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { return _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits)); } template <> Vc_ALWAYS_INLINE Vc_CONST Vector Const::highMask(int bits) { return _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits)); } } } #endif namespace Vc_VERSIONED_NAMESPACE { Vc_INTRINSIC Vc_CONST SSE::float_v copysign(SSE::float_v mag, SSE::float_v sign) { return _mm_or_ps(_mm_and_ps(sign.data(), SSE::_mm_setsignmask_ps()), _mm_and_ps(mag.data(), SSE::_mm_setabsmask_ps())); } Vc_INTRINSIC Vc_CONST SSE::double_v copysign(SSE::double_v mag, SSE::double_v sign) { return _mm_or_pd(_mm_and_pd(sign.data(), SSE::_mm_setsignmask_pd()), _mm_and_pd(mag.data(), SSE::_mm_setabsmask_pd())); } inline SSE::double_v frexp(const SSE::double_v &v, SimdArray *e) { const __m128i exponentBits = SSE::Const::exponentMask().dataI(); const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits); SSE::int_v exponent = _mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe)); const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits)); SSE::double_v ret = _mm_and_pd( exponentMaximized, _mm_load_pd(reinterpret_cast(&SSE::c_general::frexpMask[0]))); SSE::double_m zeroMask = v == SSE::double_v::Zero(); ret(isnan(v) || !isfinite(v) || zeroMask) = v; exponent.setZero(zeroMask.data()); (*e)[0] = exponent[0]; (*e)[1] = exponent[2]; return ret; } inline SSE::float_v frexp(const SSE::float_v &v, SimdArray *e) { const __m128i exponentBits = SSE::Const::exponentMask().dataI(); const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits); internal_data(*e) = _mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e)); const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits)); SSE::float_v ret = _mm_and_ps(exponentMaximized, 
_mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu)));
    ret(isnan(v) || !isfinite(v) || v == SSE::float_v::Zero()) = v;
    e->setZero(v == SSE::float_v::Zero());
    return ret;
}
inline SSE::double_v ldexp(SSE::double_v::AsArg v, const SimdArray<int, 2> &_e)
{
    SSE::int_v e = _mm_setr_epi32(_e[0], 0, _e[1], 0);
    e.setZero((v == SSE::double_v::Zero()).dataI());
    const __m128i exponentBits = _mm_slli_epi64(e.data(), 52);
    return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits));
}
inline SSE::float_v ldexp(SSE::float_v::AsArg v, const SimdArray<int, 4> &_e)
{
    SSE::int_v e = internal_data(_e);
    e.setZero(simd_cast<SSE::int_m>(v == SSE::float_v::Zero()));
    return reinterpret_components_cast<SSE::float_v>(
        reinterpret_components_cast<SSE::int_v>(v) + (e << 23));
}
#ifdef Vc_IMPL_SSE4_1
inline SSE::double_v trunc(SSE::double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); }
inline SSE::float_v trunc(SSE::float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); }
inline SSE::double_v floor(SSE::double_v::AsArg v) { return _mm_floor_pd(v.data()); }
inline SSE::float_v floor(SSE::float_v::AsArg v) { return _mm_floor_ps(v.data()); }
inline SSE::double_v ceil(SSE::double_v::AsArg v) { return _mm_ceil_pd(v.data()); }
inline SSE::float_v ceil(SSE::float_v::AsArg v) { return _mm_ceil_ps(v.data()); }
#else
// SSE2 fallback: emulate truncation without the SSE4.1 rounding instructions.
inline SSE::Vector<float> trunc(SSE::Vector<float> x)
{
    const auto truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x.data()));
    const auto no_fractional_values = _mm_castsi128_ps(_mm_cmplt_epi32(
        _mm_and_si128(_mm_castps_si128(x.data()), _mm_set1_epi32(0x7f800000u)),
        _mm_set1_epi32(0x4b000000)));
    return _mm_or_ps(_mm_andnot_ps(no_fractional_values, x.data()),
                     _mm_and_ps(no_fractional_values, truncated));
}
inline SSE::Vector<double> trunc(SSE::Vector<double> x)
{
    const auto abs_x = Vc::abs(x).data();
    const auto min_no_fractional_bits =
        _mm_castsi128_pd(_mm_set1_epi64x(0x4330000000000000ull));
    __m128d truncated =
        _mm_sub_pd(_mm_add_pd(abs_x, min_no_fractional_bits), min_no_fractional_bits);
    truncated = _mm_sub_pd(truncated,
                           _mm_and_pd(_mm_cmplt_pd(abs_x, truncated), _mm_set1_pd(1.)));
    return _mm_or_pd(
        _mm_and_pd(_mm_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ull)), x.data()),
        truncated);
}
template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> x)
{
    auto y = trunc(x);
    y(!(y == x) && x < 0) -= 1;
    return y;
}
template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> x)
{
    auto y = trunc(x);
    y(!(y == x || x < 0)) += 1;
    return y;
}
#endif
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> fma(Vector<T, VectorAbi::Sse> a,
                                               Vector<T, VectorAbi::Sse> b,
                                               Vector<T, VectorAbi::Sse> c)
{
    SSE::VectorHelper<T>::fma(a.data(), b.data(), c.data());
    return a;
}
}
#endif
#ifndef Vc_SSE_SIMD_CAST_CALLER_TCC_
#define Vc_SSE_SIMD_CAST_CALLER_TCC_
namespace Vc_VERSIONED_NAMESPACE
{
#if Vc_IS_VERSION_1
template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Sse>::Mask(U &&rhs,
                                           Common::enable_if_mask_converts_explicitly<T, U>)
    : Mask(Vc::simd_cast<Mask>(std::forward<U>(rhs)))
{
}
#endif
}
#endif
#endif
#if defined(Vc_IMPL_AVX)
#ifndef VC_AVX_HELPERIMPL_H_
#define VC_AVX_HELPERIMPL_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename A> inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const float *, A);
template <typename A> inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const short *, A);
template <typename A> inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const ushort *, A);
template <typename A> inline void deinterleave(AVX2::double_v &, AVX2::double_v &, const double *, A);
template <typename A> inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const int *, A);
template <typename A> inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const short *, A);
template <typename A> inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const uint *, A);
template <typename A> inline
void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const ushort *, A); template inline void deinterleave(AVX2::short_v &, AVX2::short_v &, const short *, A); template inline void deinterleave(AVX2::ushort_v &, AVX2::ushort_v &, const ushort *, A); template Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, AVX2::Vector &Vc_RESTRICT e, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE_L void deinterleave( AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, AVX2::Vector &Vc_RESTRICT e, AVX2::Vector &Vc_RESTRICT f, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; template Vc_ALWAYS_INLINE_L void deinterleave( AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, AVX2::Vector &Vc_RESTRICT d, AVX2::Vector &Vc_RESTRICT e, AVX2::Vector &Vc_RESTRICT f, AVX2::Vector &Vc_RESTRICT g, AVX2::Vector &Vc_RESTRICT h, const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R; Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Avx) { prefetchForOneRead(addr, VectorAbi::Sse()); } Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Avx) { prefetchForModify(addr, VectorAbi::Sse()); } Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Avx) { prefetchClose(addr, VectorAbi::Sse()); } Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Avx) { prefetchMid(addr, VectorAbi::Sse()); } Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Avx) { prefetchFar(addr, VectorAbi::Sse()); } } } namespace Vc_VERSIONED_NAMESPACE { namespace AVX2 { inline void deinterleave(double_v &Vc_RESTRICT a, double_v &Vc_RESTRICT b, double_v &Vc_RESTRICT c) { const m256d tmp0 = Mem::shuffle128(a.data(), b.data()); const m256d tmp1 = Mem::shuffle128(a.data(), c.data()); const m256d tmp2 = Mem::shuffle128(b.data(), c.data()); a.data() = Mem::shuffle(tmp0, tmp1); b.data() = Mem::shuffle(tmp0, tmp2); c.data() = Mem::shuffle(tmp1, tmp2); } inline void deinterleave(float_v &Vc_RESTRICT a, float_v &Vc_RESTRICT b, float_v &Vc_RESTRICT c) { const m256 ac0 = Mem::shuffle128(a.data(), c.data()); const m256 ac1 = Mem::shuffle128(a.data(), c.data()); m256 tmp0 = Mem::blend( ac0, b.data()); tmp0 = Mem::blend(tmp0, ac1); m256 tmp1 = Mem::blend( ac0, b.data()); tmp1 = Mem::blend(tmp1, ac1); m256 tmp2 = Mem::blend( ac0, b.data()); tmp2 = Mem::blend(tmp2, ac1); a.data() = Mem::permute(tmp0); b.data() = Mem::permute(tmp1); c.data() = Mem::permute(tmp2); } inline void deinterleave(int_v &Vc_RESTRICT a, int_v &Vc_RESTRICT b, int_v &Vc_RESTRICT c) { deinterleave(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c)); } inline void deinterleave(uint_v &Vc_RESTRICT a, uint_v &Vc_RESTRICT b, uint_v &Vc_RESTRICT c) { deinterleave(reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c)); } inline void deinterleave(Vector &Vc_RESTRICT , Vector &Vc_RESTRICT , Vector &Vc_RESTRICT ) { return; } inline void deinterleave(Vector &Vc_RESTRICT a, Vector 
&Vc_RESTRICT b, Vector &Vc_RESTRICT c) { deinterleave(reinterpret_cast &>(a), reinterpret_cast &>(b), reinterpret_cast &>(c)); } inline void deinterleave(Vector &a, Vector &b) { const m256 tmp0 = Reg::permute128(a.data(), b.data()); const m256 tmp1 = Reg::permute128(a.data(), b.data()); const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); a.data() = _mm256_unpacklo_ps(tmp2, tmp3); b.data() = _mm256_unpackhi_ps(tmp2, tmp3); } inline void deinterleave(Vector &a, Vector &b) { auto v0 = Mem::shuffle128(a.data(), b.data()); auto v1 = Mem::shuffle128(a.data(), b.data()); auto v2 = AVX::unpacklo_epi16(v0, v1); auto v3 = AVX::unpackhi_epi16(v0, v1); v0 = AVX::unpacklo_epi16(v2, v3); v1 = AVX::unpackhi_epi16(v2, v3); a.data() = AVX::unpacklo_epi16(v0, v1); b.data() = AVX::unpackhi_epi16(v0, v1); } inline void deinterleave(Vector &a, Vector &b) { auto v0 = Mem::shuffle128(a.data(), b.data()); auto v1 = Mem::shuffle128(a.data(), b.data()); auto v2 = AVX::unpacklo_epi16(v0, v1); auto v3 = AVX::unpackhi_epi16(v0, v1); v0 = AVX::unpacklo_epi16(v2, v3); v1 = AVX::unpackhi_epi16(v2, v3); a.data() = AVX::unpacklo_epi16(v0, v1); b.data() = AVX::unpackhi_epi16(v0, v1); } } namespace Detail { template inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const float *m, Flags align) { a.load(m, align); b.load(m + AVX2::float_v::Size, align); Vc::AVX2::deinterleave(a, b); } template inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const short *m, Flags f) { using namespace Vc::AVX2; const auto tmp = Detail::load32(m, f); a.data() = _mm256_cvtepi32_ps(concat(_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16), _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16))); b.data() = _mm256_cvtepi32_ps( concat(_mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16))); } template inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const unsigned short *m, Flags f) { using namespace Vc::AVX2; const auto tmp = Detail::load32(m, f); a.data() = _mm256_cvtepi32_ps( concat(_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa), _mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa))); b.data() = _mm256_cvtepi32_ps( concat(_mm_srli_epi32(lo128(tmp), 16), _mm_srli_epi32(hi128(tmp), 16))); } template inline void deinterleave(AVX2::double_v &a, AVX2::double_v &b, const double *m, Flags align) { using namespace Vc::AVX2; a.load(m, align); b.load(m + AVX2::double_v::Size, align); m256d tmp0 = Mem::shuffle128(a.data(), b.data()); m256d tmp1 = Mem::shuffle128(a.data(), b.data()); a.data() = _mm256_unpacklo_pd(tmp0, tmp1); b.data() = _mm256_unpackhi_pd(tmp0, tmp1); } template inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const int *m, Flags align) { using namespace AVX; a.load(m, align); b.load(m + AVX2::int_v::Size, align); const m256 tmp0 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp1 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); a.data() = avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); b.data() = avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); } template inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const short *m, Flags f) { using namespace Vc::AVX; const AVX2::short_v tmp0(m, f); const m256i tmp = tmp0.data(); a.data() = concat( _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16), _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)); b.data() = concat( _mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)); } 
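// Usage sketch (added for illustration; not part of the original source): the
// Detail::deinterleave overloads in this file back the public Vc::deinterleave
// helper, which splits interleaved AoS data into separate SoA vectors.
// Assuming the documented Vc 1.x API, a typical call looks like:
//
//     alignas(Vc::float_v::MemoryAlignment) float xy[2 * Vc::float_v::Size];
//     // ... fill xy with interleaved pairs {x0, y0, x1, y1, ...} ...
//     Vc::float_v x, y;
//     Vc::deinterleave(&x, &y, xy, Vc::Aligned);  // x = xy[0,2,4,...], y = xy[1,3,5,...]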
template inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned int *m, Flags align) { using namespace AVX; a.load(m, align); b.load(m + AVX2::uint_v::Size, align); const m256 tmp0 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp1 = avx_cast(Mem::shuffle128(a.data(), b.data())); const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); a.data() = avx_cast(_mm256_unpacklo_ps(tmp2, tmp3)); b.data() = avx_cast(_mm256_unpackhi_ps(tmp2, tmp3)); } template inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned short *m, Flags f) { using namespace Vc::AVX; const AVX2::ushort_v tmp0(m, f); const m256i tmp = tmp0.data(); a.data() = concat( _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16), _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)); b.data() = concat( _mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)); } template inline void deinterleave(AVX2::short_v &a, AVX2::short_v &b, const short *m, Flags align) { a.load(m, align); b.load(m + AVX2::short_v::Size, align); Vc::AVX2::deinterleave(a, b); } template inline void deinterleave(AVX2::ushort_v &a, AVX2::ushort_v &b, const unsigned short *m, Flags align) { a.load(m, align); b.load(m + AVX2::ushort_v::Size, align); Vc::AVX2::deinterleave(a, b); } template Vc_ALWAYS_INLINE void deinterleave(AVX2::Vector &Vc_RESTRICT a, AVX2::Vector &Vc_RESTRICT b, AVX2::Vector &Vc_RESTRICT c, const M *Vc_RESTRICT memory, Flags align) { using V = AVX2::Vector; a.load(&memory[0 * V::Size], align); b.load(&memory[1 * V::Size], align); c.load(&memory[2 * V::Size], align); Vc::AVX2::deinterleave(a, b, c); } } } #endif #ifndef VC_AVX_MATH_H_ #define VC_AVX_MATH_H_ namespace Vc_VERSIONED_NAMESPACE { #ifdef Vc_IMPL_AVX2 Vc_ALWAYS_INLINE AVX2::int_v min(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_min_epi32(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::uint_v min(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::short_v min(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_min_epi16(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::int_v max(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_max_epi32(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::uint_v max(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::short_v max(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_max_epi16(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); } #endif Vc_ALWAYS_INLINE AVX2::float_v min(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_min_ps(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::float_v max(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_max_ps(x.data(), y.data()); } Vc_ALWAYS_INLINE AVX2::double_v max(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_max_pd(x.data(), y.data()); } template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector sqrt(const AVX2::Vector &x) { return AVX::VectorHelper::sqrt(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector rsqrt(const AVX2::Vector &x) { return 
AVX::VectorHelper::rsqrt(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector reciprocal(const AVX2::Vector &x) { return AVX::VectorHelper::reciprocal(x.data()); } template Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector round(const AVX2::Vector &x) { return AVX::VectorHelper::round(x.data()); } Vc_INTRINSIC Vc_CONST AVX2::double_v abs(AVX2::double_v x) { return Detail::and_(x.data(), AVX::setabsmask_pd()); } Vc_INTRINSIC Vc_CONST AVX2::float_v abs(AVX2::float_v x) { return Detail::and_(x.data(), AVX::setabsmask_ps()); } #ifdef Vc_IMPL_AVX2 Vc_INTRINSIC Vc_CONST AVX2::int_v abs(AVX2::int_v x) { return _mm256_abs_epi32(x.data()); } Vc_INTRINSIC Vc_CONST AVX2::short_v abs(AVX2::short_v x) { return _mm256_abs_epi16(x.data()); } #endif Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isfinite(const AVX2::double_v &x) { return AVX::cmpord_pd(x.data(), _mm256_mul_pd(Detail::zero<__m256d>(), x.data())); } Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isfinite(const AVX2::float_v &x) { return AVX::cmpord_ps(x.data(), _mm256_mul_ps(Detail::zero<__m256>(), x.data())); } Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isinf(const AVX2::double_v &x) { return _mm256_castsi256_pd(AVX::cmpeq_epi64( _mm256_castpd_si256(abs(x).data()), _mm256_castpd_si256(Detail::avx_broadcast(AVX::c_log::d(1))))); } Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isinf(const AVX2::float_v &x) { return _mm256_castsi256_ps( AVX::cmpeq_epi32(_mm256_castps_si256(abs(x).data()), _mm256_castps_si256(Detail::avx_broadcast(AVX::c_log::d(1))))); } Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isnan(const AVX2::double_v &x) { return AVX::cmpunord_pd(x.data(), x.data()); } Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isnan(const AVX2::float_v &x) { return AVX::cmpunord_ps(x.data(), x.data()); } Vc_INTRINSIC Vc_CONST AVX2::float_v copysign(AVX2::float_v mag, AVX2::float_v sign) { return _mm256_or_ps(_mm256_and_ps(sign.data(), AVX::setsignmask_ps()), _mm256_and_ps(mag.data(), AVX::setabsmask_ps())); } Vc_INTRINSIC Vc_CONST AVX2::double_v copysign(AVX2::double_v::AsArg mag, AVX2::double_v::AsArg sign) { return _mm256_or_pd(_mm256_and_pd(sign.data(), AVX::setsignmask_pd()), _mm256_and_pd(mag.data(), AVX::setabsmask_pd())); } inline AVX2::double_v frexp(AVX2::double_v::AsArg v, SimdArray *e) { const __m256d exponentBits = AVX::Const::exponentMask().dataD(); const __m256d exponentPart = _mm256_and_pd(v.data(), exponentBits); auto lo = AVX::avx_cast<__m128i>(AVX::lo128(exponentPart)); auto hi = AVX::avx_cast<__m128i>(AVX::hi128(exponentPart)); lo = _mm_sub_epi32(_mm_srli_epi64(lo, 52), _mm_set1_epi64x(0x3fe)); hi = _mm_sub_epi32(_mm_srli_epi64(hi, 52), _mm_set1_epi64x(0x3fe)); SSE::int_v exponent = Mem::shuffle(lo, hi); const __m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits); AVX2::double_v ret = _mm256_and_pd(exponentMaximized, _mm256_broadcast_sd(reinterpret_cast(&AVX::c_general::frexpMask))); const double_m zeroMask = v == AVX2::double_v::Zero(); ret(isnan(v) || !isfinite(v) || zeroMask) = v; exponent.setZero(simd_cast(zeroMask)); internal_data(*e) = exponent; return ret; } #ifdef Vc_IMPL_AVX2 inline SimdArray frexp(const SimdArray &v, SimdArray *e) { const __m256d exponentBits = AVX::Const::exponentMask().dataD(); const __m256d w[2] = {internal_data(internal_data0(v)).data(), internal_data(internal_data1(v)).data()}; const __m256i exponentPart[2] = { _mm256_castpd_si256(_mm256_and_pd(w[0], exponentBits)), _mm256_castpd_si256(_mm256_and_pd(w[1], exponentBits))}; const __m256i lo = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[0], 52), _mm256_set1_epi32(0x3fe)); const 
__m256i hi = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[1], 52), _mm256_set1_epi32(0x3fe)); const __m256i a = _mm256_unpacklo_epi32(lo, hi); const __m256i b = _mm256_unpackhi_epi32(lo, hi); const __m256i tmp = _mm256_unpacklo_epi32(a, b); const __m256i exponent = AVX::concat(_mm_unpacklo_epi64(AVX::lo128(tmp), AVX::hi128(tmp)), _mm_unpackhi_epi64(AVX::lo128(tmp), AVX::hi128(tmp))); const __m256d exponentMaximized[2] = {_mm256_or_pd(w[0], exponentBits), _mm256_or_pd(w[1], exponentBits)}; const auto frexpMask = _mm256_broadcast_sd(reinterpret_cast(&AVX::c_general::frexpMask)); fixed_size_simd ret = { fixed_size_simd( AVX::double_v(_mm256_and_pd(exponentMaximized[0], frexpMask))), fixed_size_simd( AVX::double_v(_mm256_and_pd(exponentMaximized[1], frexpMask)))}; const auto zeroMask = v == v.Zero(); ret(isnan(v) || !isfinite(v) || zeroMask) = v; internal_data(*e) = Detail::andnot_(simd_cast(zeroMask).dataI(), exponent); return ret; } #endif namespace Detail { Vc_INTRINSIC AVX2::float_v::IndexType extractExponent(__m256 e) { SimdArray exponentPart; const auto ee = AVX::avx_cast<__m256i>(e); #ifdef Vc_IMPL_AVX2 exponentPart = AVX2::uint_v(ee); #else internal_data(internal_data0(exponentPart)) = AVX::lo128(ee); internal_data(internal_data1(exponentPart)) = AVX::hi128(ee); #endif return (exponentPart >> 23) - 0x7e; } } inline AVX2::float_v frexp(AVX2::float_v::AsArg v, SimdArray *e) { using namespace Detail; using namespace AVX2; const __m256 exponentBits = Const::exponentMask().data(); *e = extractExponent(and_(v.data(), exponentBits)); const __m256 exponentMaximized = or_(v.data(), exponentBits); AVX2::float_v ret = _mm256_and_ps(exponentMaximized, avx_cast<__m256>(set1_epi32(0xbf7fffffu))); ret(isnan(v) || !isfinite(v) || v == AVX2::float_v::Zero()) = v; e->setZero(simd_cast(v == AVX2::float_v::Zero())); return ret; } inline AVX2::double_v ldexp(AVX2::double_v::AsArg v, const SimdArray &_e) { SSE::int_v e = internal_data(_e); e.setZero(simd_cast(v == AVX2::double_v::Zero())); const __m256i exponentBits = AVX::concat(_mm_slli_epi64(_mm_unpacklo_epi32(e.data(), e.data()), 52), _mm_slli_epi64(_mm_unpackhi_epi32(e.data(), e.data()), 52)); return AVX::avx_cast<__m256d>( AVX::add_epi64(AVX::avx_cast<__m256i>(v.data()), exponentBits)); } inline AVX2::float_v ldexp(AVX2::float_v::AsArg v, SimdArray e) { e.setZero(simd_cast(v == AVX2::float_v::Zero())); e <<= 23; #ifdef Vc_IMPL_AVX2 return {AVX::avx_cast<__m256>( AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())), AVX::lo128(internal_data(e).data())), _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())), AVX::hi128(internal_data(e).data()))))}; #else return {AVX::avx_cast<__m256>( AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())), internal_data(internal_data0(e)).data()), _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())), internal_data(internal_data1(e)).data())))}; #endif } Vc_ALWAYS_INLINE AVX2::float_v trunc(AVX2::float_v::AsArg v) { return _mm256_round_ps(v.data(), 0x3); } Vc_ALWAYS_INLINE AVX2::double_v trunc(AVX2::double_v::AsArg v) { return _mm256_round_pd(v.data(), 0x3); } Vc_ALWAYS_INLINE AVX2::float_v floor(AVX2::float_v::AsArg v) { return _mm256_floor_ps(v.data()); } Vc_ALWAYS_INLINE AVX2::double_v floor(AVX2::double_v::AsArg v) { return _mm256_floor_pd(v.data()); } Vc_ALWAYS_INLINE AVX2::float_v ceil(AVX2::float_v::AsArg v) { return _mm256_ceil_ps(v.data()); } Vc_ALWAYS_INLINE AVX2::double_v ceil(AVX2::double_v::AsArg v) { return _mm256_ceil_pd(v.data()); } template Vc_ALWAYS_INLINE 
Vector fma(Vector a, Vector b, Vector c) { return Detail::fma(a.data(), b.data(), c.data(), T()); } } #endif #ifndef Vc_AVX_SIMD_CAST_CALLER_TCC_ #define Vc_AVX_SIMD_CAST_CALLER_TCC_ namespace Vc_VERSIONED_NAMESPACE { #if Vc_IS_VERSION_1 template template Vc_INTRINSIC Vector::Vector(U &&x) : d(simd_cast(std::forward(x)).data()) { } template template Vc_INTRINSIC Mask::Mask(U &&rhs, Common::enable_if_mask_converts_explicitly) : Mask(simd_cast(std::forward(rhs))) { } #endif } #endif #endif #ifndef VC_COMMON_MATH_H_ #define VC_COMMON_MATH_H_ #define Vc_COMMON_MATH_H_INTERNAL 1 #ifndef VC_COMMON_TRIGONOMETRIC_H_ #define VC_COMMON_TRIGONOMETRIC_H_ #ifdef Vc_HAVE_LIBMVEC extern "C" { __m128 _ZGVbN4v_sinf(__m128); __m128d _ZGVbN2v_sin(__m128d); __m128 _ZGVbN4v_cosf(__m128); __m128d _ZGVbN2v_cos(__m128d); __m256 _ZGVdN8v_sinf(__m256); __m256d _ZGVdN4v_sin(__m256d); __m256 _ZGVdN8v_cosf(__m256); __m256d _ZGVdN4v_cos(__m256d); } #endif namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template struct MapImpl { enum Dummy { Value = Impl }; }; template<> struct MapImpl { enum Dummy { Value = MapImpl::Value }; }; template using TrigonometricImplementation = ImplementationT::Value #if defined(Vc_IMPL_XOP) && defined(Vc_IMPL_FMA4) + Vc::XopInstructions + Vc::Fma4Instructions #endif >; } namespace Common { template struct Trigonometric { template static T Vc_VDECL sin(const T &_x); template static T Vc_VDECL cos(const T &_x); template static void Vc_VDECL sincos(const T &_x, T *_sin, T *_cos); template static T Vc_VDECL asin (const T &_x); template static T Vc_VDECL atan (const T &_x); template static T Vc_VDECL atan2(const T &y, const T &x); }; } #if defined Vc_IMPL_SSE || defined DOXYGEN namespace Detail { template using Trig = Common::Trigonometric::value ? SSE42Impl : std::is_same::value ? 
AVXImpl : ScalarImpl)>>; } #ifdef Vc_HAVE_LIBMVEC Vc_INTRINSIC __m128 sin_dispatch(__m128 x) { return ::_ZGVbN4v_sinf(x); } Vc_INTRINSIC __m128d sin_dispatch(__m128d x) { return ::_ZGVbN2v_sin (x); } Vc_INTRINSIC __m128 cos_dispatch(__m128 x) { return ::_ZGVbN4v_cosf(x); } Vc_INTRINSIC __m128d cos_dispatch(__m128d x) { return ::_ZGVbN2v_cos (x); } #ifdef Vc_IMPL_AVX Vc_INTRINSIC __m256 sin_dispatch(__m256 x) { return ::_ZGVdN8v_sinf(x); } Vc_INTRINSIC __m256d sin_dispatch(__m256d x) { return ::_ZGVdN4v_sin (x); } Vc_INTRINSIC __m256 cos_dispatch(__m256 x) { return ::_ZGVdN8v_cosf(x); } Vc_INTRINSIC __m256d cos_dispatch(__m256d x) { return ::_ZGVdN4v_cos (x); } #endif template Vc_INTRINSIC Vector> sin(const Vector &x) { return sin_dispatch(x.data()); } template Vc_INTRINSIC Vector> cos(const Vector &x) { return cos_dispatch(x.data()); } #else template Vc_INTRINSIC Vector> sin(const Vector &x) { return Detail::Trig::sin(x); } template Vc_INTRINSIC Vector> cos(const Vector &x) { return Detail::Trig::cos(x); } #endif template Vc_INTRINSIC Vector> asin(const Vector &x) { return Detail::Trig::asin(x); } template Vc_INTRINSIC Vector> atan(const Vector &x) { return Detail::Trig::atan(x); } template Vc_INTRINSIC Vector> atan2(const Vector &y, const Vector &x) { return Detail::Trig::atan2(y, x); } template Vc_INTRINSIC void sincos(const Vector &x, Vector> *sin, Vector *cos) { Detail::Trig::sincos(x, sin, cos); } #endif } #endif #ifndef VC_COMMON_CONST_H_ #define VC_COMMON_CONST_H_ #include namespace Vc_VERSIONED_NAMESPACE { namespace Detail { template constexpr double exponentToFloat(std::integral_constant); template constexpr double exponentToFloat(std::integral_constant); template <> constexpr double exponentToFloat<0>(std::integral_constant) { return 1.; } template <> constexpr double exponentToFloat<0>(std::integral_constant) { return 1.; } template <> constexpr double exponentToFloat<-32>(std::integral_constant) { return 1. / (65536. * 65536.); } template <> constexpr double exponentToFloat<32>(std::integral_constant) { return 65536. * 65536.; } template <> constexpr double exponentToFloat<-64>(std::integral_constant) { return 1. / (65536. * 65536. * 65536. * 65536.); } template <> constexpr double exponentToFloat<64>(std::integral_constant) { return 65536. * 65536. * 65536. 
* 65536.; } template constexpr double exponentToFloat(std::integral_constant negative) { return exponentToFloat(negative) * 2.0; } template constexpr double exponentToFloat(std::integral_constant negative) { return exponentToFloat(negative) * 0.5; } template constexpr double doubleConstant() { return (static_cast((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) / 0x0010000000000000ull) * exponentToFloat(std::integral_constant()) * sign; } template constexpr float floatConstant() { return (static_cast((mantissa & 0x007fffffu) | 0x00800000u) / 0x00800000u) * static_cast( exponentToFloat(std::integral_constant())) * sign; } } } #endif namespace Vc_VERSIONED_NAMESPACE { template SimdArray::size()> fpclassify(const Vector &x) { return SimdArray::size()>( [&](std::size_t i) { return std::fpclassify(x[i]); }); } template SimdArray fpclassify(const SimdArray &x) { return SimdArray([&](std::size_t i) { return std::fpclassify(x[i]); }); } #ifdef Vc_IMPL_SSE #ifdef Vc_COMMON_MATH_H_INTERNAL enum LogarithmBase { BaseE, Base10, Base2 }; namespace Detail { template using Const = typename std::conditional::value, AVX::Const, SSE::Const>::type; template struct LogImpl { template static Vc_ALWAYS_INLINE void log_series(Vector &Vc_RESTRICT x, typename Vector::AsArg exponent) { typedef Vector V; typedef Detail::Const C; const V x2 = x * x; #ifdef Vc_LOG_ILP V y2 = (C::P(6) * x2 + C::P(7) * x) + C::P(8); V y0 = (C::P(0) * x2 + C::P(1) * x) + C::P(2); V y1 = (C::P(3) * x2 + C::P(4) * x) + C::P(5); const V x3 = x2 * x; const V x6 = x3 * x3; const V x9 = x6 * x3; V y = (y0 * x9 + y1 * x6) + y2 * x3; #elif defined Vc_LOG_ILP2 const V x3 = x2 * x; const V x4 = x2 * x2; const V x5 = x2 * x3; const V x6 = x3 * x3; const V x7 = x4 * x3; const V x8 = x4 * x4; const V x9 = x5 * x4; const V x10 = x5 * x5; const V x11 = x5 * x6; V y = C::P(0) * x11 + C::P(1) * x10 + C::P(2) * x9 + C::P(3) * x8 + C::P(4) * x7 + C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3; #else V y = C::P(0); Vc::Common::unrolled_loop([&](int i) { y = y * x + C::P(i); }); y *= x * x2; #endif switch (Base) { case BaseE: y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); x += y; x += exponent * C::ln2_large(); break; case Base10: y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); x += y; x += exponent * C::ln2_large(); x *= C::log10_e(); break; case Base2: { const V x_ = x; x *= C::log2_e(); y *= C::log2_e(); y -= x_ * x * C::_1_2(); x += y; x += exponent; break; } } } template static Vc_ALWAYS_INLINE void log_series(Vector &Vc_RESTRICT x, typename Vector::AsArg exponent) { typedef Vector V; typedef Detail::Const C; const V x2 = x * x; V y = C::P(0); V y2 = C::Q(0) + x; Vc::Common::unrolled_loop([&](int i) { y = y * x + C::P(i); y2 = y2 * x + C::Q(i); }); y2 = x / y2; y = y * x + C::P(5); y = x2 * y * y2; switch (Base) { case BaseE: y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); x += y; x += exponent * C::ln2_large(); break; case Base10: y += exponent * C::ln2_small(); y -= x2 * C::_1_2(); x += y; x += exponent * C::ln2_large(); x *= C::log10_e(); break; case Base2: { const V x_ = x; x *= C::log2_e(); y *= C::log2_e(); y -= x_ * x * C::_1_2(); x += y; x += exponent; break; } } } template > static inline Vector calc(V _x) { typedef typename V::Mask M; typedef Detail::Const C; V x(_x); const M invalidMask = x < V::Zero(); const M infinityMask = x == V::Zero(); const M denormal = x <= C::min(); x(denormal) *= V(Vc::Detail::doubleConstant<1, 0, 54>()); V exponent = Detail::exponent(x.data()); exponent(denormal) -= 54; 
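// Explanatory note (added; a sketch of the idea, not original source text):
// at this point x has been decomposed as x = m * 2^e.  The statements below mask
// out the stored exponent bits and rebuild a mantissa m in [0.5, 1), fold
// mantissas below 1/sqrt(2) into the neighbouring octave (adjusting e), and then
// evaluate the polynomial series for log(1 + r), so that
//     log(x) = log(m) + e * ln(2),
// with the Base10/Base2 variants additionally scaling by log10(e) / log2(e).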
x.setZero(C::exponentMask()); x = Detail::operator|(x, C::_1_2()); const M smallX = x < C::_1_sqrt2(); x(smallX) += x; x -= V::One(); exponent(!smallX) += V::One(); log_series(x, exponent); x.setQnan(invalidMask); x(infinityMask) = C::neginf(); return x; } }; } template Vc_INTRINSIC Vc_CONST Vector> log( const Vector &x) { return Detail::LogImpl::calc(x); } template Vc_INTRINSIC Vc_CONST Vector> log10( const Vector &x) { return Detail::LogImpl::calc(x); } template Vc_INTRINSIC Vc_CONST Vector> log2( const Vector &x) { return Detail::LogImpl::calc(x); } #endif #ifdef Vc_COMMON_MATH_H_INTERNAL constexpr float log2_e = 1.44269504088896341f; constexpr float MAXLOGF = 88.722831726074219f; constexpr float MINLOGF = -88.029685974121094f; constexpr float MAXNUMF = 3.4028234663852885981170418348451692544e38f; template ::value || std::is_same::value>> inline Vector> exp(Vector x) { using V = Vector; typedef typename V::Mask M; typedef Detail::Const C; const M overflow = x > MAXLOGF; const M underflow = x < MINLOGF; V z = floor(C::log2_e() * x + 0.5f); const auto n = static_cast>(z); x -= z * C::ln2_large(); x -= z * C::ln2_small(); z = ((((( 1.9875691500E-4f * x + 1.3981999507E-3f) * x + 8.3334519073E-3f) * x + 4.1665795894E-2f) * x + 1.6666665459E-1f) * x + 5.0000001201E-1f) * (x * x) + x + 1.0f; x = ldexp(z, n); x(overflow) = std::numeric_limits::infinity(); x.setZero(underflow); return x; } #endif #ifdef Vc_IMPL_AVX inline AVX::double_v exp(AVX::double_v _x) { AVX::Vector x = _x; typedef AVX::Vector V; typedef V::Mask M; typedef AVX::Const C; const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); V px = floor(C::log2_e() * x + 0.5); __m128i tmp = _mm256_cvttpd_epi32(px.data()); const SimdArray n = SSE::int_v{tmp}; x -= px * C::ln2_large(); x -= px * C::ln2_small(); const double P[] = { Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(), Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(), Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>() }; const double Q[] = { Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(), Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(), Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(), Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>() }; const V x2 = x * x; px = x * ((P[0] * x2 + P[1]) * x2 + P[2]); x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px); x = V::One() + 2.0 * x; x = ldexp(x, n); x(overflow) = std::numeric_limits::infinity(); x.setZero(underflow); return x; } #endif inline SSE::double_v exp(SSE::double_v::AsArg _x) { SSE::Vector x = _x; typedef SSE::Vector V; typedef V::Mask M; typedef SSE::Const C; const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); V px = floor(C::log2_e() * x + 0.5); SimdArray n; _mm_storel_epi64(reinterpret_cast<__m128i *>(&n), _mm_cvttpd_epi32(px.data())); x -= px * C::ln2_large(); x -= px * C::ln2_small(); const double P[] = { Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(), Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(), Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>() }; const double Q[] = { Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(), Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(), Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(), Vc::Detail::doubleConstant<1, 
0x0000000000000000ull, 1>() }; const V x2 = x * x; px = x * ((P[0] * x2 + P[1]) * x2 + P[2]); x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px); x = V::One() + 2.0 * x; x = ldexp(x, n); x(overflow) = std::numeric_limits::infinity(); x.setZero(underflow); return x; } #endif } #undef Vc_COMMON_MATH_H_INTERNAL #endif #ifdef isfinite #undef isfinite #endif #ifdef isnan #undef isnan #endif #ifndef VC_COMMON_VECTORTUPLE_H_ #define VC_COMMON_VECTORTUPLE_H_ namespace Vc_VERSIONED_NAMESPACE { namespace Common { template struct InterleavedMemoryReadAccess; template class VectorReferenceArray { typedef typename V::EntryType T; typedef V &Vc_RESTRICT Reference; std::array r; typedef make_index_sequence IndexSequence; template constexpr VectorReferenceArray appendOneReference( VV &a, index_sequence) const { return {*r[Indexes]..., a}; } template Vc_INTRINSIC void callDeinterleave(const A &access, index_sequence) const { access.deinterleave(*r[Indexes]...); } public: template > constexpr VectorReferenceArray(Us &&... args) : r{{std::addressof(std::forward(args))...}} { } template ::value && std::is_same::value>> Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr VectorReferenceArray< Length + 1, V> operator,(VV &a) const && { return appendOneReference(a, IndexSequence()); } Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr VectorReferenceArray< Length + 1, const V> operator,(const V &a) const && { return appendOneReference(a, IndexSequence()); } template Vc_ALWAYS_INLINE enable_if<(Length <= StructSize), void> operator=( const InterleavedMemoryReadAccess &access) && { callDeinterleave(access, IndexSequence()); } template enable_if<(Length > StructSize), void> operator=( const InterleavedMemoryReadAccess &access) && = delete; template void operator=(TransposeProxy &&proxy) && { transpose_impl(TransposeTag(), &r[0], proxy); } template void operator=(SubscriptOperation &&sub) && { const auto &args = std::move(sub).gatherArguments(); Common::InterleavedMemoryReadAccess<1, V, Traits::decay> deinterleaver(args.address, args.indexes); callDeinterleave(deinterleaver, IndexSequence()); } Vc_ALWAYS_INLINE Reference operator[](std::size_t i) { return *r[i]; } }; } template Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr Common::VectorReferenceArray<2, Vc::Vector> operator,(Vc::Vector &a, Vc::Vector &b) { return {a, b}; } template Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr Common::VectorReferenceArray<2, const Vc::Vector> operator,(const Vc::Vector &a, const Vc::Vector &b) { return {a, b}; } template constexpr Common::VectorReferenceArray::type> tie(V &&a, Vs &&... 
b)
{
    return {std::forward<V>(a), std::forward<Vs>(b)...};
}
}
#endif
#ifndef VC_COMMON_IIF_H_
#define VC_COMMON_IIF_H_
#ifndef VC_TYPE_TRAITS_
#define VC_TYPE_TRAITS_
#include
namespace Vc_VERSIONED_NAMESPACE
{
using Traits::is_simd_mask;
using Traits::is_simd_vector;
using Traits::is_integral;
using Traits::is_floating_point;
using Traits::is_arithmetic;
using Traits::is_signed;
using Traits::is_unsigned;
template struct memory_alignment : public std::integral_constant {};
template<> struct memory_alignment : public std::integral_constant {};
template<> struct memory_alignment : public std::integral_constant {};
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
// iif = "inline if": per-lane selection between two SIMD values based on a mask.
template <typename Mask, typename T>
Vc_ALWAYS_INLINE enable_if<is_simd_mask<Mask>::value && is_simd_vector<T>::value, T>
iif(const Mask &condition, const T &trueValue, const T &falseValue)
{
    T result(falseValue);
    Vc::where(condition) | result = trueValue;
    return result;
}
template <typename Mask, typename T>
enable_if<is_simd_mask<Mask>::value && !is_simd_vector<T>::value, T> iif(
    const Mask &, const T &, const T &) = delete;
template <typename T>
constexpr T iif(bool condition, const T &trueValue, const T &falseValue)
{
    return condition ? trueValue : falseValue;
}
}
#endif
#ifndef Vc_NO_STD_FUNCTIONS
namespace std
{
using Vc::min;
using Vc::max;
using Vc::abs;
using Vc::asin;
using Vc::atan;
using Vc::atan2;
using Vc::ceil;
using Vc::cos;
using Vc::exp;
using Vc::fma;
using Vc::trunc;
using Vc::floor;
using Vc::frexp;
using Vc::ldexp;
using Vc::log;
using Vc::log10;
using Vc::log2;
using Vc::round;
using Vc::sin;
using Vc::sqrt;
using Vc::isfinite;
using Vc::isnan;
}
#endif
Vc_RESET_DIAGNOSTICS
#endif
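/* Usage sketch (added for illustration; not part of the original header).  It
 * assumes the public Vc 1.x API declared above (iif, log, sincos, operator[])
 * and the umbrella header <Vc/Vc>:
 *
 *   #include <Vc/Vc>
 *   #include <cstdio>
 *
 *   int main()
 *   {
 *       Vc::float_v x = Vc::float_v::IndexesFromZero() + 1.f;  // 1, 2, 3, ...
 *       // per-lane select: log(x) where x > 2, otherwise -x
 *       Vc::float_v y = Vc::iif(x > 2.f, Vc::log(x), -x);
 *       Vc::float_v s, c;
 *       Vc::sincos(x, &s, &c);  // sine and cosine in one call
 *       std::printf("%g %g %g\n", y[0], s[0], c[0]);
 *   }
 */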