// vc/godbolt/Vc

#ifndef VC_VECTOR_H_
#define VC_VECTOR_H_
#ifndef VC_SCALAR_VECTOR_H_
#define VC_SCALAR_VECTOR_H_
#include <assert.h>
#include <algorithm>
#include <cmath>
#ifdef _MSC_VER
#include <float.h>
#endif
#ifndef VC_COMMON_TYPES_H_
#define VC_COMMON_TYPES_H_
#ifdef Vc_CHECK_ALIGNMENT
#include <cstdlib>
#include <cstdio>
#endif
#include <ratio>
#ifndef VC_GLOBAL_H_
#define VC_GLOBAL_H_
#include <cstdint>
#ifndef VC_FWDDECL_H_
#define VC_FWDDECL_H_
#include <cstddef>
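// fwddecl.h: the versioned namespace (Vc_1, aliased to Vc below), the ABI tag types,
// forward declarations of Vector, Mask, SimdArray and SimdMaskArray, and the
// simd_abi/simd/simd_mask aliases mirroring std::experimental::simd naming.
// E.g. fixed_size_simd<float, 8> is Vector<float, simd_abi::fixed_size<8>>.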
#define Vc_VERSIONED_NAMESPACE Vc_1
namespace Vc_VERSIONED_NAMESPACE
{
namespace VectorAbi
{
struct Scalar {};
struct Sse {};
struct Avx {};
struct Mic {};
template <class T> struct DeduceCompatible;
template <class T> struct DeduceBest;
}
namespace Common
{
template <class T, std::size_t N> struct select_best_vector_type;
}
template <class T, class Abi> class Mask;
template <class T, class Abi> class Vector;
template <class T, std::size_t N,
class V = typename Common::select_best_vector_type<T, N>::type,
std::size_t Wt = V::Size>
class SimdArray;
template <class T, std::size_t N,
class V = typename Common::select_best_vector_type<T, N>::type,
std::size_t Wt = V::Size>
class SimdMaskArray;
namespace simd_abi
{
using scalar = VectorAbi::Scalar;
template <int N> struct fixed_size;
template <class T> using compatible = typename VectorAbi::DeduceCompatible<T>::type;
template <class T> using native = typename VectorAbi::DeduceBest<T>::type;
using __sse = VectorAbi::Sse;
using __avx = VectorAbi::Avx;
struct __avx512;
struct __neon;
}
template <class T, class Abi = simd_abi::compatible<T>> using simd = Vector<T, Abi>;
template <class T, class Abi = simd_abi::compatible<T>> using simd_mask = Mask<T, Abi>;
template <class T> using native_simd = simd<T, simd_abi::native<T>>;
template <class T> using native_simd_mask = simd_mask<T, simd_abi::native<T>>;
template <class T, int N> using fixed_size_simd = simd<T, simd_abi::fixed_size<N>>;
template <class T, int N>
using fixed_size_simd_mask = simd_mask<T, simd_abi::fixed_size<N>>;
}
#ifndef DOXYGEN
namespace Vc = Vc_VERSIONED_NAMESPACE;
#endif
#endif
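// Compiler identification: exactly one of Vc_ICC, Vc_APPLECLANG, Vc_CLANG, Vc_GCC
// or Vc_MSVC is defined to an encoded version number (the DOXYGEN branch merely
// documents the macros). This is followed by C++ standard checks (Vc_CXX14/17),
// max_align_t availability, and the Vc_CDECL/Vc_VDECL calling-convention macros.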
#ifdef DOXYGEN
#define Vc_ICC __INTEL_COMPILER_BUILD_DATE
#undef Vc_ICC
#define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#undef Vc_CLANG
#define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#undef Vc_APPLECLANG
#define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
#define Vc_MSVC _MSC_FULL_VER
#undef Vc_MSVC
#else
#ifdef __INTEL_COMPILER
#define Vc_ICC __INTEL_COMPILER_BUILD_DATE
#elif defined(__clang__) && defined(__apple_build_version__)
#define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#elif defined(__clang__)
#define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#elif defined(__GNUC__)
#define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
#elif defined(_MSC_VER)
#define Vc_MSVC _MSC_FULL_VER
#else
#define Vc_UNSUPPORTED_COMPILER 1
#endif
#if defined Vc_GCC && Vc_GCC >= 0x60000
#define Vc_RESET_DIAGNOSTICS _Pragma("GCC diagnostic pop")
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
#else
#define Vc_RESET_DIAGNOSTICS
#endif
#if defined Vc_ICC
#pragma warning disable 2922
#endif
#if __cplusplus < 201103 && (!defined Vc_MSVC || _MSC_VER < 1900)
# error "Vc requires support for C++11."
#elif __cplusplus >= 201402L
#define Vc_CXX14 1
# if __cplusplus > 201700L
#define Vc_CXX17 1
# endif
#endif
#if defined(__GNUC__) && !defined(Vc_NO_INLINE_ASM)
#define Vc_GNU_ASM 1
#endif
#ifdef Vc_GCC
# if Vc_GCC >= 0x70000 && defined __i386__
# ifdef __GLIBC_PREREQ
# if __GLIBC_PREREQ(2,26)
#define Vc_HAVE_STD_MAX_ALIGN_T 1
# endif
# endif
# elif Vc_GCC >= 0x40900
#define Vc_HAVE_STD_MAX_ALIGN_T 1
# else
#define Vc_HAVE_MAX_ALIGN_T 1
# endif
#elif !defined(Vc_CLANG) && !defined(Vc_ICC)
#define Vc_HAVE_STD_MAX_ALIGN_T 1
#endif
#if defined(Vc_GCC) || defined(Vc_CLANG) || defined Vc_APPLECLANG
#define Vc_USE_BUILTIN_VECTOR_TYPES 1
#endif
#ifdef Vc_MSVC
#define Vc_CDECL __cdecl
#define Vc_VDECL __vectorcall
#else
#define Vc_CDECL
#define Vc_VDECL
#endif
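// Temporary constants used to decode a user-supplied Vc_IMPL (e.g. -DVc_IMPL=SSE4_1):
// the high bits (IMPL_MASK) select the base instruction set, the low bits (EXT_MASK)
// add optional features. They are #undef'ed again once the Vc_IMPL_* macros are set.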
#define Scalar 0x00100000
#define SSE 0x00200000
#define SSE2 0x00300000
#define SSE3 0x00400000
#define SSSE3 0x00500000
#define SSE4_1 0x00600000
#define SSE4_2 0x00700000
#define AVX 0x00800000
#define AVX2 0x00900000
#define XOP 0x00000001
#define FMA4 0x00000002
#define F16C 0x00000004
#define POPCNT 0x00000008
#define SSE4a 0x00000010
#define FMA 0x00000020
#define BMI2 0x00000040
#define IMPL_MASK 0xFFF00000
#define EXT_MASK 0x000FFFFF
#ifdef Vc_MSVC
# ifdef _M_IX86_FP
# if _M_IX86_FP >= 1
# ifndef __SSE__
#define __SSE__ 1
# endif
# endif
# if _M_IX86_FP >= 2
# ifndef __SSE2__
#define __SSE2__ 1
# endif
# endif
# elif defined(_M_AMD64)
# ifndef __SSE__
#define __SSE__ 1
# endif
# ifndef __SSE2__
#define __SSE2__ 1
# endif
# endif
#endif
#if defined Vc_ICC && !defined __POPCNT__
# if defined __SSE4_2__ || defined __SSE4A__
#define __POPCNT__ 1
# endif
#endif
#ifdef VC_IMPL
#error "You are using the old VC_IMPL macro. Since Vc 1.0 all Vc macros start with Vc_, i.e. a lower-case 'c'"
#endif
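// Instruction-set selection: if Vc_IMPL is not user-defined, the Vc_IMPL_* macros are
// derived from the compiler's feature macros (__AVX2__, __SSE4_2__, ..., falling back
// to Scalar); otherwise the requested Vc_IMPL value is decoded, including the optional
// XOP, FMA4, F16C, POPCNT, SSE4a, FMA and BMI2 extensions.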
#ifndef Vc_IMPL
# if defined(__AVX2__)
#define Vc_IMPL_AVX2 1
#define Vc_IMPL_AVX 1
# elif defined(__AVX__)
#define Vc_IMPL_AVX 1
# else
# if defined(__SSE4_2__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE4_2 1
# endif
# if defined(__SSE4_1__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE4_1 1
# endif
# if defined(__SSE3__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE3 1
# endif
# if defined(__SSSE3__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSSE3 1
# endif
# if defined(__SSE2__)
#define Vc_IMPL_SSE 1
#define Vc_IMPL_SSE2 1
# endif
# if defined(Vc_IMPL_SSE)
# else
#define Vc_IMPL_Scalar 1
# endif
# endif
# if !defined(Vc_IMPL_Scalar)
# ifdef __FMA4__
#define Vc_IMPL_FMA4 1
# endif
# ifdef __XOP__
#define Vc_IMPL_XOP 1
# endif
# ifdef __F16C__
#define Vc_IMPL_F16C 1
# endif
# ifdef __POPCNT__
#define Vc_IMPL_POPCNT 1
# endif
# ifdef __SSE4A__
#define Vc_IMPL_SSE4a 1
# endif
# ifdef __FMA__
#define Vc_IMPL_FMA 1
# endif
# ifdef __BMI2__
#define Vc_IMPL_BMI2 1
# endif
# endif
#else
# if (Vc_IMPL & IMPL_MASK) == AVX2
#define Vc_IMPL_AVX2 1
#define Vc_IMPL_AVX 1
# elif (Vc_IMPL & IMPL_MASK) == AVX
#define Vc_IMPL_AVX 1
# elif (Vc_IMPL & IMPL_MASK) == Scalar
#define Vc_IMPL_Scalar 1
# elif (Vc_IMPL & IMPL_MASK) == SSE4_2
#define Vc_IMPL_SSE4_2 1
#define Vc_IMPL_SSE4_1 1
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE4_1
#define Vc_IMPL_SSE4_1 1
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSSE3
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE3
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE2
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE
#define Vc_IMPL_SSE 1
# if defined(__SSE4_2__)
#define Vc_IMPL_SSE4_2 1
# endif
# if defined(__SSE4_1__)
#define Vc_IMPL_SSE4_1 1
# endif
# if defined(__SSE3__)
#define Vc_IMPL_SSE3 1
# endif
# if defined(__SSSE3__)
#define Vc_IMPL_SSSE3 1
# endif
# if defined(__SSE2__)
#define Vc_IMPL_SSE2 1
# endif
# elif (Vc_IMPL & IMPL_MASK) == 0 && (Vc_IMPL & SSE4a)
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
# endif
# if (Vc_IMPL & XOP)
#define Vc_IMPL_XOP 1
# endif
# if (Vc_IMPL & FMA4)
#define Vc_IMPL_FMA4 1
# endif
# if (Vc_IMPL & F16C)
#define Vc_IMPL_F16C 1
# endif
# if (!defined(Vc_IMPL_Scalar) && defined(__POPCNT__)) || (Vc_IMPL & POPCNT)
#define Vc_IMPL_POPCNT 1
# endif
# if (Vc_IMPL & SSE4a)
#define Vc_IMPL_SSE4a 1
# endif
# if (Vc_IMPL & FMA)
#define Vc_IMPL_FMA 1
# endif
# if (Vc_IMPL & BMI2)
#define Vc_IMPL_BMI2 1
# endif
#undef Vc_IMPL
#endif
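// Post-selection fixups: __AVX__ enables VEX encoding, Vc_IMPL_AVX implies all SSE
// levels, clang 3.6.x drops AVX again because of a known miscompilation, invalid
// combinations are rejected with #error, the temporary constants are #undef'ed, and a
// single Vc_DEFAULT_IMPL_* macro records the chosen default implementation.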
#ifdef __AVX__
#define Vc_USE_VEX_CODING 1
#endif
#ifdef Vc_IMPL_AVX
#define Vc_IMPL_SSE4_2 1
#define Vc_IMPL_SSE4_1 1
#define Vc_IMPL_SSSE3 1
#define Vc_IMPL_SSE3 1
#define Vc_IMPL_SSE2 1
#define Vc_IMPL_SSE 1
#endif
#if defined(Vc_CLANG) && Vc_CLANG >= 0x30600 && Vc_CLANG < 0x30700
# if defined(Vc_IMPL_AVX)
# warning "clang 3.6.x miscompiles AVX code, frequently losing 50% of the data. Vc will fall back to SSE4 instead."
#undef Vc_IMPL_AVX
# if defined(Vc_IMPL_AVX2)
#undef Vc_IMPL_AVX2
# endif
# endif
#endif
# if !defined(Vc_IMPL_Scalar) && !defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_AVX)
# error "No suitable Vc implementation was selected! Probably Vc_IMPL was set to an invalid value."
# elif defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_SSE2)
# error "SSE requested but no SSE2 support. Vc needs at least SSE2!"
# endif
#undef Scalar
#undef SSE
#undef SSE2
#undef SSE3
#undef SSSE3
#undef SSE4_1
#undef SSE4_2
#undef AVX
#undef AVX2
#undef XOP
#undef FMA4
#undef F16C
#undef POPCNT
#undef SSE4a
#undef FMA
#undef BMI2
#undef IMPL_MASK
#undef EXT_MASK
#if defined Vc_IMPL_AVX2
#define Vc_DEFAULT_IMPL_AVX2
#elif defined Vc_IMPL_AVX
#define Vc_DEFAULT_IMPL_AVX
#elif defined Vc_IMPL_SSE
#define Vc_DEFAULT_IMPL_SSE
#elif defined Vc_IMPL_Scalar
#define Vc_DEFAULT_IMPL_Scalar
#else
#error "Preprocessor logic broken. Please report a bug."
#endif
#endif
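// Fixed-width typedefs inside the Vc namespace, the MallocAlignment constants
// (vector/cacheline/page alignment), and the Implementation/ExtraInstructions enums.
// CurrentImplementation encodes, as a constexpr value, the instruction set this
// translation unit was compiled for; runs_on() checks it against a given feature set.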
namespace Vc_VERSIONED_NAMESPACE
{
typedef signed char int8_t;
typedef unsigned char uint8_t;
typedef signed short int16_t;
typedef unsigned short uint16_t;
typedef signed int int32_t;
typedef unsigned int uint32_t;
typedef signed long long int64_t;
typedef unsigned long long uint64_t;
enum MallocAlignment {
AlignOnVector,
AlignOnCacheline,
AlignOnPage
};
enum Implementation : std::uint_least32_t {
ScalarImpl,
SSE2Impl,
SSE3Impl,
SSSE3Impl,
SSE41Impl,
SSE42Impl,
AVXImpl,
AVX2Impl,
MICImpl,
ImplementationMask = 0xfff
};
enum ExtraInstructions : std::uint_least32_t {
Float16cInstructions = 0x01000,
Fma4Instructions = 0x02000,
XopInstructions = 0x04000,
PopcntInstructions = 0x08000,
Sse4aInstructions = 0x10000,
FmaInstructions = 0x20000,
VexInstructions = 0x40000,
Bmi2Instructions = 0x80000,
ExtraInstructionsMask = 0xfffff000u
};
template <unsigned int Features> struct ImplementationT {
static constexpr Implementation current()
{
return static_cast<Implementation>(Features & ImplementationMask);
}
static constexpr bool is(Implementation impl)
{
return static_cast<unsigned int>(impl) == current();
}
static constexpr bool is_between(Implementation low, Implementation high)
{
return static_cast<unsigned int>(low) <= current() &&
static_cast<unsigned int>(high) >= current();
}
static constexpr bool runs_on(unsigned int extraInstructions)
{
return (extraInstructions & Features & ExtraInstructionsMask) ==
(Features & ExtraInstructionsMask);
}
};
using CurrentImplementation = ImplementationT<
#ifdef Vc_IMPL_Scalar
ScalarImpl
#elif defined(Vc_IMPL_AVX2)
AVX2Impl
#elif defined(Vc_IMPL_AVX)
AVXImpl
#elif defined(Vc_IMPL_SSE4_2)
SSE42Impl
#elif defined(Vc_IMPL_SSE4_1)
SSE41Impl
#elif defined(Vc_IMPL_SSSE3)
SSSE3Impl
#elif defined(Vc_IMPL_SSE3)
SSE3Impl
#elif defined(Vc_IMPL_SSE2)
SSE2Impl
#endif
#ifdef Vc_IMPL_SSE4a
+ Vc::Sse4aInstructions
#ifdef Vc_IMPL_XOP
+ Vc::XopInstructions
#ifdef Vc_IMPL_FMA4
+ Vc::Fma4Instructions
#endif
#endif
#endif
#ifdef Vc_IMPL_POPCNT
+ Vc::PopcntInstructions
#endif
#ifdef Vc_IMPL_FMA
+ Vc::FmaInstructions
#endif
#ifdef Vc_IMPL_BMI2
+ Vc::Bmi2Instructions
#endif
#ifdef Vc_USE_VEX_CODING
+ Vc::VexInstructions
#endif
>;
}
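// version.h: version string/number macros, the library ABI version, and the
// Vc_IS_VERSION_1/Vc_IS_VERSION_2 switches checked elsewhere in these headers.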
#ifndef VC_VERSION_H_
#define VC_VERSION_H_
#define Vc_VERSION_STRING "1.4.2-dev"
#define Vc_VERSION_NUMBER 0x010405
#define Vc_VERSION_CHECK(major,minor,patch) ((major << 16) | (minor << 8) | (patch << 1))
#define Vc_LIBRARY_ABI_VERSION 5
#define Vc_IS_VERSION_2 (Vc_VERSION_NUMBER >= Vc_VERSION_CHECK(1, 70, 0))
#define Vc_IS_VERSION_1 (Vc_VERSION_NUMBER < Vc_VERSION_CHECK(1, 70, 0))
namespace Vc_VERSIONED_NAMESPACE
{
inline const char *versionString() { return Vc_VERSION_STRING; }
constexpr unsigned int versionNumber() { return Vc_VERSION_NUMBER; }
}
#endif
#endif
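// traits/type_traits.h and its helpers: SFINAE-based introspection traits (decay,
// has_no_allocated_data, has_contiguous_storage, is_functor_argument_immutable,
// is_output_iterator, is_simd_vector/is_simd_mask, entry_type_of, ...) used to
// constrain templates throughout the library.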
#ifndef VC_TRAITS_TYPE_TRAITS_H_
#define VC_TRAITS_TYPE_TRAITS_H_
#include <type_traits>
#ifndef VC_TRAITS_DECAY_H_
#define VC_TRAITS_DECAY_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
template <typename T> using decay = typename std::decay<T>::type;
}
}
#endif
#ifndef VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_
#define VC_TRAITS_HAS_NO_ALLOCATED_DATA_H_
#include <array>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
template<typename T> struct has_no_allocated_data_impl : public std::false_type {};
template <typename T>
struct has_no_allocated_data
: public has_no_allocated_data_impl<
typename std::remove_cv<typename std::remove_reference<T>::type>::type>
{
};
template<typename T, std::size_t N> struct has_no_allocated_data_impl<std::array<T, N>> : public std::true_type {};
template<typename T, std::size_t N> struct has_no_allocated_data_impl<T[N]> : public std::true_type {};
template<typename T> struct has_no_allocated_data_impl<T[]> : public std::true_type {};
}
}
#endif
#ifndef VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_
#define VC_TRAITS_HAS_CONTIGUOUS_STORAGE_H_
#include <initializer_list>
#include <memory>
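// Forward-declare std::array and std::vector (inside libc++'s versioned namespace
// when available, and as 'class array' on Windows to match MSVC's declaration) so
// the specializations below do not require the full standard headers.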
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std
{
#endif
#ifdef _WIN32
template <typename T, size_t N> class array;
#else
template <typename T, size_t N> struct array;
#endif
template <typename T, typename Allocator> class vector;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace has_contiguous_storage_detail
{
template <typename T, typename It = typename T::iterator>
std::is_base_of<std::random_access_iterator_tag,
typename std::iterator_traits<It>::iterator_category>
test(int);
template <typename T>
std::is_base_of<std::random_access_iterator_tag,
typename std::iterator_traits<T>::iterator_category>
test(long);
template <typename T> std::false_type test(...);
}
template <typename T>
struct has_contiguous_storage_impl
: public decltype(has_contiguous_storage_detail::test<T>(int())) {
};
template <typename T>
struct has_contiguous_storage
: public has_contiguous_storage_impl<
typename std::remove_cv<typename std::remove_reference<T>::type>::type>
{
};
template <typename T> struct has_contiguous_storage_impl<const T *> : public std::true_type {};
template <typename T> struct has_contiguous_storage_impl<T *> : public std::true_type {};
template <typename T> struct has_contiguous_storage_impl<std::unique_ptr<T[]>> : public std::true_type {};
template <typename T> struct has_contiguous_storage_impl<std::initializer_list<T>> : public std::true_type {};
template <typename T, std::size_t N> struct has_contiguous_storage_impl<T[N]> : public std::true_type {};
template <typename T, std::size_t N> struct has_contiguous_storage_impl<std::array<T, N>> : public std::true_type {};
template <typename T, typename A> struct has_contiguous_storage_impl<std::vector<T, A>> : public std::true_type {};
}
}
#endif
#ifndef VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_
#define VC_TRAITS_IS_FUNCTOR_ARGUMENT_IMMUTABLE_H_
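// is_functor_argument_immutable: by inspecting the type of F::operator() (or the
// templated operator() of a generic lambda), detect whether an argument of type A is
// taken in a way that cannot modify the caller's value (by value or by const
// reference) rather than by mutable reference.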
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace is_functor_argument_immutable_impl
{
template <typename F, typename A> std::true_type test(void (F::*)(A));
template <typename F, typename A> std::true_type test(void (F::*)(A) const);
template <typename F, typename A> std::is_const<A> test(void (F::*)(A &));
template <typename F, typename A> std::is_const<A> test(void (F::*)(A &) const);
template <typename F, typename A> std::is_const<A> test(void (F::*)(A &&));
template <typename F, typename A> std::is_const<A> test(void (F::*)(A &&) const);
struct dummy {};
template <
typename F, typename A,
#ifdef Vc_MSVC
#define Vc_TEMPLATE_
#else
#define Vc_TEMPLATE_ template
#endif
typename MemberPtr = decltype(&F::Vc_TEMPLATE_ operator()<A>)>
decltype(is_functor_argument_immutable_impl::test(std::declval<MemberPtr>())) test2(int);
#undef Vc_TEMPLATE_
template <typename F, typename A>
decltype(
is_functor_argument_immutable_impl::test(std::declval<decltype(&F::operator())>()))
test2(float);
template <typename A> std::true_type test3(void(*)(A));
template <typename A> std::is_const<A> test3(void(*)(A &));
template <typename A> std::is_const<A> test3(void(*)(A &&));
}
template <typename F, typename A, bool = std::is_function<F>::value>
struct is_functor_argument_immutable;
template <typename F, typename A>
struct is_functor_argument_immutable<F, A, false>
: decltype(is_functor_argument_immutable_impl::test2<
typename std::remove_reference<F>::type, A>(int())) {
};
template <typename F, typename A>
struct is_functor_argument_immutable<F, A, true>
: decltype(is_functor_argument_immutable_impl::test3(std::declval<F>())) {
};
}
}
#endif
#ifndef VC_TRAITS_IS_OUTPUT_ITERATOR_H_
#define VC_TRAITS_IS_OUTPUT_ITERATOR_H_
#include <iterator>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace is_output_iterator_impl
{
template <typename T, typename ValueType = typename std::iterator_traits<T>::value_type,
typename = decltype(*std::declval<T &>() = std::declval<
ValueType>())
>
std::true_type test(int);
template <typename T> std::false_type test(...);
}
template <typename T>
struct is_output_iterator
: public std::conditional<
std::is_void<typename std::iterator_traits<T>::value_type>::value,
std::true_type, decltype(is_output_iterator_impl::test<T>(int()))>::type
{
};
static_assert(!std::is_void<std::iterator_traits<int *>::value_type>::value, "");
static_assert(is_output_iterator<int *>::value, "");
static_assert(!is_output_iterator<const int *>::value, "");
}
}
#endif
#ifndef VC_IS_INDEX_SEQUENCE_H_
#define VC_IS_INDEX_SEQUENCE_H_
#ifndef VC_COMMON_INDEXSEQUENCE_H_
#define VC_COMMON_INDEXSEQUENCE_H_
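// C++11 implementation of index_sequence/make_index_sequence; the sequence is built
// by recursively joining two halves, keeping template instantiation depth logarithmic
// in N.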
namespace Vc_VERSIONED_NAMESPACE
{
template <std::size_t... I> struct index_sequence
{
static constexpr std::size_t size() noexcept { return sizeof...(I); }
};
template <std::size_t N> struct make_index_sequence_impl {
template <std::size_t Offset, std::size_t... Ns>
static index_sequence<Ns..., (Ns + Offset)...> join(std::false_type,
index_sequence<Ns...>);
template <std::size_t Offset, std::size_t... Ns>
static index_sequence<Ns..., Offset - 1, (Ns + Offset)...> join(
std::true_type, index_sequence<Ns...>);
using is_odd = std::integral_constant<bool, N & 1>;
using half = typename make_index_sequence_impl<N / 2>::type;
using type = decltype(join<(N + 1) / 2>(is_odd(), half()));
};
template <> struct make_index_sequence_impl<0> {
using type = index_sequence<>;
};
template <> struct make_index_sequence_impl<1> {
using type = index_sequence<0>;
};
template <> struct make_index_sequence_impl<2> {
using type = index_sequence<0, 1>;
};
template <std::size_t N>
using make_index_sequence = typename make_index_sequence_impl<N>::type;
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
template <typename T> struct is_index_sequence : public std::false_type {};
template <std::size_t... I>
struct is_index_sequence<Vc::index_sequence<I...>> : public std::true_type {};
static_assert(!is_index_sequence<int>::value, "");
static_assert(is_index_sequence<make_index_sequence<2>>::value, "");
}
}
#endif
#ifndef VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_
#define VC_TRAITS_IS_IMPLICIT_CAST_ALLOWED_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
template <typename From, typename To, bool = std::is_integral<From>::value>
struct is_implicit_cast_allowed
: public std::integral_constant<
bool, std::is_same<From, To>::value ||
(std::is_integral<To>::value &&
(std::is_same<typename std::make_unsigned<From>::type, To>::value ||
std::is_same<typename std::make_signed<From>::type, To>::value))> {
};
template <typename From, typename To>
struct is_implicit_cast_allowed<From, To, false> : public std::is_same<From, To>::type {
};
template <typename From, typename To>
struct is_implicit_cast_allowed_mask : public is_implicit_cast_allowed<From, To> {
};
}
}
#endif
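// Library-wide helpers: enable_if with a constructible default tag type (nullarg), so
// SFINAE-defaulted parameters can also be passed explicitly, plus conditional_t and
// remove_cvref_t aliases and the operator-detection traits that follow.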
namespace Vc_VERSIONED_NAMESPACE
{
struct enable_if_default_type
{
constexpr enable_if_default_type() {}
};
static constexpr enable_if_default_type nullarg;
template <bool Test, typename T = enable_if_default_type> using enable_if = typename std::enable_if<Test, T>::type;
template <bool B, class T, class F>
using conditional_t = typename std::conditional<B, T, F>::type;
template <class T>
using remove_cvref_t =
typename std::remove_cv<typename std::remove_reference<T>::type>::type;
namespace Traits
{
#ifndef VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_
#define VC_TRAITS_HAS_SUBSCRIPT_OPERATOR_H_
namespace has_subscript_operator_impl
{
template <typename T, typename I, typename = decltype(std::declval<T &>()[std::declval<I>()])> std::true_type test(int);
template <typename T, typename I> std::false_type test(float);
}
template <typename T, typename I = std::size_t>
struct has_subscript_operator : public decltype(has_subscript_operator_impl::test<T, I>(1))
{
};
#endif
#ifndef VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_
#define VC_TRAITS_HAS_MULTIPLY_OPERATOR_H_
namespace has_multiply_operator_impl
{
template <typename T, typename U, typename = decltype(std::declval<T>() * std::declval<U>())> std::true_type test(int);
template <typename T, typename U> std::false_type test(...);
}
template <typename T, typename U = T>
struct has_multiply_operator : public decltype(has_multiply_operator_impl::test<T, U>(1))
{
};
#endif
#ifndef VC_TRAITS_HAS_ADDITION_OPERATOR_H_
#define VC_TRAITS_HAS_ADDITION_OPERATOR_H_
namespace has_addition_operator_impl
{
template <typename T, typename U, typename = decltype(std::declval<T>() + std::declval<U>())> std::true_type test(int);
template <typename T, typename U> std::false_type test(...);
}
template <typename T, typename U = T>
struct has_addition_operator : public decltype(has_addition_operator_impl::test<T, U>(1))
{
};
#endif
#ifndef VC_TRAITS_HAS_EQUALITY_OPERATOR_H_
#define VC_TRAITS_HAS_EQUALITY_OPERATOR_H_
namespace has_equality_operator_impl
{
template <typename T, typename U,
typename = enable_if<!std::is_same<void, decltype(std::declval<T>() == std::declval<U>())>::value>>
std::true_type test(int);
template <typename T, typename U> std::false_type test(...);
}
template <typename T, typename U = T>
struct has_equality_operator : public decltype(has_equality_operator_impl::test<T, U>(1))
{
};
#endif
template<typename T> struct is_valid_vector_argument : public std::false_type {};
template <> struct is_valid_vector_argument<double> : public std::true_type {};
template <> struct is_valid_vector_argument<float> : public std::true_type {};
template <> struct is_valid_vector_argument<int> : public std::true_type {};
template <> struct is_valid_vector_argument<unsigned int> : public std::true_type {};
template <> struct is_valid_vector_argument<short> : public std::true_type {};
template <> struct is_valid_vector_argument<unsigned short> : public std::true_type {};
template<typename T> struct is_simd_mask_internal : public std::false_type {};
template<typename T> struct is_simd_vector_internal : public std::false_type {};
template<typename T> struct is_simdarray_internal : public std::false_type {};
template<typename T> struct is_simd_mask_array_internal : public std::false_type {};
template<typename T> struct is_loadstoreflag_internal : public std::false_type {};
template <typename T, bool = is_simd_vector_internal<T>::value> struct is_integral_internal;
template <typename T, bool = is_simd_vector_internal<T>::value> struct is_floating_point_internal;
template <typename T, bool = is_simd_vector_internal<T>::value> struct is_signed_internal;
template <typename T, bool = is_simd_vector_internal<T>::value> struct is_unsigned_internal;
template <typename T> struct is_integral_internal <T, false> : public std::is_integral <T> {};
template <typename T> struct is_floating_point_internal<T, false> : public std::is_floating_point<T> {};
template <typename T> struct is_signed_internal <T, false> : public std::is_signed <T> {};
template <typename T> struct is_unsigned_internal <T, false> : public std::is_unsigned <T> {};
template <typename V> struct is_integral_internal <V, true> : public std::is_integral <typename V::EntryType> {};
template <typename V> struct is_floating_point_internal<V, true> : public std::is_floating_point<typename V::EntryType> {};
template <typename V> struct is_signed_internal <V, true> : public std::is_signed <typename V::EntryType> {};
template <typename V> struct is_unsigned_internal <V, true> : public std::is_unsigned <typename V::EntryType> {};
template <typename T>
struct is_arithmetic_internal
: public std::integral_constant<
bool,
(is_floating_point_internal<T>::value || is_integral_internal<T>::value)>
{
};
template <class T, class = void>
struct vector_size_internal : std::integral_constant<std::size_t, 0> {
};
template <class T>
struct vector_size_internal<T, decltype((void)(T::size() > 0))>
: std::integral_constant<std::size_t, T::size()> {
};
template <typename T>
struct is_simd_mask : public std::integral_constant<bool,
(is_simd_mask_internal<decay<T>>::value ||
is_simd_mask_array_internal<decay<T>>::value)>
{
};
template <typename T>
struct is_simd_vector
: public std::integral_constant<bool,
(is_simd_vector_internal<decay<T>>::value ||
is_simdarray_internal<decay<T>>::value)>
{
};
template <typename T>
struct isSimdArray : public is_simdarray_internal<decay<T>>
{
};
template <typename T>
struct isSimdMaskArray : public is_simd_mask_array_internal<decay<T>>
{
};
template <typename T> struct is_load_store_flag : public is_loadstoreflag_internal<decay<T>> {};
template <typename T> struct is_atomic_simdarray_internal : public std::false_type {};
template <typename T> using isAtomicSimdArray = is_atomic_simdarray_internal<decay<T>>;
template <typename T> struct is_atomic_simd_mask_array_internal : public std::false_type {};
template <typename T> using isAtomicSimdMaskArray = is_atomic_simd_mask_array_internal<decay<T>>;
template <typename T> struct simd_vector_size : public vector_size_internal<decay<T>> {};
template <typename T> struct is_integral : public is_integral_internal<decay<T>> {};
template <typename T> struct is_floating_point : public is_floating_point_internal<decay<T>> {};
template <typename T> struct is_arithmetic : public is_arithmetic_internal<decay<T>> {};
template <typename T> struct is_signed : public is_signed_internal<decay<T>> {};
template <typename T> struct is_unsigned : public is_unsigned_internal<decay<T>> {};
template <typename T, bool IsSimdVector> struct scalar_type_internal { using type = T; };
template <typename T> struct scalar_type_internal<T, true> { using type = typename T::EntryType; };
template <typename T> using scalar_type = typename scalar_type_internal<decay<T>, is_simd_vector<T>::value>::type;
}
}
#ifndef VC_TRAITS_ENTRY_TYPE_OF_H_
#define VC_TRAITS_ENTRY_TYPE_OF_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Traits
{
namespace entry_type_of_internal
{
template <typename T, bool = Traits::is_simd_vector<T>::value> struct entry_type;
template <typename T> struct entry_type<T, true>
{
using type = typename decay<T>::EntryType;
};
template <typename T> struct entry_type<T, false>
{
using type = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
};
}
template <typename T> using entry_type_of = typename entry_type_of_internal::entry_type<T>::type;
}
}
#endif
#endif
#ifndef VC_COMMON_PERMUTATION_H_
#define VC_COMMON_PERMUTATION_H_
#ifndef VC_COMMON_MACROS_H_
#define VC_COMMON_MACROS_H_
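// common/macros.h: compiler-specific attribute macros (Vc_INTRINSIC, Vc_ALWAYS_INLINE,
// Vc_CONST, Vc_PURE, Vc_MAY_ALIAS, Vc_DEPRECATED, branch hints, Vc_RESTRICT), the
// aligned operator new/delete generator Vc_FREE_STORE_OPERATORS_ALIGNED, Vc_ASSERT,
// and the Vc_LIST_*/Vc_ALL_* X-macro lists used to stamp out per-operator code.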
#ifdef Vc_MSVC
#define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \
typedef __declspec(align(n_)) type_ new_type_
#elif __GNUC__
#define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \
typedef type_ new_type_[[gnu::aligned(n_)]]
#else
#define Vc_ALIGNED_TYPEDEF(n_,type_,new_type_) \
using new_type_ alignas(sizeof(n_)) = type_
#endif
#ifdef WIN32
#define NOMINMAX 1
#if defined min
#undef min
#endif
#if defined max
#undef max
#endif
#endif
#if defined Vc_GCC && Vc_GCC >= 0x60000
#define Vc_TEMPLATES_DROP_ATTRIBUTES 1
#endif
#if Vc_IS_VERSION_2 || (defined Vc_GCC && Vc_GCC >= 0x60000)
#define Vc_RECURSIVE_MEMORY 1
#endif
#if defined Vc_CLANG || defined Vc_APPLECLANG
#define Vc_UNREACHABLE __builtin_unreachable
#define Vc_NEVER_INLINE [[gnu::noinline]]
#define Vc_INTRINSIC_L inline
#define Vc_INTRINSIC_R __attribute__((always_inline))
#define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
#define Vc_FLATTEN
#define Vc_CONST __attribute__((const))
#define Vc_CONST_L
#define Vc_CONST_R Vc_CONST
#define Vc_PURE __attribute__((pure))
#define Vc_PURE_L
#define Vc_PURE_R Vc_PURE
#define Vc_MAY_ALIAS __attribute__((may_alias))
#define Vc_ALWAYS_INLINE_L inline
#define Vc_ALWAYS_INLINE_R __attribute__((always_inline))
#define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
#define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
#define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
#define Vc_RESTRICT __restrict__
#define Vc_DEPRECATED(msg)
#define Vc_DEPRECATED_ALIAS(msg)
#define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#elif defined(__GNUC__)
#define Vc_UNREACHABLE __builtin_unreachable
# if defined Vc_GCC && !defined __OPTIMIZE__
#define Vc_MAY_ALIAS
# else
#define Vc_MAY_ALIAS __attribute__((__may_alias__))
# endif
#define Vc_INTRINSIC_R __attribute__((__always_inline__, __artificial__))
#define Vc_INTRINSIC_L inline
#define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
#define Vc_FLATTEN __attribute__((__flatten__))
#define Vc_ALWAYS_INLINE_L inline
#define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__))
#define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
# ifdef Vc_ICC
#define Vc_PURE
#define Vc_CONST
#define Vc_NEVER_INLINE
# else
#define Vc_NEVER_INLINE [[gnu::noinline]]
#define Vc_PURE __attribute__((__pure__))
#define Vc_CONST __attribute__((__const__))
# endif
#define Vc_CONST_L
#define Vc_CONST_R Vc_CONST
#define Vc_PURE_L
#define Vc_PURE_R Vc_PURE
#define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
#define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
#define Vc_RESTRICT __restrict__
# ifdef Vc_ICC
#define Vc_DEPRECATED(msg)
#define Vc_DEPRECATED_ALIAS(msg)
# else
#define Vc_DEPRECATED(msg) __attribute__((__deprecated__(msg)))
#define Vc_DEPRECATED_ALIAS(msg) __attribute__((__deprecated__(msg)))
# endif
#define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#else
#define Vc_NEVER_INLINE
#define Vc_FLATTEN
# ifdef Vc_PURE
#undef Vc_PURE
# endif
#define Vc_MAY_ALIAS
# ifdef Vc_MSVC
#define Vc_ALWAYS_INLINE inline __forceinline
#define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE
#define Vc_ALWAYS_INLINE_R
#define Vc_CONST __declspec(noalias)
#define Vc_CONST_L Vc_CONST
#define Vc_CONST_R
#define Vc_PURE
#define Vc_PURE_L Vc_PURE
#define Vc_PURE_R
#define Vc_INTRINSIC inline __forceinline
#define Vc_INTRINSIC_L Vc_INTRINSIC
#define Vc_INTRINSIC_R
namespace Vc_VERSIONED_NAMESPACE {
namespace detail
{
static Vc_INTRINSIC void unreachable() { __assume(0); }
}
}
#define Vc_UNREACHABLE Vc::detail::unreachable
# else
#define Vc_ALWAYS_INLINE
#define Vc_ALWAYS_INLINE_L
#define Vc_ALWAYS_INLINE_R
#define Vc_CONST
#define Vc_CONST_L
#define Vc_CONST_R
#define Vc_PURE
#define Vc_PURE_L
#define Vc_PURE_R
#define Vc_INTRINSIC
#define Vc_INTRINSIC_L
#define Vc_INTRINSIC_R
#define Vc_UNREACHABLE std::abort
# endif
#define Vc_IS_UNLIKELY(x) x
#define Vc_IS_LIKELY(x) x
#define Vc_RESTRICT __restrict
#define Vc_DEPRECATED(msg) __declspec(deprecated(msg))
#define Vc_DEPRECATED_ALIAS(msg)
#define Vc_WARN_UNUSED_RESULT
#endif
#ifdef Vc_CXX14
#undef Vc_DEPRECATED
#define Vc_DEPRECATED(msg_) [[deprecated(msg_)]]
#endif
#define Vc_NOTHING_EXPECTING_SEMICOLON static_assert(true, "")
#define Vc_FREE_STORE_OPERATORS_ALIGNED(align_) \
\
\
\
Vc_ALWAYS_INLINE void *operator new(size_t size) \
{ \
return Vc::Common::aligned_malloc<align_>(size); \
} \
\
Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \
\
Vc_ALWAYS_INLINE void *operator new[](size_t size) \
{ \
return Vc::Common::aligned_malloc<align_>(size); \
} \
\
Vc_ALWAYS_INLINE void *operator new[](size_t, void *p) { return p; } \
\
Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { Vc::Common::free(ptr); } \
\
Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \
\
Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) \
{ \
Vc::Common::free(ptr); \
} \
\
Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} \
\
Vc_NOTHING_EXPECTING_SEMICOLON
#ifdef Vc_ASSERT
#define Vc_EXTERNAL_ASSERT 1
#else
#ifdef NDEBUG
#define Vc_ASSERT(x)
#else
#include <assert.h>
#define Vc_ASSERT(x) assert(x);
#endif
#endif
#if defined Vc_CLANG || defined Vc_APPLECLANG
#define Vc_HAS_BUILTIN(x) __has_builtin(x)
#else
#define Vc_HAS_BUILTIN(x) 0
#endif
#define Vc_CAT_HELPER_(a,b,c,d) a ##b ##c ##d
#define Vc_CAT(a,b,c,d) Vc_CAT_HELPER_(a, b, c, d)
#define Vc_CAT_IMPL(a,b) a ##b
#define Vc_CAT2(a,b) Vc_CAT_IMPL(a, b)
#define Vc_APPLY_IMPL_1_(macro,a,b,c,d,e) macro(a)
#define Vc_APPLY_IMPL_2_(macro,a,b,c,d,e) macro(a, b)
#define Vc_APPLY_IMPL_3_(macro,a,b,c,d,e) macro(a, b, c)
#define Vc_APPLY_IMPL_4_(macro,a,b,c,d,e) macro(a, b, c, d)
#define Vc_APPLY_IMPL_5_(macro,a,b,c,d,e) macro(a, b, c, d, e)
#define Vc_LIST_FLOAT_VECTOR_TYPES(size,macro,a,b,c,d) \
size(macro, double_v, a, b, c, d) \
size(macro, float_v, a, b, c, d)
#define Vc_LIST_INT_VECTOR_TYPES(size,macro,a,b,c,d) \
size(macro, int_v, a, b, c, d) \
size(macro, uint_v, a, b, c, d) \
size(macro, short_v, a, b, c, d) \
size(macro, ushort_v, a, b, c, d)
#define Vc_LIST_VECTOR_TYPES(size,macro,a,b,c,d) \
Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \
Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d)
#define Vc_LIST_COMPARES(size,macro,a,b,c,d) \
size(macro, ==, a, b, c, d) \
size(macro, !=, a, b, c, d) \
size(macro, <=, a, b, c, d) \
size(macro, >=, a, b, c, d) \
size(macro, < , a, b, c, d) \
size(macro, > , a, b, c, d)
#define Vc_LIST_LOGICAL(size,macro,a,b,c,d) \
size(macro, &&, a, b, c, d) \
size(macro, ||, a, b, c, d)
#define Vc_LIST_BINARY(size,macro,a,b,c,d) \
size(macro, |, a, b, c, d) \
size(macro, &, a, b, c, d) \
size(macro, ^, a, b, c, d)
#define Vc_LIST_SHIFTS(size,macro,a,b,c,d) \
size(macro, <<, a, b, c, d) \
size(macro, >>, a, b, c, d)
#define Vc_LIST_ARITHMETICS(size,macro,a,b,c,d) \
size(macro, +, a, b, c, d) \
size(macro, -, a, b, c, d) \
size(macro, *, a, b, c, d) \
size(macro, /, a, b, c, d) \
size(macro, %, a, b, c, d)
#define Vc_APPLY_0(_list,macro) _list(Vc_APPLY_IMPL_1_, macro, 0, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_1(_list,macro,a) _list(Vc_APPLY_IMPL_2_, macro, a, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_2(_list,macro,a,b) _list(Vc_APPLY_IMPL_3_, macro, a, b, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_3(_list,macro,a,b,c) _list(Vc_APPLY_IMPL_4_, macro, a, b, c, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_4(_list,macro,a,b,c,d) _list(Vc_APPLY_IMPL_5_, macro, a, b, c, d) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_ALL_COMPARES(macro) Vc_APPLY_0(Vc_LIST_COMPARES, macro)
#define Vc_ALL_LOGICAL(macro) Vc_APPLY_0(Vc_LIST_LOGICAL, macro)
#define Vc_ALL_BINARY(macro) Vc_APPLY_0(Vc_LIST_BINARY, macro)
#define Vc_ALL_SHIFTS(macro) Vc_APPLY_0(Vc_LIST_SHIFTS, macro)
#define Vc_ALL_ARITHMETICS(macro) Vc_APPLY_0(Vc_LIST_ARITHMETICS, macro)
#define Vc_ALL_FLOAT_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_FLOAT_VECTOR_TYPES, macro)
#define Vc_ALL_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_VECTOR_TYPES, macro)
#define Vc_EXACT_TYPE(_test,_reference,_type) \
typename std::enable_if<std::is_same<_test, _reference>::value, _type>::type
#define Vc_make_unique(name) Vc_CAT(Vc_,name,_,__LINE__)
#if defined(Vc_ICC) || defined(Vc_CLANG) || defined Vc_APPLECLANG
#define Vc_OFFSETOF(Type,member) (reinterpret_cast<const char *>(&reinterpret_cast<const Type *>(0)->member) - reinterpret_cast<const char *>(0))
#else
#define Vc_OFFSETOF(Type,member) offsetof(Type, member)
#endif
#if defined(Vc_NO_NOEXCEPT)
#define Vc_NOEXCEPT throw()
#else
#define Vc_NOEXCEPT noexcept
#endif
#ifdef Vc_NO_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE_L
#undef Vc_ALWAYS_INLINE_R
#define Vc_ALWAYS_INLINE inline
#define Vc_ALWAYS_INLINE_L inline
#define Vc_ALWAYS_INLINE_R
#undef Vc_INTRINSIC
#undef Vc_INTRINSIC_L
#undef Vc_INTRINSIC_R
#define Vc_INTRINSIC inline
#define Vc_INTRINSIC_L inline
#define Vc_INTRINSIC_R
#endif
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Permutation
{
struct ReversedTag {};
constexpr ReversedTag Reversed{};
}
}
#endif
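// Library-wide types: short integer aliases (llong, uint, ushort, ...), the Zero/One/
// IndexesFromZero initializer tags, MayAlias and aliasing_cast for type punning
// without violating strict aliasing, an Operator enumeration naming the overloadable
// operators, SuccessiveEntries plus the Gather-/ScatterArguments helpers used by
// subscript-based gather/scatter, and the compile-time unrolled_loop helpers.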
namespace Vc_VERSIONED_NAMESPACE
{
using std::size_t;
using llong = long long;
using ullong = unsigned long long;
using ulong = unsigned long;
using uint = unsigned int;
using ushort = unsigned short;
using uchar = unsigned char;
using schar = signed char;
struct VectorSpecialInitializerZero {};
struct VectorSpecialInitializerOne {};
struct VectorSpecialInitializerIndexesFromZero {};
constexpr VectorSpecialInitializerZero Zero = {};
constexpr VectorSpecialInitializerOne One = {};
constexpr VectorSpecialInitializerIndexesFromZero IndexesFromZero = {};
namespace Detail
{
template<typename T> struct MayAliasImpl {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
#endif
typedef T type Vc_MAY_ALIAS;
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
};
}
#ifdef Vc_ICC
template <typename T> using MayAlias [[gnu::may_alias]] = T;
#else
template <typename T> using MayAlias = typename Detail::MayAliasImpl<T>::type;
#endif
template <class To, class From> MayAlias<To> &aliasing_cast(From &x)
{
return *reinterpret_cast<MayAlias<To> *>(&x);
}
template <class To, class From> const MayAlias<To> &aliasing_cast(const From &x)
{
return *reinterpret_cast<const MayAlias<To> *>(&x);
}
template <class To, class From> MayAlias<To> *aliasing_cast(From *x)
{
return reinterpret_cast<MayAlias<To> *>(x);
}
template <class To, class From> const MayAlias<To> *aliasing_cast(const From *x)
{
return reinterpret_cast<const MayAlias<To> *>(x);
}
enum class Operator : char {
Assign,
Multiply,
MultiplyAssign,
Divide,
DivideAssign,
Remainder,
RemainderAssign,
Plus,
PlusAssign,
Minus,
MinusAssign,
RightShift,
RightShiftAssign,
LeftShift,
LeftShiftAssign,
And,
AndAssign,
Xor,
XorAssign,
Or,
OrAssign,
PreIncrement,
PostIncrement,
PreDecrement,
PostDecrement,
LogicalAnd,
LogicalOr,
Comma,
UnaryPlus,
UnaryMinus,
UnaryNot,
UnaryOnesComplement,
CompareEqual,
CompareNotEqual,
CompareLess,
CompareGreater,
CompareLessEqual,
CompareGreaterEqual
};
template <typename T, std::size_t N> struct array;
namespace Common {
template <typename T, std::ptrdiff_t N> class span;
}
#ifndef Vc_CHECK_ALIGNMENT
template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *){}
#else
template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *ptr)
{
const size_t s = alignof(_T);
if((reinterpret_cast<size_t>(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) {
fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n");
abort();
}
}
#endif
namespace Common
{
template <typename T, std::size_t Pieces, std::size_t Index> struct Segment;
template<size_t StructSize> class SuccessiveEntries
{
#ifdef Vc_MSVC
using size_type = unsigned;
#else
using size_type = size_t;
#endif
const size_type m_first;
public:
typedef SuccessiveEntries AsArg;
Vc_INTRINSIC SuccessiveEntries(size_type first) : m_first(first) {}
Vc_INTRINSIC Vc_PURE size_type operator[](size_type offset) const
{
return m_first + offset * StructSize;
}
Vc_INTRINSIC Vc_PURE size_type data() const { return m_first; }
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator+(const SuccessiveEntries &rhs) const
{
return SuccessiveEntries(m_first + rhs.m_first);
}
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator*(const SuccessiveEntries &rhs) const
{
return SuccessiveEntries(m_first * rhs.m_first);
}
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator<<(size_type x) const
{
return {m_first << x};
}
friend Vc_INTRINSIC SuccessiveEntries &internal_data(SuccessiveEntries &x)
{
return x;
}
friend Vc_INTRINSIC const SuccessiveEntries &internal_data(const SuccessiveEntries &x)
{
return x;
}
};
template <std::size_t alignment>
Vc_INTRINSIC_L void *aligned_malloc(std::size_t n) Vc_INTRINSIC_R;
Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R;
template <typename Mask, typename T, typename U>
using enable_if_mask_converts_implicitly =
enable_if<(!std::is_same<Mask, Traits::decay<U>>::value &&
Traits::is_simd_mask<U>::value && !Traits::isSimdMaskArray<U>::value &&
Traits::is_implicit_cast_allowed_mask<
Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value)>;
template <typename T, typename U>
using enable_if_mask_converts_explicitly = enable_if<(
Traits::isSimdMaskArray<U>::value ||
(Traits::is_simd_mask<U>::value &&
!Traits::is_implicit_cast_allowed_mask<
Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value))>;
template <typename T> using WidthT = std::integral_constant<std::size_t, sizeof(T)>;
template <std::size_t Bytes> class MaskBool;
template <typename T, typename IndexVector, typename Scale, bool>
class SubscriptOperation;
template <class T, class IndexVector, int Scale = 1>
struct GatherArguments {
static_assert(std::is_same<T, remove_cvref_t<T>>::value && !std::is_pointer<T>::value,
"GatherArguments expects a cv-unqualified, non-reference, non-pointer type");
const IndexVector indexes;
const T *const address;
};
template <int Scale, class T, class I>
GatherArguments<T, I, Scale> make_gather(const T *m, const I &i)
{
return {i, m};
}
template <typename T, typename IndexVector> struct ScatterArguments
{
const IndexVector indexes;
T *const address;
};
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC enable_if<(Begin >= End), void> unrolled_loop(F &&)
{
}
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC Vc_FLATTEN enable_if<(Begin < End), void> unrolled_loop(F &&f)
{
f(Begin);
unrolled_loop<I, Begin + 1, End>(f);
}
template <std::size_t Size, typename F> Vc_INTRINSIC void for_all_vector_entries(F &&f)
{
unrolled_loop<std::size_t, 0u, Size>(std::forward<F>(f));
}
}
}
#ifndef VC_COMMON_VECTOR_H_
#define VC_COMMON_VECTOR_H_
#include <ratio>
#ifndef VC_COMMON_ELEMENTREFERENCE_H_
#define VC_COMMON_ELEMENTREFERENCE_H_
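// ElementReference: reference-like proxy for writable element access. Reads and
// writes are forwarded through Accessor::get/Accessor::set; the compound-assignment
// and increment/decrement operators are rvalue-ref-qualified so the proxy is only
// used immediately, and the swap overloads keep std::swap-style code working.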
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename U, typename Accessor = U> class ElementReference
{
friend U;
friend Accessor;
Vc_INTRINSIC ElementReference(U &o, int i) noexcept : index(i), obj(o) {}
static constexpr bool get_noexcept =
noexcept(Accessor::get(std::declval<U &>(), int()));
template <typename T> static constexpr bool set_noexcept()
{
return noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>()));
}
public:
using value_type = typename U::value_type;
Vc_INTRINSIC ElementReference(const ElementReference &) = delete;
Vc_INTRINSIC ElementReference(ElementReference &&) = default;
Vc_INTRINSIC operator value_type() const noexcept(get_noexcept)
{
return Accessor::get(obj, index);
}
template <typename T>
Vc_INTRINSIC ElementReference &operator=(T &&x) &&
noexcept(noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>())))
{
Accessor::set(obj, index, std::forward<T>(x));
return *this;
}
#define Vc_OP_(op_) \
template <typename T, typename R = decltype(std::declval<const value_type &>() \
op_ std::declval<T>())> \
Vc_INTRINSIC ElementReference &operator op_##=(T &&x) && \
noexcept(get_noexcept && noexcept(Accessor::set(std::declval<U &>(), int(), \
std::declval<R &&>()))) \
{ \
const value_type &lhs = Accessor::get(obj, index); \
Accessor::set(obj, index, lhs op_ std::forward<T>(x)); \
return *this; \
}
Vc_ALL_ARITHMETICS(Vc_OP_);
Vc_ALL_SHIFTS(Vc_OP_);
Vc_ALL_BINARY(Vc_OP_);
#undef Vc_OP_
template <typename = void>
Vc_INTRINSIC ElementReference &operator++() &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(++std::declval<value_type &>())>())
{
value_type x = Accessor::get(obj, index);
Accessor::set(obj, index, ++x);
return *this;
}
template <typename = void>
Vc_INTRINSIC value_type operator++(int) &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(std::declval<value_type &>()++)>())
{
const value_type r = Accessor::get(obj, index);
value_type x = r;
Accessor::set(obj, index, ++x);
return r;
}
template <typename = void>
Vc_INTRINSIC ElementReference &operator--() &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(--std::declval<value_type &>())>())
{
value_type x = Accessor::get(obj, index);
Accessor::set(obj, index, --x);
return *this;
}
template <typename = void>
Vc_INTRINSIC value_type operator--(int) &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(std::declval<value_type &>()--)>())
{
const value_type r = Accessor::get(obj, index);
value_type x = r;
Accessor::set(obj, index, --x);
return r;
}
friend void swap(ElementReference &&a, ElementReference &&b) {
value_type tmp(a);
static_cast<ElementReference &&>(a) = static_cast<value_type>(b);
static_cast<ElementReference &&>(b) = tmp;
}
friend void swap(value_type &a, ElementReference &&b) {
value_type tmp(a);
a = static_cast<value_type>(b);
static_cast<ElementReference &&>(b) = tmp;
}
friend void swap(ElementReference &&a, value_type &b) {
value_type tmp(a);
static_cast<ElementReference &&>(a) = b;
b = tmp;
}
private:
int index;
U &obj;
};
}
}
#endif
#ifndef VC_COMMON_VECTORABI_H_
#define VC_COMMON_VECTORABI_H_
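// ABI deduction: DeduceCompatible picks Sse on x86-64 (Scalar elsewhere), and
// DeduceBest maps CurrentImplementation onto Scalar, Sse, Avx1Abi<T> (integral types
// fall back to SSE on AVX1, which lacks integer vectors) or Avx for AVX2.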
namespace Vc_VERSIONED_NAMESPACE
{
namespace VectorAbi
{
template <typename T>
using Avx1Abi = typename std::conditional<std::is_integral<T>::value, VectorAbi::Sse,
VectorAbi::Avx>::type;
template <typename T> struct DeduceCompatible {
#ifdef __x86_64__
using type = Sse;
#else
using type = Scalar;
#endif
};
template <typename T>
struct DeduceBest {
using type = typename std::conditional<
CurrentImplementation::is(ScalarImpl), Scalar,
typename std::conditional<
CurrentImplementation::is_between(SSE2Impl, SSE42Impl), Sse,
typename std::conditional<
CurrentImplementation::is(AVXImpl), Avx1Abi<T>,
typename std::conditional<CurrentImplementation::is(AVX2Impl), Avx,
void>::type>::type>::type>::type;
};
template <typename T> using Best = typename DeduceBest<T>::type;
}
}
#ifndef VC_COMMON_SIMDARRAYFWD_H_
#define VC_COMMON_SIMDARRAYFWD_H_
#ifndef VC_SSE_TYPES_H_
#define VC_SSE_TYPES_H_
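// Per-ABI convenience aliases: double_v, float_v, int_v, ... and the corresponding
// masks in the SSE, AVX and AVX2 namespaces, plus the Vc_*_V_SIZE macros giving the
// default vector widths for the selected default implementation.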
#ifdef Vc_DEFAULT_IMPL_SSE
#define Vc_DOUBLE_V_SIZE 2
#define Vc_FLOAT_V_SIZE 4
#define Vc_INT_V_SIZE 4
#define Vc_UINT_V_SIZE 4
#define Vc_SHORT_V_SIZE 8
#define Vc_USHORT_V_SIZE 8
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Sse>;
typedef Vector<double> double_v;
typedef Vector<float> float_v;
typedef Vector<int> int_v;
typedef Vector<unsigned int> uint_v;
typedef Vector<short> short_v;
typedef Vector<unsigned short> ushort_v;
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Sse>;
typedef Mask<double> double_m;
typedef Mask<float> float_m;
typedef Mask<int> int_m;
typedef Mask<unsigned int> uint_m;
typedef Mask<short> short_m;
typedef Mask<unsigned short> ushort_m;
template <typename T> struct Const;
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
}
namespace Traits
{
template <class T> struct
is_simd_vector_internal<Vector<T, VectorAbi::Sse>>
: public is_valid_vector_argument<T> {};
template<typename T> struct is_simd_mask_internal<Mask<T, VectorAbi::Sse>>
: public std::true_type {};
}
}
#endif
#ifndef VC_AVX_TYPES_H_
#define VC_AVX_TYPES_H_
#ifndef VC_AVX_MACROS_H_
#define VC_AVX_MACROS_H_
#endif
#ifdef Vc_DEFAULT_IMPL_AVX2
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 8
#define Vc_UINT_V_SIZE 8
#define Vc_SHORT_V_SIZE 16
#define Vc_USHORT_V_SIZE 16
#elif defined Vc_DEFAULT_IMPL_AVX
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 4
#define Vc_UINT_V_SIZE 4
#define Vc_SHORT_V_SIZE 8
#define Vc_USHORT_V_SIZE 8
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx1Abi<T>>;
typedef Vector<double> double_v;
typedef Vector<float> float_v;
typedef Vector<int> int_v;
typedef Vector<unsigned int> uint_v;
typedef Vector<short> short_v;
typedef Vector<unsigned short> ushort_v;
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx1Abi<T>>;
typedef Mask<double> double_m;
typedef Mask<float> float_m;
typedef Mask<int> int_m;
typedef Mask<unsigned int> uint_m;
typedef Mask<short> short_m;
typedef Mask<unsigned short> ushort_m;
template <typename T> struct Const;
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
}
namespace AVX2
{
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx>;
using double_v = Vector<double>;
using float_v = Vector< float>;
using int_v = Vector< int>;
using uint_v = Vector< uint>;
using short_v = Vector< short>;
using ushort_v = Vector<ushort>;
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx>;
using double_m = Mask<double>;
using float_m = Mask< float>;
using llong_m = Mask< llong>;
using ullong_m = Mask<ullong>;
using long_m = Mask< long>;
using ulong_m = Mask< ulong>;
using int_m = Mask< int>;
using uint_m = Mask< uint>;
using short_m = Mask< short>;
using ushort_m = Mask<ushort>;
using schar_m = Mask< schar>;
using uchar_m = Mask< uchar>;
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
}
namespace Traits
{
template <class T> struct
is_simd_vector_internal<Vector<T, VectorAbi::Avx>>
: public is_valid_vector_argument<T> {};
template<typename T> struct is_simd_mask_internal<Mask<T, VectorAbi::Avx>>
: public std::true_type {};
}
}
#endif
#ifndef VC_COMMON_UTILITY_H_
#define VC_COMMON_UTILITY_H_
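// common/utility.h: NextPowerOfTwo rounds up to a power of two at compile time,
// BoundedAlignment caps alignment requests where compilers limit alignas, and
// left_size/right_size split an N-element SimdArray into a power-of-two left half
// and the remainder.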
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <size_t x, bool = (x & (x - 1)) == 0> struct NextPowerOfTwo;
template <size_t x>
struct NextPowerOfTwo<x, true> : public std::integral_constant<size_t, x> {
};
template <size_t x>
struct NextPowerOfTwo<x, false>
: public std::integral_constant<
size_t, NextPowerOfTwo<(x | (x >> 1) | (x >> 2) | (x >> 5)) + 1>::value> {
};
template <size_t A>
struct BoundedAlignment : public std::integral_constant<size_t,
#if defined Vc_MSVC || defined Vc_GCC
((A - 1) &
#ifdef Vc_MSVC
31
#elif defined __AVX__
255
#else
127
#endif
) + 1
#else
A
#endif
> {
};
template <std::size_t N> static constexpr std::size_t left_size()
{
return Common::NextPowerOfTwo<(N + 1) / 2>::value;
}
template <std::size_t N> static constexpr std::size_t right_size()
{
return N - left_size<N>();
}
}
}
#endif
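// Vector/Mask specialized for simd_abi::fixed_size<N> simply derive from
// SimdArray/SimdMaskArray, so fixed_size_simd<T, N> and SimdArray<T, N> are the same
// family of types; the Traits specializations below register them with the
// introspection traits defined earlier.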
namespace Vc_VERSIONED_NAMESPACE
{
template <class T, int N>
class Vector<T, simd_abi::fixed_size<N>> : public SimdArray<T, N>
{
using SimdArray<T, N>::SimdArray;
public:
Vc_INTRINSIC Vector(const Vector &x) : SimdArray<T, N>(x) {}
Vc_INTRINSIC Vector &operator=(const Vector &x)
{
SimdArray<T, N>::operator=(x);
return *this;
}
Vector() = default;
using abi_type = simd_abi::fixed_size<N>;
using abi = abi_type;
Vc_DEPRECATED("use Vector([](int n) { return n; }) instead of "
"Vector::IndexesFromZero()") static Vector IndexesFromZero()
{
return Vector([](size_t i) -> T { return i; });
}
Vc_DEPRECATED("use 0 instead of Vector::Zero()") static Vector Zero() { return 0; }
Vc_DEPRECATED("use 1 instead of Vector::One()") static Vector One() { return 1; }
};
template <class T, int N>
class Mask<T, simd_abi::fixed_size<N>> : public SimdMaskArray<T, N>
{
using SimdMaskArray<T, N>::SimdMaskArray;
public:
Vc_INTRINSIC Mask(const Mask &x) : SimdMaskArray<T, N>(x) {}
Vc_INTRINSIC Mask &operator=(const Mask &x)
{
SimdMaskArray<T, N>::operator=(x);
return *this;
}
Mask() = default;
using abi_type = simd_abi::fixed_size<N>;
using abi = abi_type;
};
template <typename T, std::size_t N> struct SimdArrayTraits {
static constexpr std::size_t N0 = Common::left_size<N>();
static constexpr std::size_t N1 = Common::right_size<N>();
using storage_type0 = fixed_size_simd<T, N0>;
using storage_type1 = fixed_size_simd<T, N1>;
};
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L V &internal_data(SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L const V &internal_data(const SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
namespace Traits
{
template <class T> struct is_fixed_size_simd : std::false_type {
};
template <class T, int N>
struct is_fixed_size_simd<fixed_size_simd<T, N>> : std::true_type {
};
template <class T, int N>
struct is_fixed_size_simd<fixed_size_simd_mask<T, N>> : std::true_type {
};
template <class T, int N>
struct is_simd_vector_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {};
template <class T, int N>
struct is_simd_mask_internal<fixed_size_simd_mask<T, N>> : is_valid_vector_argument<T> {};
template <typename T, std::size_t N, typename V>
struct is_atomic_simdarray_internal<SimdArray<T, N, V, N>> : is_valid_vector_argument<T> {};
template <typename T, int N>
struct is_atomic_simdarray_internal<fixed_size_simd<T, N>>
: is_atomic_simdarray_internal<SimdArray<T, N>> {
};
template <typename T, std::size_t N, typename V>
struct is_atomic_simd_mask_array_internal<SimdMaskArray<T, N, V, N>>
: is_valid_vector_argument<T> {
};
template <typename T, int N>
struct is_atomic_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
: is_atomic_simd_mask_array_internal<SimdMaskArray<T, N>> {
};
template <typename T, std::size_t N, typename VectorType, std::size_t M>
struct is_simdarray_internal<SimdArray<T, N, VectorType, M>>
: is_valid_vector_argument<T> {
};
template <typename T, int N>
struct is_simdarray_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {
};
template <typename T, std::size_t N, typename VectorType, std::size_t M>
struct is_simd_mask_array_internal<SimdMaskArray<T, N, VectorType, M>>
: is_valid_vector_argument<T> {
};
template <typename T, int N>
struct is_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
: is_valid_vector_argument<T> {
};
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_integral_internal<SimdArray<T, N, V, M>, false> : std::is_integral<T> {
};
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_floating_point_internal<SimdArray<T, N, V, M>, false>
: std::is_floating_point<T> {
};
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_signed_internal<SimdArray<T, N, V, M>, false> : std::is_signed<T> {
};
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_unsigned_internal<SimdArray<T, N, V, M>, false> : std::is_unsigned<T> {
};
template <typename T, std::size_t N>
struct has_no_allocated_data_impl<Vc::SimdArray<T, N>> : std::true_type {
};
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace detail
{
template <class T> struct is_fixed_size_abi : std::false_type {
};
template <int N> struct is_fixed_size_abi<simd_abi::fixed_size<N>> : std::true_type {
};
template <class T>
using not_fixed_size_abi = typename std::enable_if<!is_fixed_size_abi<T>::value, T>::type;
}
}
#endif
#ifndef VC_COMMON_VECTORTRAITS_H_
#define VC_COMMON_VECTORTRAITS_H_
namespace Vc_VERSIONED_NAMESPACE
{
template <typename T, typename Abi> struct VectorTraits;
}
#endif
#ifndef VC_COMMON_LOADSTOREFLAGS_H_
#define VC_COMMON_LOADSTOREFLAGS_H_
namespace Vc_VERSIONED_NAMESPACE
{
struct Exclusive {};
struct Shared {};
namespace LoadStoreFlags
{
struct StreamingFlag {};
struct UnalignedFlag {};
struct PrefetchFlagBase {};
template <size_t L1 = 16 * 64, size_t L2 = 128 * 64, typename ExclusiveOrShared_ = void>
struct PrefetchFlag : public PrefetchFlagBase {
typedef ExclusiveOrShared_ ExclusiveOrShared;
static constexpr size_t L1Stride = L1;
static constexpr size_t L2Stride = L2;
static constexpr bool IsExclusive = std::is_same<ExclusiveOrShared, Exclusive>::value;
static constexpr bool IsShared = std::is_same<ExclusiveOrShared, Shared>::value;
};
template<typename Base, typename Default, typename... LoadStoreFlags> struct ExtractType
{
typedef Default type;
};
template<typename Base, typename Default, typename T, typename... LoadStoreFlags> struct ExtractType<Base, Default, T, LoadStoreFlags...>
{
typedef typename std::conditional<std::is_base_of<Base, T>::value, T, typename ExtractType<Base, Default, LoadStoreFlags...>::type>::type type;
};
#ifdef Vc_ICC
#pragma warning(disable: 177)
#endif
template<typename... Flags> struct LoadStoreFlags
{
private:
typedef typename ExtractType<PrefetchFlagBase, PrefetchFlag<0, 0>, Flags...>::type Prefetch;
public:
constexpr LoadStoreFlags() {}
static constexpr bool IsStreaming = !std::is_same<typename ExtractType<StreamingFlag, void, Flags...>::type, void>::value;
static constexpr bool IsUnaligned = !std::is_same<typename ExtractType<UnalignedFlag, void, Flags...>::type, void>::value;
static constexpr bool IsAligned = !IsUnaligned;
static constexpr bool IsPrefetch = !std::is_same<typename ExtractType<PrefetchFlagBase, void, Flags...>::type, void>::value;
static constexpr bool IsExclusivePrefetch = Prefetch::IsExclusive;
static constexpr bool IsSharedPrefetch = Prefetch::IsShared;
static constexpr size_t L1Stride = Prefetch::L1Stride;
static constexpr size_t L2Stride = Prefetch::L2Stride;
typedef LoadStoreFlags<typename std::conditional<std::is_same<Flags, UnalignedFlag>::value, void, Flags>::type...> UnalignedRemoved;
typedef typename std::conditional<IsAligned && !IsStreaming, void *, void>::type EnableIfAligned;
typedef typename std::conditional<IsAligned && IsStreaming, void *, void>::type EnableIfStreaming;
typedef typename std::conditional<IsUnaligned && !IsStreaming, void *, void>::type EnableIfUnalignedNotStreaming;
typedef typename std::conditional<IsUnaligned && IsStreaming, void *, void>::type EnableIfUnalignedAndStreaming;
typedef typename std::conditional<IsUnaligned , void *, void>::type EnableIfUnaligned;
typedef typename std::conditional<!IsUnaligned , void *, void>::type EnableIfNotUnaligned;
typedef typename std::conditional<IsPrefetch , void *, void>::type EnableIfPrefetch;
typedef typename std::conditional<!IsPrefetch , void *, void>::type EnableIfNotPrefetch;
};
template<> struct LoadStoreFlags<>
{
constexpr LoadStoreFlags() {}
static constexpr bool IsStreaming = false;
static constexpr bool IsUnaligned = false;
static constexpr bool IsAligned = !IsUnaligned;
static constexpr bool IsPrefetch = false;
static constexpr bool IsExclusivePrefetch = false;
static constexpr bool IsSharedPrefetch = false;
static constexpr size_t L1Stride = 0;
static constexpr size_t L2Stride = 0;
typedef void* EnableIfAligned;
typedef void* EnableIfNotUnaligned;
typedef void* EnableIfNotPrefetch;
};
template<typename... LFlags, typename... RFlags>
constexpr LoadStoreFlags<LFlags..., RFlags...> operator|(LoadStoreFlags<LFlags...>, LoadStoreFlags<RFlags...>)
{
return LoadStoreFlags<LFlags..., RFlags...>();
}
}
using LoadStoreFlags::PrefetchFlag;
typedef LoadStoreFlags::LoadStoreFlags<> AlignedTag;
typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::StreamingFlag> StreamingTag;
typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::UnalignedFlag> UnalignedTag;
typedef UnalignedTag DefaultLoadTag;
typedef UnalignedTag DefaultStoreTag;
constexpr AlignedTag Aligned;
constexpr UnalignedTag Unaligned;
constexpr StreamingTag Streaming;
constexpr LoadStoreFlags::LoadStoreFlags<PrefetchFlag<>> PrefetchDefault;
template <size_t L1 = PrefetchFlag<>::L1Stride,
size_t L2 = PrefetchFlag<>::L2Stride,
typename ExclusiveOrShared = PrefetchFlag<>::ExclusiveOrShared>
struct Prefetch : public LoadStoreFlags::LoadStoreFlags<PrefetchFlag<L1, L2, ExclusiveOrShared>>
{
};
namespace Traits
{
template <typename... Ts>
struct is_loadstoreflag_internal<LoadStoreFlags::LoadStoreFlags<Ts...>> : public std::true_type
{
};
template <size_t L1, size_t L2, typename ExclusiveOrShared>
struct is_loadstoreflag_internal<Prefetch<L1, L2, ExclusiveOrShared>> : public std::true_type
{
};
}
}
#endif
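// The tag objects defined above select the load/store policy: Vc::Aligned
// promises that the pointer is aligned to Vector<T>::MemoryAlignment,
// Vc::Unaligned (the DefaultLoadTag/DefaultStoreTag) makes no such promise,
// Vc::Streaming requests non-temporal accesses, and Vc::Prefetch<L1, L2>
// additionally issues software prefetches with the given strides. Flags can
// be combined with operator|. Illustrative sketch, not part of the library
// (assumes Vc::float_v and a sufficiently aligned float buffer mem):
//
//   Vc::float_v a(mem, Vc::Aligned);                 // aligned load
//   Vc::float_v b(mem + 1, Vc::Unaligned);           // unaligned load
//   a.store(mem, Vc::Streaming);                     // non-temporal store
//   b.load(mem, Vc::Aligned | Vc::PrefetchDefault);  // combined flags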
#ifndef VC_COMMON_WRITEMASKEDVECTOR_H_
#define VC_COMMON_WRITEMASKEDVECTOR_H_
#include <utility>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename V, typename M = typename V::Mask> class WriteMaskedVector
{
static_assert(
V::Size == M::Size,
"incorrect use of Vc::Common::WriteMaskedVector<V, M>. V and M must have the same «Size».");
public:
typedef M Mask;
static constexpr size_t Size = V::Size;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));
Vc_INTRINSIC WriteMaskedVector(V &v, const Mask &k) : mask(k), vec(v)
{
}
Vc_INTRINSIC V &operator++()
{
V one = V::One();
one.setZeroInverted(mask);
return vec += one;
}
Vc_INTRINSIC V &operator--()
{
V one = V::One();
one.setZeroInverted(mask);
return vec -= one;
}
Vc_INTRINSIC V operator++(int)
{
V ret(vec);
operator++();
return ret;
}
Vc_INTRINSIC V operator--(int)
{
V ret(vec);
operator--();
return ret;
}
#define Vc_OPERATOR_(op) \
template <typename U> Vc_ALWAYS_INLINE void operator op##=(U &&x) \
{ \
operator=(static_cast<V>(vec op std::forward<U>(x))); \
}
Vc_ALL_BINARY(Vc_OPERATOR_);
Vc_ALL_ARITHMETICS(Vc_OPERATOR_);
Vc_ALL_SHIFTS(Vc_OPERATOR_);
#undef Vc_OPERATOR_
Vc_ALWAYS_INLINE void operator=(const V &x)
{
vec.assign(x, mask);
}
template <typename T, typename I, typename S>
Vc_ALWAYS_INLINE void operator=(SubscriptOperation<T, I, S, true> &&x)
{
vec.gather(std::move(x).gatherArguments(), mask);
}
template <typename F> Vc_INTRINSIC void call(const F &f) const
{
return vec.call(f, mask);
}
template <typename F> Vc_INTRINSIC V apply(const F &f) const
{
return vec.apply(f, mask);
}
template <typename F> Vc_INTRINSIC void call(F &&f) const
{
return vec.call(std::forward<F>(f), mask);
}
template <typename F> Vc_INTRINSIC V apply(F &&f) const
{
return vec.apply(std::forward<F>(f), mask);
}
private:
#ifdef Vc_ICC
const Mask &mask;
#else
const Mask mask;
#endif
V &vec;
};
}
}
#endif
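// WriteMaskedVector is the proxy returned by Vector::operator()(mask): it
// forwards assignments, compound assignments and increment/decrement to the
// wrapped vector, but only lanes selected by the mask are modified.
// Illustrative sketch (assumes Vc::float_v / Vc::float_m):
//
//   Vc::float_v v = Vc::float_v::Random() - 0.5f;
//   Vc::float_m negative = v < 0.f;
//   v(negative) = 0.f;    // overwrite only the lanes selected by the mask
//   v(negative) += 1.f;   // masked compound assignment
//   ++v(negative);        // masked increment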
#ifndef VC_COMMON_DETAIL_H_
#define VC_COMMON_DETAIL_H_
#include <vector>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename IV>
Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
sizeof(typename IV::EntryType) >= sizeof(int)),
const IV &>
convertIndexVector(const IV &indexVector)
{
return indexVector;
}
template <typename IV>
Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
sizeof(typename IV::EntryType) < sizeof(int)),
fixed_size_simd<int, IV::Size>>
convertIndexVector(const IV &indexVector)
{
return static_cast<fixed_size_simd<int, IV::Size>>(indexVector);
}
template <class T> using promoted_type = decltype(std::declval<T>() + 1);
template <typename T, std::size_t N>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
convertIndexVector(const std::array<T, N> &indexVector)
{
return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
Vc::Unaligned};
}
template <typename T, std::size_t N>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
convertIndexVector(const Vc::array<T, N> &indexVector)
{
return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
Vc::Unaligned};
}
template <typename T, std::size_t N>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
convertIndexVector(const T (&indexVector)[N])
{
return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
Vc::Unaligned};
}
#ifndef Vc_MSVC
template <class T>
enable_if<std::is_pointer<T>::value, void> convertIndexVector(T indexVector) = delete;
#endif
template <typename T>
Vc_INTRINSIC std::vector<promoted_type<T>> convertIndexVector(
const std::initializer_list<T> &indexVector)
{
return {begin(indexVector), end(indexVector)};
}
template <typename T>
Vc_INTRINSIC
enable_if<(std::is_integral<T>::value && sizeof(T) >= sizeof(int)), std::vector<T>>
convertIndexVector(const std::vector<T> &indexVector)
{
return indexVector;
}
template <typename T>
Vc_INTRINSIC enable_if<(std::is_integral<T>::value && sizeof(T) < sizeof(int)),
std::vector<promoted_type<T>>>
convertIndexVector(const std::vector<T> &indexVector)
{
return {std::begin(indexVector), std::end(indexVector)};
}
template <class T,
class = enable_if<
(!std::is_pointer<T>::value && !Traits::is_simd_vector<T>::value &&
!std::is_lvalue_reference<decltype(std::declval<const T &>()[0])>::value)>>
Vc_INTRINSIC const T &convertIndexVector(const T &i)
{
return i;
}
}
}
#endif
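// convertIndexVector normalizes the index argument of the gather/scatter
// interface declared below: SIMD index vectors with entries narrower than int
// are widened to fixed_size_simd<int, N>; builtin arrays, std::array and
// Vc::array of integers are loaded into a fixed_size_simd of the promoted
// integer type; std::vector and initializer lists of narrow integers are
// copied into a std::vector of the promoted type; plain pointers are rejected
// (deleted overload) because their extent is unknown. Illustrative sketch
// (assumes Vc::float_v and a float array data):
//
//   std::array<unsigned short, Vc::float_v::size()> idx{};  // indexes into data
//   Vc::float_v x(data, idx);  // idx is promoted to fixed_size_simd<int, N> internally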
namespace Vc_VERSIONED_NAMESPACE
{
template <typename T, typename Abi,
typename = enable_if<std::is_floating_point<T>::value &&
!detail::is_fixed_size_abi<Abi>::value>>
inline Vector<T, Abi> copysign(Vector<T, Abi> magnitude, Vector<T, Abi> sign);
template <typename T, typename Abi,
typename = enable_if<std::is_floating_point<T>::value &&
!detail::is_fixed_size_abi<Abi>::value>>
inline Vector<T, Abi> exponent(Vector<T, Abi> x);
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST typename Vector<T, detail::not_fixed_size_abi<Abi>>::MaskType
isnegative(Vector<T, Abi> x)
{
return x < Vector<T, Abi>::Zero();
}
template<typename T, typename Abi = VectorAbi::Best<T>> class Vector
{
public:
static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::memoryAlignment();
using abi = Abi;
using EntryType = typename VectorTraits<T, Abi>::EntryType;
using value_type = EntryType;
using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
using VectorType = typename VectorTraits<T, Abi>::VectorType;
using vector_type = VectorType;
using MaskType = Vc::Mask<T, Abi>;
using mask_type = MaskType;
using MaskArgument = MaskType;
using VectorArgument = Vector;
using IndexType = Vc::fixed_size_simd<int, VectorTraits<T, Abi>::size()>;
using index_type = IndexType;
using reference = Detail::ElementReference<Vector>;
static inline Vector Zero();
static inline Vector One();
static inline Vector IndexesFromZero();
static inline Vector Random();
template <typename G> static inline Vector generate(G gen);
inline Vector() = default;
explicit inline Vector(VectorSpecialInitializerZero);
explicit inline Vector(VectorSpecialInitializerOne);
explicit inline Vector(VectorSpecialInitializerIndexesFromZero);
template <typename U>
inline Vector(Vector<U, abi> x,
enable_if<Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
#if Vc_IS_VERSION_1
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") inline explicit Vector(
Vector<U, abi> x,
enable_if<!Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
#endif
inline Vector(EntryType a);
template <typename U>
inline Vector(U a, enable_if<std::is_same<U, int>::value &&
!std::is_same<U, EntryType>::value> = nullarg);
inline explicit Vector(reference a);
explicit Vc_INTRINSIC Vector(const EntryType *mem)
{
load(mem);
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
{
load(mem, flags);
}
template <typename U, typename Flags = DefaultLoadTag,
typename = enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
{
load<U, Flags>(x, flags);
}
Vc_INTRINSIC void load(const EntryType *mem)
{
load(mem, DefaultLoadTag());
}
template <typename Flags>
Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
load(const EntryType *mem, Flags flags)
{
load<EntryType, Flags>(mem, flags);
}
private:
template <typename U, typename Flags>
struct load_concept : public std::enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
{};
public:
template <typename U, typename Flags = DefaultLoadTag>
Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
Vc_INTRINSIC void store(EntryType *mem) const
{
store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
{
store<EntryType, Flags>(mem, flags);
}
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
{
store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
{
store<EntryType, Flags>(mem, mask, flags);
}
inline void setZero();
inline void setZero(MaskType mask);
inline void setZeroInverted(MaskType mask);
inline void setQnan();
inline void setQnan(MaskType mask);
#define Vc_CURRENT_CLASS_NAME Vector
#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif
private:
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
MaskArgument mask);
public:
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<MT, EntryType>::value, \
"The memory pointer needs to point to a type that can be converted to the " \
"EntryType of this SIMD vector type."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT, typename IT,
typename = enable_if<Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
private:
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes) const;
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
public:
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<EntryType, MT>::value, \
"The memory pointer needs to point to a type that the EntryType of this " \
"SIMD vector type can be converted to."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes));
}
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes), mask);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
{
scatter(args.address, args.indexes);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
{
scatter(args.address, args.indexes, mask);
}
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
#undef Vc_CURRENT_CLASS_NAME
inline reference operator[](size_t index) noexcept;
inline EntryType operator[](size_t index) const noexcept;
inline MaskType operator!() const;
inline Vector operator~() const;
inline Vector operator-() const;
inline Vector operator+() const;
inline Vector &operator++();
inline Vector operator++(int);
inline Vector &operator--();
inline Vector operator--(int);
#define Vc_OP(symbol) \
inline Vc_PURE Vector operator symbol(const Vector &x) const;
Vc_ALL_ARITHMETICS(Vc_OP);
Vc_ALL_BINARY(Vc_OP);
Vc_ALL_SHIFTS(Vc_OP);
#undef Vc_OP
#define Vc_CMP_OP(symbol) inline Vc_PURE MaskType operator symbol(const Vector &x) const;
Vc_ALL_COMPARES(Vc_CMP_OP);
#undef Vc_CMP_OP
inline Common::WriteMaskedVector<Vector, MaskType> operator()(MaskType mask);
inline EntryType min() const;
inline EntryType max() const;
inline EntryType product() const;
inline EntryType sum() const;
inline Vector partialSum() const;
inline EntryType min(MaskType mask) const;
inline EntryType max(MaskType mask) const;
inline EntryType product(MaskType mask) const;
inline EntryType sum(MaskType mask) const;
inline Vector shifted(int amount) const;
inline Vector shifted(int amount, Vector shiftIn) const;
inline Vector rotated(int amount) const;
inline Vector reversed() const;
inline Vector sorted() const;
template <typename F> void callWithValuesSorted(F &&f);
template <typename F> inline void call(F &&f) const;
template <typename F> inline void call(F &&f, MaskType mask) const;
template <typename F> inline Vector apply(F &&f) const;
template <typename F> inline Vector apply(F &&f, MaskType mask) const;
template <typename IndexT> inline void fill(EntryType(&f)(IndexT));
inline void fill(EntryType(&f)());
inline Vector interleaveLow(Vector x) const;
inline Vector interleaveHigh(Vector x) const;
inline void assign(const Vector &v, const MaskType &m);
inline VectorType &data();
inline const VectorType &data() const;
Vc_DEPRECATED("use exponent(x) instead") inline Vector exponent() const;
Vc_DEPRECATED("use isnegative(x) instead") inline MaskType isNegative() const;
static constexpr size_t Size = VectorTraits<T, Abi>::size();
template <typename V2> inline V2 staticCast() const;
template <typename V2>
Vc_DEPRECATED("use reinterpret_components_cast instead") inline V2
reinterpretCast() const;
Vc_DEPRECATED("use copysign(x, y) instead") inline Vector
copySign(Vector reference) const;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Vector));
private:
VectorType d;
};
template <typename V, typename T, typename Abi>
Vc_ALWAYS_INLINE Vc_CONST enable_if<
(V::size() == Vector<T, Abi>::size() &&
sizeof(typename V::VectorEntryType) ==
sizeof(typename Vector<T, Abi>::VectorEntryType) &&
sizeof(V) == sizeof(Vector<T, Abi>) && alignof(V) <= alignof(Vector<T, Abi>)),
V>
reinterpret_components_cast(const Vector<T, Abi> &x)
{
return reinterpret_cast<const V &>(x);
}
#define Vc_OP(symbol) \
template <typename T, typename Abi> \
inline Vector<T, Abi> &operator symbol##=(Vector<T, Abi> &, \
const Vector<T, Abi> &x);
#undef Vc_OP
}
#endif
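// The class above declares the generic Vector<T, Abi> interface; the
// ABI-specific classes that follow (Scalar here, SIMD targets where enabled)
// provide the matching definitions. Illustrative sketch of the load,
// gather/scatter and write-masking interface (assumes Vc::float_v, Vc::int_v
// and a float array data):
//
//   Vc::float_v x(data);                  // unaligned load (DefaultLoadTag)
//   Vc::int_v indexes = Vc::int_v::IndexesFromZero() * 2;
//   Vc::float_v gathered(data, indexes);  // gather: gathered[i] = data[indexes[i]]
//   gathered(gathered > 0.f) += 1.f;      // write-masked update
//   gathered.scatter(data, indexes);      // scatter back to the same locations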
#ifndef VC_COMMON_MASK_H_
#define VC_COMMON_MASK_H_
namespace Vc_VERSIONED_NAMESPACE
{
template <typename T, typename Abi = VectorAbi::Best<T>> class Mask
{
public:
static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
static constexpr size_t Size = VectorTraits<T, Abi>::size();
static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::maskMemoryAlignment();
using abi = Abi;
using EntryType = bool;
using value_type = EntryType;
using EntryReference = typename VectorTraits<T, Abi>::EntryReference;
using value_reference = EntryReference;
using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
using VectorType = typename VectorTraits<T, Abi>::VectorType;
using vector_type = VectorType;
Vc_INTRINSIC static Mask Zero();
Vc_INTRINSIC static Mask One();
template <typename G> static Vc_INTRINSIC Mask generate(G &&gen);
Vc_INTRINSIC Mask() = default;
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero);
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne);
Vc_INTRINSIC explicit Mask(bool b);
template <typename U>
Vc_INTRINSIC Mask(U &&otherMask,
Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg);
#if Vc_IS_VERSION_1
template <typename U>
Vc_DEPRECATED(
"use simd_cast instead of explicit type casting to convert between mask types")
Vc_INTRINSIC_L
explicit Mask(U &&otherMask, Common::enable_if_mask_converts_explicitly<T, U> =
nullarg) Vc_INTRINSIC_R;
#endif
Vc_ALWAYS_INLINE explicit Mask(const bool *mem);
template <typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags flags);
Vc_ALWAYS_INLINE void load(const bool *mem);
template <typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags flags);
Vc_ALWAYS_INLINE void store(bool *mem) const;
template <typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags flags) const;
Vc_ALWAYS_INLINE bool operator==(const Mask &mask) const;
Vc_ALWAYS_INLINE bool operator!=(const Mask &mask) const;
Vc_ALWAYS_INLINE Mask operator&&(const Mask &mask) const;
Vc_ALWAYS_INLINE Mask operator&(const Mask &mask) const;
Vc_ALWAYS_INLINE Mask operator||(const Mask &mask) const;
Vc_ALWAYS_INLINE Mask operator|(const Mask &mask) const;
Vc_ALWAYS_INLINE Mask operator^(const Mask &mask) const;
Vc_ALWAYS_INLINE Mask operator!() const;
Vc_ALWAYS_INLINE Mask &operator&=(const Mask &mask);
Vc_ALWAYS_INLINE Mask &operator|=(const Mask &mask);
Vc_ALWAYS_INLINE Mask &operator^=(const Mask &mask);
Vc_ALWAYS_INLINE bool isFull() const;
Vc_ALWAYS_INLINE bool isNotEmpty() const;
Vc_ALWAYS_INLINE bool isEmpty() const;
Vc_ALWAYS_INLINE bool isMix() const;
Vc_ALWAYS_INLINE bool data() const;
Vc_ALWAYS_INLINE bool dataI() const;
Vc_ALWAYS_INLINE bool dataD() const;
Vc_ALWAYS_INLINE EntryReference operator[](size_t index);
Vc_ALWAYS_INLINE EntryType operator[](size_t index) const;
Vc_ALWAYS_INLINE int count() const;
Vc_ALWAYS_INLINE int firstOne() const;
Vc_ALWAYS_INLINE int toInt() const;
Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));
private:
VectorType d;
};
template<typename Mask> constexpr bool all_of(const Mask &m) { return m.isFull(); }
constexpr bool all_of(bool b) { return b; }
template<typename Mask> constexpr bool any_of(const Mask &m) { return m.isNotEmpty(); }
constexpr bool any_of(bool b) { return b; }
template<typename Mask> constexpr bool none_of(const Mask &m) { return m.isEmpty(); }
constexpr bool none_of(bool b) { return !b; }
template<typename Mask> constexpr bool some_of(const Mask &m) { return m.isMix(); }
constexpr bool some_of(bool) { return false; }
}
#endif
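// The free functions above reduce a mask to a single bool; the bool overloads
// let the same code work with scalar conditions. Illustrative sketch (assumes
// Vc::float_v / Vc::float_m and a float array data):
//
//   Vc::float_v x(data), y(data + Vc::float_v::size());
//   Vc::float_m m = x < y;
//   if (Vc::all_of(m))  { /* every lane compared true   */ }
//   if (Vc::any_of(m))  { /* at least one lane is true  */ }
//   if (Vc::none_of(m)) { /* no lane is true            */ }
//   if (Vc::some_of(m)) { /* some, but not all, lanes   */ }
//
// Note that some_of(bool) is always false, just as isMix() is for a one-lane mask.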
#ifndef VC_COMMON_MEMORYFWD_H_
#define VC_COMMON_MEMORYFWD_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename V, std::size_t Size1 = 0, std::size_t Size2 = 0,
bool InitPadding = true>
class Memory;
template <typename V, typename Parent, int Dimension, typename RowMemory>
class MemoryBase;
}
using Common::Memory;
}
#endif
#endif
#ifndef VC_SCALAR_TYPES_H_
#define VC_SCALAR_TYPES_H_
#ifdef Vc_DEFAULT_IMPL_Scalar
#define Vc_DOUBLE_V_SIZE 1
#define Vc_FLOAT_V_SIZE 1
#define Vc_INT_V_SIZE 1
#define Vc_UINT_V_SIZE 1
#define Vc_SHORT_V_SIZE 1
#define Vc_USHORT_V_SIZE 1
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Scalar
{
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Scalar>;
typedef Vector<double> double_v;
typedef Vector<float> float_v;
typedef Vector<int> int_v;
typedef Vector<unsigned int> uint_v;
typedef Vector<short> short_v;
typedef Vector<unsigned short> ushort_v;
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Scalar>;
typedef Mask<double> double_m;
typedef Mask<float> float_m;
typedef Mask<int> int_m;
typedef Mask<unsigned int> uint_m;
typedef Mask<short> short_m;
typedef Mask<unsigned short> ushort_m;
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
}
namespace Traits
{
template <typename T> struct is_simd_mask_internal<Scalar::Mask<T>>
: public std::true_type {};
template <class T> struct
is_simd_vector_internal<Vector<T, VectorAbi::Scalar>>
: public is_valid_vector_argument<T> {};
}
}
#endif
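// With the scalar ABI every vector and mask holds exactly one element, so code
// written against the Vc API still compiles and runs on targets without SIMD
// support. A small sanity check (sketch):
//
//   static_assert(Vc::Scalar::float_v::size() == 1, "scalar vectors have one lane");
//   static_assert(Vc::Scalar::float_m::Size == 1, "scalar masks have one lane");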
#ifndef VC_SCALAR_DETAIL_H_
#define VC_SCALAR_DETAIL_H_
#ifndef VC_SCALAR_MACROS_H_
#define VC_SCALAR_MACROS_H_
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template<typename V, size_t Size, size_t VSize> struct InterleaveImpl;
template<typename V, size_t VSize> struct InterleaveImpl<V, 1, VSize> {
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
data[i[0] + 0] = v0.data();
data[i[0] + 1] = v1.data();
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2)
{
data[i[0] + 0] = v0.data();
data[i[0] + 1] = v1.data();
data[i[0] + 2] = v2.data();
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
data[i[0] + 0] = v0.data();
data[i[0] + 1] = v1.data();
data[i[0] + 2] = v2.data();
data[i[0] + 3] = v3.data();
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4)
{
interleave(data, i, v0, v1, v2, v3);
data[i[0] + 4] = v4.data();
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6, const typename V::AsArg v7)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6, v7);
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1)
{
v0.data() = data[i[0] + 0];
v1.data() = data[i[0] + 1];
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1, V &v2)
{
v0.data() = data[i[0] + 0];
v1.data() = data[i[0] + 1];
v2.data() = data[i[0] + 2];
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1, V &v2, V &v3)
{
v0.data() = data[i[0] + 0];
v1.data() = data[i[0] + 1];
v2.data() = data[i[0] + 2];
v3.data() = data[i[0] + 3];
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1, V &v2, V &v3, V &v4)
{
deinterleave(data, i, v0, v1, v2, v3);
v4.data() = data[i[0] + 4];
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
{
deinterleave(data, i, v0, v1, v2, v3);
deinterleave(data + 4, i, v4, v5);
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
{
deinterleave(data, i, v0, v1, v2, v3);
deinterleave(data + 4, i, v4, v5, v6);
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6,
V &v7)
{
deinterleave(data, i, v0, v1, v2, v3);
deinterleave(data + 4, i, v4, v5, v6, v7);
}
};
}
}
#endif
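// InterleaveImpl<V, 1, VSize> is the scalar backend for (de)interleaving
// structured (array-of-struct) memory: each "vector" holds a single element,
// so interleaving v0..v3 at index i stores them to data[i[0]], data[i[0]+1],
// data[i[0]+2], data[i[0]+3]; the overloads for five or more vectors reuse the
// four-element case and continue at data + 4. deinterleave performs the
// inverse loads. Illustrative sketch, exercising the backend directly (it is
// normally reached through Vc's interleaved gather/scatter helpers):
//
//   float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};   // x0 y0 x1 y1 ... pairs
//   Vc::Scalar::float_v vx, vy;
//   int i[1] = {2};
//   Vc::Detail::InterleaveImpl<Vc::Scalar::float_v, 1, sizeof(float)>
//       ::deinterleave(data, i, vx, vy);        // vx holds data[2], vy holds data[3]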
#ifndef VC_SCALAR_MASK_H_
#define VC_SCALAR_MASK_H_
namespace Vc_VERSIONED_NAMESPACE
{
template <typename T> class Mask<T, VectorAbi::Scalar>
{
friend class Mask< double, VectorAbi::Scalar>;
friend class Mask< float, VectorAbi::Scalar>;
friend class Mask< int32_t, VectorAbi::Scalar>;
friend class Mask<uint32_t, VectorAbi::Scalar>;
friend class Mask< int16_t, VectorAbi::Scalar>;
friend class Mask<uint16_t, VectorAbi::Scalar>;
public:
using abi = VectorAbi::Scalar;
static constexpr size_t Size = 1;
static constexpr size_t MemoryAlignment = 1;
static constexpr std::size_t size() { return 1; }
typedef bool EntryType;
using value_type = EntryType;
using EntryReference = Vc::Detail::ElementReference<Mask>;
using reference = EntryReference;
typedef bool VectorEntryType;
using VectorType = bool;
using Vector = Scalar::Vector<T>;
Vc_INTRINSIC Mask() = default;
Vc_INTRINSIC explicit Mask(bool b) : m(b) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : m(false) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : m(true) {}
Vc_INTRINSIC static Mask Zero() { return Mask(false); }
Vc_INTRINSIC static Mask One() { return Mask(true); }
template <typename U>
Vc_INTRINSIC Mask(U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
: m(rhs.m) {}
#if Vc_IS_VERSION_1
template <typename U>
Vc_DEPRECATED(
"use simd_cast instead of explicit type casting to convert between mask types")
Vc_INTRINSIC_L
explicit Mask(U &&rhs, Common::enable_if_mask_converts_explicitly<T, U> = nullarg)
Vc_INTRINSIC_R;
#endif
Vc_ALWAYS_INLINE explicit Mask(const bool *mem) : m(mem[0]) {}
template<typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags) : m(mem[0]) {}
Vc_ALWAYS_INLINE void load(const bool *mem) { m = mem[0]; }
template<typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { m = mem[0]; }
Vc_ALWAYS_INLINE void store(bool *mem) const { *mem = m; }
template<typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { *mem = m; }
Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return m == rhs.m; }
Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return m != rhs.m; }
Vc_ALWAYS_INLINE Mask operator&&(const Mask &rhs) const { return Mask(m && rhs.m); }
Vc_ALWAYS_INLINE Mask operator& (const Mask &rhs) const { return Mask(m && rhs.m); }
Vc_ALWAYS_INLINE Mask operator||(const Mask &rhs) const { return Mask(m || rhs.m); }
Vc_ALWAYS_INLINE Mask operator| (const Mask &rhs) const { return Mask(m || rhs.m); }
Vc_ALWAYS_INLINE Mask operator^ (const Mask &rhs) const { return Mask(m ^ rhs.m); }
Vc_ALWAYS_INLINE Mask operator!() const { return Mask(!m); }
Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { m &= rhs.m; return *this; }
Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { m |= rhs.m; return *this; }
Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { m ^= rhs.m; return *this; }
Vc_ALWAYS_INLINE bool isFull () const { return m; }
Vc_ALWAYS_INLINE bool isNotEmpty() const { return m; }
Vc_ALWAYS_INLINE bool isEmpty() const { return !m; }
Vc_ALWAYS_INLINE bool isMix () const { return false; }
Vc_ALWAYS_INLINE bool data () const { return m; }
Vc_ALWAYS_INLINE bool dataI() const { return m; }
Vc_ALWAYS_INLINE bool dataD() const { return m; }
private:
friend reference;
static Vc_INTRINSIC bool get(const Mask &o, int) noexcept { return o.m; }
template <typename U>
static Vc_INTRINSIC void set(Mask &o, int, U &&v) noexcept(
noexcept(std::declval<bool &>() = std::declval<U>()))
{
o.m = std::forward<U>(v);
}
public:
Vc_ALWAYS_INLINE reference operator[](size_t i) noexcept
{
Vc_ASSERT(i == 0); if (i) {}
return {*this, 0};
}
Vc_ALWAYS_INLINE value_type operator[](size_t i) const noexcept
{
Vc_ASSERT(i == 0); if (i) {}
return m;
}
Vc_ALWAYS_INLINE int count() const { return m ? 1 : 0; }
Vc_ALWAYS_INLINE int firstOne() const { return 0; }
Vc_ALWAYS_INLINE int toInt() const { return m ? 1 : 0; }
template <typename G> static Vc_INTRINSIC Mask generate(G &&gen)
{
return Mask(gen(0));
}
Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const
{
if (amount == 0) {
return *this;
} else {
return Zero();
}
}
private:
bool m;
};
template <typename T> constexpr size_t Mask<T, VectorAbi::Scalar>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Scalar>::MemoryAlignment;
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
#define Vc_CURRENT_CLASS_NAME Vector
template <typename T> class Vector<T, VectorAbi::Scalar>
{
static_assert(std::is_arithmetic<T>::value,
"Vector<T> only accepts arithmetic builtin types as template parameter T.");
public:
using abi = VectorAbi::Scalar;
using EntryType = T;
using VectorEntryType = EntryType;
using value_type = EntryType;
using VectorType = EntryType;
using vector_type = VectorType;
using reference = Detail::ElementReference<Vector>;
protected:
VectorType m_data = VectorType();
template <typename U> using V = Vector<U, abi>;
public:
typedef Scalar::Mask<T> Mask;
using MaskType = Mask;
using mask_type = Mask;
typedef Mask MaskArgument;
typedef Vector AsArg;
Vc_ALWAYS_INLINE VectorType &data() { return m_data; }
Vc_ALWAYS_INLINE const VectorType &data() const { return m_data; }
static constexpr size_t Size = 1;
static constexpr size_t MemoryAlignment = alignof(VectorType);
using IndexType = fixed_size_simd<int, 1>;
using index_type = IndexType;
public:
Vc_INTRINSIC Vector() = default;
static constexpr std::size_t size() { return Size; }
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
{
return Vector(Vc::IndexesFromZero);
}
template <class G, int = 0,
class = typename std::enable_if<std::is_convertible<
decltype(std::declval<G>()(size_t())), value_type>::value>::type>
explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
{
}
static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R;
template <typename U>
Vc_INTRINSIC Vector(
V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: m_data(static_cast<EntryType>(x.data()))
{
}
#if Vc_IS_VERSION_1
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") Vc_INTRINSIC
explicit Vector(
V<U> x,
typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: m_data(static_cast<EntryType>(x.data()))
{
}
#endif
Vc_INTRINSIC explicit Vector(reference a) : Vector(static_cast<EntryType>(a)) {}
Vc_INTRINSIC Vector(EntryType a) : m_data(a) {}
template <typename U>
Vc_INTRINSIC Vector(U a,
typename std::enable_if<std::is_same<U, int>::value &&
!std::is_same<U, EntryType>::value,
void *>::type = nullptr)
: Vector(static_cast<EntryType>(a))
{
}
explicit Vc_INTRINSIC Vector(const EntryType *mem)
{
load(mem);
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
{
load(mem, flags);
}
template <typename U, typename Flags = DefaultLoadTag,
typename = enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
{
load<U, Flags>(x, flags);
}
Vc_INTRINSIC void load(const EntryType *mem)
{
load(mem, DefaultLoadTag());
}
template <typename Flags>
Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
load(const EntryType *mem, Flags flags)
{
load<EntryType, Flags>(mem, flags);
}
private:
template <typename U, typename Flags>
struct load_concept : public std::enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
{};
public:
template <typename U, typename Flags = DefaultLoadTag>
Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
Vc_INTRINSIC void store(EntryType *mem) const
{
store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
{
store<EntryType, Flags>(mem, flags);
}
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
{
store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
{
store<EntryType, Flags>(mem, mask, flags);
}
Vc_ALWAYS_INLINE void setZero() { m_data = 0; }
Vc_ALWAYS_INLINE void setZero(Mask k) { if (k.data()) m_data = 0; }
Vc_ALWAYS_INLINE void setZeroInverted(Mask k) { if (!k.data()) m_data = 0; }
Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setQnan(Mask m) Vc_INTRINSIC_R;
#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif
private:
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
MaskArgument mask);
public:
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<MT, EntryType>::value, \
"The memory pointer needs to point to a type that can be converted to the " \
"EntryType of this SIMD vector type."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT, typename IT,
typename = enable_if<Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
private:
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes) const;
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
public:
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<EntryType, MT>::value, \
"The memory pointer needs to point to a type that the EntryType of this " \
"SIMD vector type can be converted to."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes));
}
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes), mask);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
{
scatter(args.address, args.indexes);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
{
scatter(args.address, args.indexes, mask);
}
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
Vc_ALWAYS_INLINE Vector &operator++() { ++m_data; return *this; }
Vc_ALWAYS_INLINE Vector &operator--() { --m_data; return *this; }
Vc_ALWAYS_INLINE Vector operator++(int) { return m_data++; }
Vc_ALWAYS_INLINE Vector operator--(int) { return m_data--; }
private:
friend reference;
Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
{
Vc_ASSERT(i == 0); if (i) {}
return o.m_data;
}
template <typename U>
Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
noexcept(std::declval<value_type &>() = v))
{
Vc_ASSERT(i == 0); if (i) {}
o.m_data = v;
}
public:
Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
{
static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
return {*this, int(index)};
}
Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
{
Vc_ASSERT(index == 0); if (index) {}
return m_data;
}
Vc_ALWAYS_INLINE Mask operator!() const
{
return Mask(!m_data);
}
Vc_ALWAYS_INLINE Vector operator~() const
{
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
static_assert(std::is_integral<T>::value, "bit-complement can only be used with Vectors of integral type");
#endif
return Vector(~m_data);
}
Vc_ALWAYS_INLINE Vector operator-() const
{
return -m_data;
}
Vc_INTRINSIC Vector Vc_PURE operator+() const { return *this; }
#define Vc_OP(symbol) \
Vc_ALWAYS_INLINE Vc_PURE Vector operator symbol(const Vector &x) const { return Vector(m_data symbol x.m_data); }
Vc_ALL_SHIFTS(Vc_OP);
#undef Vc_OP
Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
isNegative() const
{
return Vc::isnegative(*this);
}
Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &m) {
if (m.data()) m_data = v.m_data;
}
template <typename V2>
Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
staticCast() const
{
return V2(static_cast<typename V2::EntryType>(m_data));
}
Vc_ALWAYS_INLINE Common::WriteMaskedVector<Vector, Mask> operator()(Mask m)
{
return {*this, m};
}
Vc_ALWAYS_INLINE EntryType min() const { return m_data; }
Vc_ALWAYS_INLINE EntryType max() const { return m_data; }
Vc_ALWAYS_INLINE EntryType product() const { return m_data; }
Vc_ALWAYS_INLINE EntryType sum() const { return m_data; }
Vc_ALWAYS_INLINE Vector partialSum() const { return *this; }
Vc_ALWAYS_INLINE EntryType min(Mask) const { return m_data; }
Vc_ALWAYS_INLINE EntryType max(Mask) const { return m_data; }
Vc_ALWAYS_INLINE EntryType product(Mask m) const
{
if (m.data()) {
return m_data;
} else {
return EntryType(1);
}
}
Vc_ALWAYS_INLINE EntryType sum(Mask m) const { if (m.data()) return m_data; return static_cast<EntryType>(0); }
Vc_INTRINSIC Vector Vc_VDECL shifted(int amount, Vector shiftIn) const {
Vc_ASSERT(amount >= -1 && amount <= 1);
return amount == 0 ? *this : shiftIn;
}
Vc_INTRINSIC Vector shifted(int amount) const { return amount == 0 ? *this : Zero(); }
Vc_INTRINSIC Vector rotated(int) const { return *this; }
Vc_INTRINSIC Vector reversed() const { return *this; }
Vc_INTRINSIC Vector sorted() const { return *this; }
template <typename F> void callWithValuesSorted(F &&f) { f(m_data); }
template <typename F> Vc_INTRINSIC void call(F &&f) const { f(m_data); }
template <typename F> Vc_INTRINSIC void call(F &&f, Mask mask) const
{
if (mask.data()) {
f(m_data);
}
}
template <typename F> Vc_INTRINSIC Vector apply(F &&f) const { return Vector(f(m_data)); }
template <typename F> Vc_INTRINSIC Vector apply(F &&f, Mask mask) const
{
if (mask.data()) {
return Vector(f(m_data));
} else {
return *this;
}
}
template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
m_data = f(0);
}
Vc_INTRINSIC void fill(EntryType (&f)()) {
m_data = f();
}
template <typename G> static Vc_INTRINSIC Vector generate(G gen)
{
return gen(0);
}
Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector Vc_VDECL
copySign(Vector x) const
{
return Vc::copysign(*this, x);
}
Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
{
return Vc::exponent(*this);
}
Vc_INTRINSIC Vector Vc_VDECL interleaveLow(Vector) const { return *this; }
Vc_INTRINSIC Vector Vc_VDECL interleaveHigh(Vector x) const { return x; }
};
#undef Vc_CURRENT_CLASS_NAME
template <typename T> constexpr size_t Vector<T, VectorAbi::Scalar>::Size;
template <typename T> constexpr size_t Vector<T, VectorAbi::Scalar>::MemoryAlignment;
#define Vc_OP(symbol) \
template <typename T, typename U, \
typename = decltype(std::declval<T &>() symbol## = std::declval<T>())> \
Vc_INTRINSIC enable_if<std::is_convertible<U, Vector<T, VectorAbi::Scalar>>::value, \
Vector<T, VectorAbi::Scalar>> \
&operator symbol##=(Vector<T, VectorAbi::Scalar> &lhs, U &&rhs) \
{ \
lhs.data() symbol## = Vector<T, VectorAbi::Scalar>(std::forward<U>(rhs)).data(); \
return lhs; \
}
Vc_ALL_SHIFTS(Vc_OP);
#undef Vc_OP
#define Vc_CONDITIONAL_ASSIGN(name_,op_) \
template <Operator O, typename T, typename M, typename U> \
Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
Vector<T, VectorAbi::Scalar> &lhs, M &&mask, U &&rhs) \
{ \
if (mask.isFull()) { \
lhs op_ std::forward<U>(rhs); \
} \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN( Assign, =);
Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN
#define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
template <Operator O, typename T, typename M> \
Vc_INTRINSIC enable_if<O == Operator::name_, Vector<T, VectorAbi::Scalar>> \
conditional_assign(Vector<T, VectorAbi::Scalar> &lhs, M &&mask) \
{ \
return mask.isFull() ? (expr_) : lhs; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs);
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs);
#undef Vc_CONDITIONAL_ASSIGN
}
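// For the scalar ABI a mask is either completely full or completely empty, so
// the conditional_assign specializations above apply the requested operation
// only when mask.isFull() and otherwise leave the left-hand side untouched.
// These are the hooks the Vc::where() expressions below dispatch to.
// Illustrative sketch using the scalar types directly:
//
//   Vc::Scalar::float_v v(1.f);
//   Vc::Scalar::float_m m(false);
//   Vc::conditional_assign<Vc::Operator::PlusAssign>(v, m, Vc::Scalar::float_v(2.f));
//   // m is empty, so v still holds 1.f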
#include <cmath>
#ifndef VC_COMMON_CONST_DATA_H_
#define VC_COMMON_CONST_DATA_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
alignas(64) extern unsigned int RandomState[];
alignas(32) extern const unsigned int AllBitsSet[8];
}
}
#endif
#ifndef VC_COMMON_WHERE_H_
#define VC_COMMON_WHERE_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace WhereImpl
{
template<typename _Mask, typename _LValue> struct MaskedLValue
{
typedef _Mask Mask;
typedef _LValue LValue;
const Mask &mask;
LValue &lhs;
constexpr MaskedLValue(const Mask &m, LValue &l) : mask(m), lhs(l) {}
MaskedLValue(const MaskedLValue &) = delete;
#ifndef __cpp_guaranteed_copy_elision
constexpr MaskedLValue(MaskedLValue &&) = default;
#endif
template<typename T> Vc_ALWAYS_INLINE void operator =(T &&rhs) { conditional_assign<Operator:: Assign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator +=(T &&rhs) { conditional_assign<Operator:: PlusAssign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator -=(T &&rhs) { conditional_assign<Operator:: MinusAssign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator *=(T &&rhs) { conditional_assign<Operator:: MultiplyAssign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator /=(T &&rhs) { conditional_assign<Operator:: DivideAssign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator %=(T &&rhs) { conditional_assign<Operator:: RemainderAssign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator ^=(T &&rhs) { conditional_assign<Operator:: XorAssign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator &=(T &&rhs) { conditional_assign<Operator:: AndAssign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator |=(T &&rhs) { conditional_assign<Operator:: OrAssign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator<<=(T &&rhs) { conditional_assign<Operator:: LeftShiftAssign>(lhs, mask, std::forward<T>(rhs)); }
template<typename T> Vc_ALWAYS_INLINE void operator>>=(T &&rhs) { conditional_assign<Operator::RightShiftAssign>(lhs, mask, std::forward<T>(rhs)); }
Vc_ALWAYS_INLINE void operator++() { conditional_assign<Operator:: PreIncrement>(lhs, mask); }
Vc_ALWAYS_INLINE void operator++(int) { conditional_assign<Operator::PostIncrement>(lhs, mask); }
Vc_ALWAYS_INLINE void operator--() { conditional_assign<Operator:: PreDecrement>(lhs, mask); }
Vc_ALWAYS_INLINE void operator--(int) { conditional_assign<Operator::PostDecrement>(lhs, mask); }
template <class T, class IV, class S>
Vc_INTRINSIC void operator=(Common::SubscriptOperation<T, IV, S, true> &&rhs)
{
lhs.gather(std::move(rhs).gatherArguments(), mask);
}
template <class T, class IV, class S>
void operator+=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
template <class T, class IV, class S>
void operator-=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
template <class T, class IV, class S>
void operator*=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
template <class T, class IV, class S>
void operator/=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
template <class T, class IV, class S>
void operator%=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
template <class T, class IV, class S>
void operator^=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
template <class T, class IV, class S>
void operator&=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
template <class T, class IV, class S>
void operator|=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
template <class T, class IV, class S>
void operator<<=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
template <class T, class IV, class S>
void operator>>=(Common::SubscriptOperation<T, IV, S, true> &&rhs) = delete;
};
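// Specialization for a subscript expression on the left-hand side: assignment becomes
// a masked scatter of the right-hand side vector into the subscripted memory.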
template <typename _Mask, typename T_, typename I_, typename S_>
struct MaskedLValue<_Mask, Common::SubscriptOperation<T_, I_, S_, true>>
{
typedef _Mask Mask;
typedef Common::SubscriptOperation<T_, I_, S_, true> SO;
const Mask &mask;
SO &lhs;
template <typename T> using Decay = typename std::decay<T>::type;
constexpr MaskedLValue(const Mask &m, SO &&l) : mask(m), lhs(l) {}
MaskedLValue(const MaskedLValue &) = delete;
#ifndef __cpp_guaranteed_copy_elision
constexpr MaskedLValue(MaskedLValue &&) = default;
#endif
template <class T> Vc_ALWAYS_INLINE void operator=(T &&rhs) &&
{
std::forward<T>(rhs).scatter(std::move(lhs).scatterArguments(), mask);
}
};
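// Specialization for a plain bool mask (scalar code paths): the "masking" degrades to
// an ordinary if.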
template<typename _LValue> struct MaskedLValue<bool, _LValue>
{
typedef bool Mask;
typedef _LValue LValue;
const Mask &mask;
LValue &lhs;
constexpr MaskedLValue(const Mask &m, LValue &l) : mask(m), lhs(l) {}
MaskedLValue(const MaskedLValue &) = delete;
constexpr MaskedLValue(MaskedLValue &&) = default;
template<typename T> Vc_ALWAYS_INLINE void operator =(T &&rhs) { if (mask) lhs = std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator +=(T &&rhs) { if (mask) lhs += std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator -=(T &&rhs) { if (mask) lhs -= std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator *=(T &&rhs) { if (mask) lhs *= std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator /=(T &&rhs) { if (mask) lhs /= std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator %=(T &&rhs) { if (mask) lhs %= std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator ^=(T &&rhs) { if (mask) lhs ^= std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator &=(T &&rhs) { if (mask) lhs &= std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator |=(T &&rhs) { if (mask) lhs |= std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator<<=(T &&rhs) { if (mask) lhs <<= std::forward<T>(rhs); }
template<typename T> Vc_ALWAYS_INLINE void operator>>=(T &&rhs) { if (mask) lhs >>= std::forward<T>(rhs); }
Vc_ALWAYS_INLINE void operator++() { if (mask) ++lhs; }
Vc_ALWAYS_INLINE void operator++(int) { if (mask) lhs++; }
Vc_ALWAYS_INLINE void operator--() { if (mask) --lhs; }
Vc_ALWAYS_INLINE void operator--(int) { if (mask) lhs--; }
};
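// WhereMask is what where(mask) returns; operator| and operator() attach the masked
// lvalue. The static_assert below catches the precedence mistake of passing an rvalue
// such as x + 1 instead of the lvalue x.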
template<typename _Mask> struct WhereMask
{
typedef _Mask Mask;
const Mask &mask;
constexpr WhereMask(const Mask &m) : mask(m) {}
WhereMask(const WhereMask &) = delete;
template <typename T, typename I, typename S>
constexpr Vc_WARN_UNUSED_RESULT
MaskedLValue<Mask, Common::SubscriptOperation<T, I, S, true>>
operator|(Common::SubscriptOperation<T, I, S, true> &&lhs) const
{
static_assert(!std::is_const<T>::value,
"masked scatter to constant memory not possible.");
return {mask, std::move(lhs)};
}
template<typename T> constexpr Vc_WARN_UNUSED_RESULT MaskedLValue<Mask, T> operator|(T &&lhs) const
{
static_assert(std::is_lvalue_reference<T>::value, "Syntax error: Incorrect use of Vc::where. Maybe operator precedence got you by surprise. Examples of correct usage:\n"
" Vc::where(x < 2) | x += 1;\n"
" (Vc::where(x < 2) | x)++;\n"
" Vc::where(x < 2)(x) += 1;\n"
" Vc::where(x < 2)(x)++;\n"
);
return { mask, lhs };
}
template <class T,
class = decltype(std::declval<T>() = std::declval<const T &>())>
constexpr Vc_WARN_UNUSED_RESULT MaskedLValue<Mask, T> operator()(T &&lhs) const
{
return operator|(std::forward<T>(lhs));
}
};
}
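// where(mask) is the entry point for write-masked assignment. Illustrative usage
// (float_v as defined by this header):
//   Vc::float_v x = ...;
//   Vc::where(x < 2) | x += 1;  // increment only the lanes where x < 2
//   Vc::where(x < 2)(x) += 1;   // equivalent function-call syntax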
template<typename M> constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask<M> where(const M &mask)
{
return { mask };
}
template <class M, class V>
constexpr Vc_WARN_UNUSED_RESULT WhereImpl::MaskedLValue<M, V> where(const M &mask,
V &value)
{
return {mask, value};
}
template <class M, class T, class IT, class Scale>
constexpr Vc_WARN_UNUSED_RESULT
WhereImpl::MaskedLValue<M, Common::SubscriptOperation<T, IT, Scale, true>>
where(const M &mask, Common::SubscriptOperation<T, IT, Scale, true> &&value)
{
return {mask, std::move(value)};
}
template<typename M> constexpr Vc_WARN_UNUSED_RESULT WhereImpl::WhereMask<M> _if(const M &m)
{
return { m };
}
}
#endif
#ifndef VC_COMMON_TRANSPOSE_H_
#define VC_COMMON_TRANSPOSE_H_
#include <tuple>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename... Inputs> struct TransposeProxy
{
TransposeProxy(const Inputs &... inputs) : in{inputs...} {}
std::tuple<const Inputs &...> in;
};
template <int LhsLength, size_t RhsLength> struct TransposeTag {
};
}
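// transpose() only captures const references to its arguments in a TransposeProxy; the
// actual work happens in the transpose_impl() overloads (such as the trivial scalar
// 1x1 case further down) that receive the proxy together with the output pointers.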
template <typename... Vs> Common::TransposeProxy<Vs...> transpose(Vs... vs)
{
return {vs...};
}
}
#endif
#ifndef VC_SCALAR_OPERATORS_H_
#define VC_SCALAR_OPERATORS_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
#define Vc_OP(op_) \
template <typename T> \
Vc_INTRINSIC Scalar::Mask<T> operator op_(Scalar::Vector<T> a, Scalar::Vector<T> b) \
{ \
return Scalar::Mask<T>(a.data() op_ b.data()); \
}
Vc_ALL_COMPARES(Vc_OP);
#undef Vc_OP
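// The next macro instantiates the bitwise operators (via Vc_ALL_BINARY). For integral
// element types they apply directly; for floating-point element types the operand bits
// are reinterpreted as an unsigned integer of the same size (through the MayAlias
// type), because C++ has no built-in bitwise operators for float and double.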
#define Vc_OP(symbol) \
template <typename T> \
Vc_INTRINSIC enable_if<std::is_integral<T>::value, Scalar::Vector<T>> \
operator symbol(Scalar::Vector<T> a, Scalar::Vector<T> b) \
{ \
return a.data() symbol b.data(); \
} \
template <typename T> \
Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, Scalar::Vector<T>> \
operator symbol(Scalar::Vector<T> &lhs, Scalar::Vector<T> rhs) \
{ \
using uinta = \
MayAlias<typename std::conditional<sizeof(T) == sizeof(int), unsigned int, \
unsigned long long>::type>; \
uinta *left = reinterpret_cast<uinta *>(&lhs.data()); \
const uinta *right = reinterpret_cast<const uinta *>(&rhs.data()); \
*left symbol## = *right; \
return lhs; \
}
Vc_ALL_BINARY(Vc_OP);
#undef Vc_OP
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator+(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
return a.data() + b.data();
}
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator-(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
return a.data() - b.data();
}
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator*(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
return a.data() * b.data();
}
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator/(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
return a.data() / b.data();
}
template <typename T>
Vc_INTRINSIC Scalar::Vector<T> operator%(Scalar::Vector<T> a, Scalar::Vector<T> b)
{
return a.data() % b.data();
}
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerZero)
: m_data(0)
{
}
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerOne)
: m_data(1)
{
}
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Scalar>::Vector(VectorSpecialInitializerIndexesFromZero)
: m_data(0)
{
}
template <typename T>
template <typename U, typename Flags>
Vc_INTRINSIC typename Vector<T, VectorAbi::Scalar>::
#ifndef Vc_MSVC
template
#endif
load_concept<U, Flags>::type Vector<T, VectorAbi::Scalar>::load(const U *mem, Flags)
{
m_data = mem[0];
}
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::store(U *mem, Flags) const
{
mem[0] = m_data;
}
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::store(U *mem, Mask mask, Flags) const
{
if (mask.data())
mem[0] = m_data;
}
template <typename T>
template <class MT, class IT, int Scale>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args)
{
m_data = args.address[Scale * args.indexes[0]];
}
template <typename T>
template <class MT, class IT, int Scale>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
if (mask.data()) {
m_data = args.address[Scale * args.indexes[0]];
}
}
template <typename T>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::scatterImplementation(MT *mem,
IT &&indexes)
const
{
mem[indexes[0]] = m_data;
}
template <typename T>
template <typename MT, typename IT>
Vc_ALWAYS_INLINE void Vector<T, VectorAbi::Scalar>::scatterImplementation(
MT *mem, IT &&indexes, MaskArgument mask) const
{
if (mask.data()) {
mem[indexes[0]] = m_data;
}
}
Vc_INTRINSIC Vc_CONST Scalar::float_v exponent(Scalar::float_v x)
{
Vc_ASSERT(x.data() >= 0.f);
union { float f; int i; } value;
value.f = x.data();
return Scalar::float_v(static_cast<float>((value.i >> 23) - 0x7f));
}
Vc_INTRINSIC Vc_CONST Scalar::double_v Vc_VDECL exponent(Scalar::double_v x)
{
Vc_ASSERT(x.data() >= 0.);
union { double f; long long i; } value;
value.f = x.data();
return Scalar::double_v(static_cast<double>((value.i >> 52) - 0x3ff));
}
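// One step of the generator behind Vector<T>::Random(): two linear congruential
// sequences (multiplier 0xdeece66d, increment 11) stored in Common::RandomState; the
// first stream is additionally xored with the second stream shifted right by 16.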
static Vc_ALWAYS_INLINE void _doRandomStep(Scalar::uint_v &state0, Scalar::uint_v &state1)
{
using Scalar::uint_v;
state0.load(&Common::RandomState[0]);
state1.load(&Common::RandomState[uint_v::Size]);
Detail::operator+(Detail::operator*(state1, uint_v(0xdeece66du)),
uint_v(11))
.store(&Common::RandomState[uint_v::Size]);
uint_v(Detail::operator+(Detail::operator*(state0, uint_v(0xdeece66du)), uint_v(11))
.data() ^
(state1.data() >> 16))
.store(&Common::RandomState[0]);
}
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Scalar> Vector<T, VectorAbi::Scalar>::Random()
{
Scalar::uint_v state0, state1;
_doRandomStep(state0, state1);
return Vector<T, VectorAbi::Scalar>(static_cast<EntryType>(state0.data()));
}
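// The float and double specializations below produce a uniformly distributed value in
// [0, 1): random bits fill the mantissa, the exponent is forced to that of 1.0 (giving
// a value in [1, 2)), and 1 is subtracted.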
template<> Vc_INTRINSIC Scalar::float_v Scalar::float_v::Random()
{
Scalar::uint_v state0, state1;
_doRandomStep(state0, state1);
union { unsigned int i; float f; } x;
x.i = (state0.data() & 0x0fffffffu) | 0x3f800000u;
return Scalar::float_v(x.f - 1.f);
}
template<> Vc_INTRINSIC Scalar::double_v Scalar::double_v::Random()
{
typedef unsigned long long uint64 Vc_MAY_ALIAS;
uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
state0 = (state0 * 0x5deece66dull + 11) & 0x000fffffffffffffull;
*reinterpret_cast<uint64 *>(&Common::RandomState[8]) = state0;
union { unsigned long long i; double f; } x;
x.i = state0 | 0x3ff0000000000000ull;
return Scalar::double_v(x.f - 1.);
}
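// isnegative inspects the IEEE sign bit directly, so unlike (x < 0) it also reports
// -0. (and NaNs with the sign bit set) as negative.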
Vc_INTRINSIC Vc_CONST Scalar::float_m isnegative(Scalar::float_v x)
{
static_assert(sizeof(float) == sizeof(unsigned int),
"This code assumes float and unsigned int have the same number of "
"Bytes. Please file a bug report if this is a problem.");
union { float f; unsigned int i; } u;
u.f = x.data();
return Scalar::float_m(0u != (u.i & 0x80000000u));
}
Vc_INTRINSIC Vc_CONST Scalar::double_m Vc_VDECL isnegative(Scalar::double_v x)
{
static_assert(sizeof(double) == sizeof(unsigned long long),
"This code assumes double and unsigned long long have the same number "
"of Bytes. Please file a bug report if this is a problem.");
union { double d; unsigned long long l; } u;
u.d = x.data();
return Scalar::double_m(0ull != (u.l & 0x8000000000000000ull));
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::setQnan()
{
union { float f; unsigned int i; } u;
u.i = 0xffffffffu;
m_data = u.f;
}
template<> Vc_INTRINSIC void Scalar::double_v::setQnan()
{
union { double d; unsigned long long l; } u;
u.l = 0xffffffffffffffffull;
m_data = u.d;
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Scalar>::setQnan(Mask m)
{
if (m.data()) {
setQnan();
}
}
template<> Vc_INTRINSIC void Scalar::double_v::setQnan(Scalar::double_v::Mask m)
{
if (m.data()) {
setQnan();
}
}
namespace Common
{
Vc_ALWAYS_INLINE void transpose_impl(TransposeTag<1, 1>, Scalar::float_v *Vc_RESTRICT r[],
const TransposeProxy<Scalar::float_v> &proxy)
{
*r[0] = std::get<0>(proxy.in).data();
}
}
}
#ifndef VC_SCALAR_SIMD_CAST_H_
#define VC_SCALAR_SIMD_CAST_H_
#ifndef VC_COMMON_SIMD_CAST_H_
#define VC_COMMON_SIMD_CAST_H_
#include <type_traits>
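// Dummy declaration in the global namespace: it lets an unqualified call such as
// simd_cast<To>(x) find a candidate by ordinary lookup; the real overloads (defined in
// namespace Vc below) are then selected via argument-dependent lookup.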
template <class> void simd_cast();
namespace Vc_VERSIONED_NAMESPACE
{
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From &&x, enable_if<std::is_same<To, Traits::decay<From>>::value> = nullarg)
{
return std::forward<From>(x);
}
template <typename To> Vc_INTRINSIC Vc_CONST To simd_cast() { return To(); }
}
#endif
#ifndef VC_SCALAR_TYPE_TRAITS_H_
#define VC_SCALAR_TYPE_TRAITS_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Scalar
{
namespace Traits
{
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
}
// expose the traits as Scalar::is_vector / Scalar::is_mask for the simd_cast overloads below
using Traits::is_vector;
using Traits::is_mask;
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(Scalar::Vector<From> x, enable_if<Scalar::is_vector<To>::value> = nullarg)
{
return static_cast<To>(x.data());
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(Scalar::Mask<From> x, enable_if<Scalar::is_mask<To>::value> = nullarg)
{
return static_cast<To>(x.data());
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
T &&x,
enable_if<Traits::is_simd_vector<T>::value && Scalar::is_vector<Return>::value> = nullarg)
{
return Return(x[offset]);
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<offset == 0 && Traits::is_simd_vector<Return>::value &&
!Scalar::is_vector<Return>::value,
Return>
simd_cast(Scalar::Vector<T> x)
{
Return r{};
r[0] = static_cast<typename Return::EntryType>(x.data());
return r;
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
T &&x,
enable_if<Traits::is_simd_mask<T>::value && Scalar::is_mask<Return>::value> = nullarg)
{
return Return(bool(x[offset]));
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<
offset == 0 && Traits::is_simd_mask<Return>::value && !Scalar::is_mask<Return>::value,
Return>
simd_cast(Scalar::Mask<T> x)
{
Return r(false);
r[0] = x[0];
return r;
}
}
#endif
#endif
#if defined(Vc_IMPL_SSE)
#ifndef VC_SSE_VECTOR_H_
#define VC_SSE_VECTOR_H_
#ifndef VC_SSE_INTRINSICS_H_
#define VC_SSE_INTRINSICS_H_
#ifdef Vc_MSVC
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#ifndef VC_COMMON_STORAGE_H_
#define VC_COMMON_STORAGE_H_
#ifndef VC_COMMON_ALIASINGENTRYHELPER_H_
#define VC_COMMON_ALIASINGENTRYHELPER_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
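// AliasingEntryHelper is the lvalue proxy handed out for single-element access into
// SIMD storage. On ICC it stores a storage pointer plus index and routes every write
// through StorageType::assign(); on other compilers it holds a may_alias reference
// directly into the storage.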
template<class StorageType> class AliasingEntryHelper
{
private:
typedef typename StorageType::EntryType T;
#ifdef Vc_ICC
StorageType *const m_storage;
const int m_index;
public:
Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {}
Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default;
Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default;
Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
m_storage->assign(m_index, rhs);
return *this;
}
Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_storage->assign(m_index, x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; }
#define m_data m_storage->read(m_index)
#else
typedef T A Vc_MAY_ALIAS;
A &m_data;
public:
template<typename T2>
Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast<A &>(d)) {}
Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {}
Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
m_data = rhs.m_data;
return *this;
}
Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data = x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; }
#endif
Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; }
Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast<T>(m_data) == x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast<T>(m_data) != x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast<T>(m_data) <= x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast<T>(m_data) >= x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast<T>(m_data) < x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast<T>(m_data) > x; }
Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast<T>(m_data); }
Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast<T>(m_data); }
Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast<T>(m_data) + x; }
Vc_ALWAYS_INLINE Vc_PURE T operator-(T x) const { return static_cast<T>(m_data) - x; }
Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast<T>(m_data) / x; }
Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast<T>(m_data) * x; }
Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast<T>(m_data) | x; }
Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast<T>(m_data) & x; }
Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast<T>(m_data) ^ x; }
Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast<T>(m_data) % x; }
#ifdef m_data
#undef m_data
#endif
};
}
}
#endif
#ifndef VC_COMMON_MASKENTRY_H_
#define VC_COMMON_MASKENTRY_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
namespace
{
template<size_t Bytes> struct MaskBoolStorage;
template<> struct MaskBoolStorage<1> { typedef std::int8_t type; };
template<> struct MaskBoolStorage<2> { typedef std::int16_t type; };
template<> struct MaskBoolStorage<4> { typedef std::int32_t type; };
template<> struct MaskBoolStorage<8> { typedef std::int64_t type; };
}
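// MaskBool stores a boolean as an all-ones / all-zero integer of the requested byte
// width, i.e. the per-element representation SIMD compare instructions produce.
// Conversion back to bool tests the lowest bit; conversion to another fundamental type
// of the same size reinterprets the bits.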
template<size_t Bytes> class MaskBool
{
typedef typename MaskBoolStorage<Bytes>::type storage_type Vc_MAY_ALIAS;
storage_type data;
public:
constexpr MaskBool(bool x) noexcept : data(x ? -1 : 0) {}
Vc_ALWAYS_INLINE MaskBool &operator=(bool x) noexcept { data = x ? -1 : 0; return *this; }
template <typename T, typename = enable_if<(!std::is_same<T, bool>::value &&
std::is_fundamental<T>::value)>>
Vc_ALWAYS_INLINE MaskBool &operator=(T x) noexcept
{
data = reinterpret_cast<const storage_type &>(x);
return *this;
}
Vc_ALWAYS_INLINE MaskBool(const MaskBool &) noexcept = default;
Vc_ALWAYS_INLINE MaskBool &operator=(const MaskBool &) noexcept = default;
template <typename T, typename = enable_if<(std::is_same<T, bool>::value ||
(std::is_fundamental<T>::value &&
sizeof(storage_type) == sizeof(T)))>>
constexpr operator T() const noexcept
{
return std::is_same<T, bool>::value ? T((data & 1) != 0) : aliasing_cast<T>(data);
}
} Vc_MAY_ALIAS;
template <typename A,
typename B,
typename std::enable_if<
std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
int>::type = 0>
constexpr bool operator==(A &&a, B &&b)
{
return static_cast<bool>(a) == static_cast<bool>(b);
}
template <typename A,
typename B,
typename std::enable_if<
std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
int>::type = 0>
constexpr bool operator!=(A &&a, B &&b)
{
return static_cast<bool>(a) != static_cast<bool>(b);
}
}
}
#endif
#ifdef Vc_IMPL_AVX
#ifndef VC_AVX_INTRINSICS_H_
#define VC_AVX_INTRINSICS_H_
extern "C" {
#include <immintrin.h>
#if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC)
#include <x86intrin.h>
#endif
}
#ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H_
#define VC_COMMON_FIX_CLANG_EMMINTRIN_H_
#if (defined Vc_CLANG && Vc_CLANG < 0x30700) || (defined Vc_APPLECLANG && Vc_APPLECLANG < 0x70000)
#ifdef _mm_slli_si128
#undef _mm_slli_si128
#define _mm_slli_si128(a,count) __extension__ ({ \
(__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); })
#endif
#ifdef _mm_srli_si128
#undef _mm_srli_si128
#define _mm_srli_si128(a,count) __extension__ ({ \
(__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); })
#endif
#ifdef _mm_shuffle_epi32
#undef _mm_shuffle_epi32
#define _mm_shuffle_epi32(a,imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \
(imm) & 0x3, ((imm) & 0xc) >> 2, \
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
#endif
#ifdef _mm_shufflelo_epi16
#undef _mm_shufflelo_epi16
#define _mm_shufflelo_epi16(a,imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
(imm) & 0x3, ((imm) & 0xc) >> 2, \
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
4, 5, 6, 7); })
#endif
#ifdef _mm_shufflehi_epi16
#undef _mm_shufflehi_epi16
#define _mm_shufflehi_epi16(a,imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
0, 1, 2, 3, \
4 + (((imm) & 0x03) >> 0), \
4 + (((imm) & 0x0c) >> 2), \
4 + (((imm) & 0x30) >> 4), \
4 + (((imm) & 0xc0) >> 6)); })
#endif
#ifdef _mm_shuffle_pd
#undef _mm_shuffle_pd
#define _mm_shuffle_pd(a,b,i) __extension__ ({ \
__builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); })
#endif
#endif
#endif
#ifndef VC_AVX_CONST_DATA_H_
#define VC_AVX_CONST_DATA_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
alignas(64) extern const unsigned int _IndexesFromZero32[ 8];
alignas(16) extern const unsigned short _IndexesFromZero16[16];
alignas(16) extern const unsigned char _IndexesFromZero8 [32];
struct alignas(64) c_general
{
static const float oneFloat;
static const unsigned int absMaskFloat[2];
static const unsigned int signMaskFloat[2];
static const unsigned int highMaskFloat;
static const unsigned short minShort[2];
static const unsigned short one16[2];
static const float _2power31;
static const double oneDouble;
static const unsigned long long frexpMask;
static const unsigned long long highMaskDouble;
};
template<typename T> struct c_trig
{
alignas(64) static const T data[];
};
#ifndef Vc_MSVC
template <> alignas(64) const float c_trig<float>::data[];
template <> alignas(64) const double c_trig<double>::data[];
#endif
template<typename T> struct c_log
{
typedef float floatAlias Vc_MAY_ALIAS;
static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast<const floatAlias *>(&data[i]); }
alignas(64) static const unsigned int data[21];
};
#ifndef Vc_MSVC
template<> alignas(64) const unsigned int c_log<float>::data[21];
#endif
template<> struct c_log<double>
{
enum VectorSize { Size = 16 / sizeof(double) };
typedef double doubleAlias Vc_MAY_ALIAS;
static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast<const doubleAlias *>(&data[i]); }
alignas(64) static const unsigned long long data[21];
};
}
}
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX2
{
using AVX::_IndexesFromZero8;
using AVX::_IndexesFromZero16;
using AVX::_IndexesFromZero32;
using AVX::c_general;
using AVX::c_trig;
using AVX::c_log;
}
}
#endif
#include <cstdlib>
#if (defined Vc_CLANG && Vc_CLANG >= 0x30900 && Vc_CLANG < 0x70000)
#ifdef _mm256_permute2f128_si256
#undef _mm256_permute2f128_si256
#define _mm256_permute2f128_si256(V1,V2,M) __extension__ ({ \
(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (char)(M)); })
#endif
#ifdef _mm256_permute2f128_ps
#undef _mm256_permute2f128_ps
#define _mm256_permute2f128_ps(V1,V2,M) __extension__ ({ \
(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (char)(M)); })
#endif
#ifdef _mm256_permute2x128_si256
#undef _mm256_permute2x128_si256
#define _mm256_permute2x128_si256(V1,V2,M) __extension__ ({ \
(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); })
#endif
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace AvxIntrinsics
{
using AVX::c_general;
using AVX::_IndexesFromZero32;
using AVX::_IndexesFromZero16;
using AVX::_IndexesFromZero8;
typedef __m128 m128 ;
typedef __m128d m128d;
typedef __m128i m128i;
typedef __m256 m256 ;
typedef __m256d m256d;
typedef __m256i m256i;
#ifdef Vc_GCC
static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) * static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) + static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) - static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); }
#endif
static Vc_INTRINSIC m256d Vc_CONST set1_pd (double a) { return _mm256_set1_pd (a); }
static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a) { return _mm256_set1_epi32(a); }
static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet))); }
static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m256 setallone_ps() { return _mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi8 () { return _mm256_set1_epi8(1); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu8 () { return setone_epi8(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::one16))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu16() { return setone_epi16(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu32() { return setone_epi32(); }
static Vc_INTRINSIC m256 Vc_CONST setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); }
static Vc_INTRINSIC m256d Vc_CONST setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); }
static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::absMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::absMaskFloat[1])); }
static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::signMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1])); }
static Vc_INTRINSIC m256 Vc_CONST set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m128 Vc_CONST _mm_set2power31_ps() { return _mm_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
template <int i>
static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x)
{
return _mm_extract_epi32(x, i);
}
template <int offset> Vc_INTRINSIC __m256 insert128(__m256 a, __m128 b) { return _mm256_insertf128_ps(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) {
#ifdef Vc_IMPL_AVX2
return _mm256_inserti128_si256(a, b, offset);
#else
return _mm256_insertf128_si256(a, b, offset);
#endif
}
template <int offset> Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); }
template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
#ifdef Vc_IMPL_AVX2
return _mm256_extracti128_si256(a, offset);
#else
return _mm256_extractf128_si256(a, offset);
#endif
}
#ifdef Vc_GCC
Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a == b); }
Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a != b); }
Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a < b); }
Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a >= b); }
Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a <= b); }
Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a > b); }
Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a == b); }
Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a != b); }
Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a < b); }
Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a >= b); }
Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a <= b); }
Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a > b); }
#else
Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
#endif
Vc_INTRINSIC __m256d cmpnlt_pd (__m256d a, __m256d b) { return cmpge_pd(a, b); }
Vc_INTRINSIC __m256d cmpnle_pd (__m256d a, __m256d b) { return cmpgt_pd(a, b); }
Vc_INTRINSIC __m256 cmpnlt_ps (__m256 a, __m256 b) { return cmpge_ps(a, b); }
Vc_INTRINSIC __m256 cmpnle_ps (__m256 a, __m256 b) { return cmpgt_ps(a, b); }
Vc_INTRINSIC __m256d cmpord_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); }
Vc_INTRINSIC __m256d cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); }
Vc_INTRINSIC __m256 cmpord_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); }
Vc_INTRINSIC __m256 cmpunord_ps(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); }
#if defined(Vc_IMPL_XOP)
static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
return _mm_comlt_epu16(a, b);
}
static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
return _mm_comgt_epu16(a, b);
}
#else
static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
}
static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
}
#endif
#ifdef Vc_IMPL_AVX2
template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
{
return _mm256_alignr_epi8(s1, s2, shift);
}
#else
template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
{
return insert128<1>(
_mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1),
_mm256_castsi256_si128(s2), shift)),
_mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
}
#endif
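// The Vc_AVX_TO_SSE_* macros provide 256-bit integer operations: with AVX2 they
// forward to the native _mm256_* intrinsic; without AVX2 they split the operands into
// 128-bit halves, apply the SSE intrinsic twice, and recombine the results with
// insert128<1>.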
#ifdef Vc_IMPL_AVX2
#define Vc_AVX_TO_SSE_2_NEW(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
{ \
return _mm256_##name(a0, b0); \
}
#define Vc_AVX_TO_SSE_256_128(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
{ \
return _mm256_##name(a0, b0); \
}
#define Vc_AVX_TO_SSE_1i(name) \
template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
{ \
return _mm256_##name(a0, i); \
}
#define Vc_AVX_TO_SSE_1(name) \
Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); }
#define Vc_AVX_TO_SSE_1_128(name,shift__) \
Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); }
#else
#define Vc_AVX_TO_SSE_1(name) \
Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \
{ \
__m128i a1 = extract128<1>(a0); \
__m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \
__m128i r1 = _mm_##name(a1); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_1_128(name,shift__) \
Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) \
{ \
__m128i r0 = _mm_##name(a0); \
__m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__)); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_2_NEW(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i b1 = extract128<1>(b0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
m128i r1 = _mm_##name(a1, b1); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_256_128(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \
m128i r1 = _mm_##name(a1, b0); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_1i(name) \
template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \
m128i r1 = _mm_##name(a1, i); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#endif
Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); }
Vc_AVX_TO_SSE_1i(slli_epi16)
Vc_AVX_TO_SSE_1i(slli_epi32)
Vc_AVX_TO_SSE_1i(slli_epi64)
Vc_AVX_TO_SSE_1i(srai_epi16)
Vc_AVX_TO_SSE_1i(srai_epi32)
Vc_AVX_TO_SSE_1i(srli_epi16)
Vc_AVX_TO_SSE_1i(srli_epi32)
Vc_AVX_TO_SSE_1i(srli_epi64)
Vc_AVX_TO_SSE_256_128(sll_epi16)
Vc_AVX_TO_SSE_256_128(sll_epi32)
Vc_AVX_TO_SSE_256_128(sll_epi64)
Vc_AVX_TO_SSE_256_128(srl_epi16)
Vc_AVX_TO_SSE_256_128(srl_epi32)
Vc_AVX_TO_SSE_256_128(srl_epi64)
Vc_AVX_TO_SSE_256_128(sra_epi16)
Vc_AVX_TO_SSE_256_128(sra_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi32)
Vc_AVX_TO_SSE_2_NEW(add_epi64)
Vc_AVX_TO_SSE_2_NEW(sub_epi16)
Vc_AVX_TO_SSE_2_NEW(sub_epi32)
Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi32)
Vc_AVX_TO_SSE_2_NEW(min_epi8)
Vc_AVX_TO_SSE_2_NEW(max_epi8)
Vc_AVX_TO_SSE_2_NEW(min_epu16)
Vc_AVX_TO_SSE_2_NEW(max_epu16)
Vc_AVX_TO_SSE_2_NEW(min_epi32)
Vc_AVX_TO_SSE_2_NEW(max_epi32)
Vc_AVX_TO_SSE_2_NEW(min_epu32)
Vc_AVX_TO_SSE_2_NEW(max_epu32)
Vc_AVX_TO_SSE_2_NEW(mullo_epi32)
Vc_AVX_TO_SSE_1(abs_epi8)
Vc_AVX_TO_SSE_1(abs_epi16)
Vc_AVX_TO_SSE_1(abs_epi32)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2)
Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8)
Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4)
Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2)
Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8)
Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4)
Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8)
#ifndef Vc_IMPL_AVX2
static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
m128i a1 = extract128<1>(a0);
return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
}
template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0)
{
m128i a1 = extract128<1>(a0);
m128i b1 = extract128<1>(b0);
m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
m128i r1 = _mm_blend_epi16(a1, b1, m >> 8);
return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) {
m128i a1 = extract128<1>(a0);
m128i b1 = extract128<1>(b0);
m128i m1 = extract128<1>(m0);
m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
m128i r1 = _mm_blendv_epi8(a1, b1, m1);
return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
#else
static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); }
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0)
{
return _mm256_blendv_epi8(a0, b0, m0);
}
Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
return _mm256_movemask_epi8(a0);
}
#endif
static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) {
return cmpgt_epi64(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) {
return cmpgt_epi32(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) {
return cmpgt_epi16(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) {
return cmpgt_epi8(b, a);
}
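// SSE/AVX provide no unsigned integer compares (outside of XOP), so the unsigned
// variants below bias both operands by the smallest signed value (xor with setmin_*)
// and then use the signed compare.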
static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) {
return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8()));
}
#if defined(Vc_IMPL_XOP)
Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
Vc_AVX_TO_SSE_2_NEW(comlt_epu16)
Vc_AVX_TO_SSE_2_NEW(comgt_epu16)
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); }
#else
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
return cmplt_epi32(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
return cmpgt_epi32(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
return cmplt_epi16(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
return cmpgt_epi16(a, b);
}
#endif
static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) {
_mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) {
_mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) {
#ifdef Vc_IMPL_AVX2
_mm256_maskstore_epi32(mem, mask, v);
#else
_mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
#endif
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) {
_mm256_maskstore(reinterpret_cast<int *>(mem), mask, v);
}
static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) {
using namespace AVX;
_mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast<char *>(&mem[0]));
_mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast<char *>(&mem[8]));
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) {
_mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
}
#undef Vc_AVX_TO_SSE_1
#undef Vc_AVX_TO_SSE_1_128
#undef Vc_AVX_TO_SSE_2_NEW
#undef Vc_AVX_TO_SSE_256_128
#undef Vc_AVX_TO_SSE_1i
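// stream_load wraps the non-temporal MOVNTDQA load (_mm_stream_load_si128, SSE4.1);
// the address must be 16-byte aligned, and the 256-bit variants are assembled from two
// 128-bit loads.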
template<typename R> Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128 stream_load<m128>(const float *mem)
{
return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
}
template<> Vc_INTRINSIC m256 stream_load<m256>(const float *mem)
{
return insert128<1>(_mm256_castps128_ps256(stream_load<m128>(mem)),
stream_load<m128>(mem + 4));
}
template<typename R> Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128d stream_load<m128d>(const double *mem)
{
return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
}
template<> Vc_INTRINSIC m256d stream_load<m256d>(const double *mem)
{
return insert128<1>(_mm256_castpd128_pd256(stream_load<m128d>(mem)),
stream_load<m128d>(mem + 2));
}
template<typename R> Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128i stream_load<m128i>(const void *mem)
{
return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<void *>(mem)));
}
template<> Vc_INTRINSIC m256i stream_load<m256i>(const void *mem)
{
return insert128<1>(_mm256_castsi128_si256(stream_load<m128i>(mem)),
stream_load<m128i>(static_cast<const __m128i *>(mem) + 1));
}
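// stream_store is a masked, cache-bypassing store built on MASKMOVDQU
// (_mm_maskmoveu_si128): only bytes whose corresponding mask byte has its most
// significant bit set are written.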
Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
{
_mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
{
stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask));
stream_store(mem + 4, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
{
_mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
{
stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask));
stream_store(mem + 2, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
{
_mm_maskmoveu_si128(value, mask, reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
{
stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask));
stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask));
}
#ifndef __x86_64__
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
}
#endif
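// AVX2 gathers: Scale is the byte scale applied to each 32-bit index. The masked
// overloads load only the lanes whose mask element has its sign bit set; all other
// lanes keep the corresponding value from src.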
#ifdef Vc_IMPL_AVX2
template <int Scale> __m256 gather(const float *addr, __m256i idx)
{
return _mm256_i32gather_ps(addr, idx, Scale);
}
template <int Scale> __m256d gather(const double *addr, __m128i idx)
{
return _mm256_i32gather_pd(addr, idx, Scale);
}
template <int Scale> __m256i gather(const int *addr, __m256i idx)
{
return _mm256_i32gather_epi32(addr, idx, Scale);
}
template <int Scale> __m256i gather(const unsigned *addr, __m256i idx)
{
return _mm256_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
}
template <int Scale> __m256 gather(__m256 src, __m256 k, const float *addr, __m256i idx)
{
return _mm256_mask_i32gather_ps(src, addr, idx, k, Scale);
}
template <int Scale>
__m256d gather(__m256d src, __m256d k, const double *addr, __m128i idx)
{
return _mm256_mask_i32gather_pd(src, addr, idx, k, Scale);
}
template <int Scale> __m256i gather(__m256i src, __m256i k, const int *addr, __m256i idx)
{
return _mm256_mask_i32gather_epi32(src, addr, idx, k, Scale);
}
template <int Scale>
__m256i gather(__m256i src, __m256i k, const unsigned *addr, __m256i idx)
{
return _mm256_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
}
#endif
}
}
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
using namespace AvxIntrinsics;
}
namespace AVX2
{
using namespace AvxIntrinsics;
}
namespace AVX
{
template<typename T> struct VectorTypeHelper;
template<> struct VectorTypeHelper< char > { typedef __m256i Type; };
template<> struct VectorTypeHelper< signed char > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned char > { typedef __m256i Type; };
template<> struct VectorTypeHelper< short> { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned short> { typedef __m256i Type; };
template<> struct VectorTypeHelper< int > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned int > { typedef __m256i Type; };
template<> struct VectorTypeHelper< long > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned long > { typedef __m256i Type; };
template<> struct VectorTypeHelper< long long> { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned long long> { typedef __m256i Type; };
template<> struct VectorTypeHelper< float> { typedef __m256 Type; };
template<> struct VectorTypeHelper< double> { typedef __m256d Type; };
template <typename T>
using IntegerVectorType =
typename std::conditional<sizeof(T) == 16, __m128i, __m256i>::type;
template <typename T>
using DoubleVectorType =
typename std::conditional<sizeof(T) == 16, __m128d, __m256d>::type;
template <typename T>
using FloatVectorType =
typename std::conditional<sizeof(T) == 16, __m128, __m256>::type;
template<typename T> struct VectorHelper {};
template<typename T> struct VectorHelperSize;
}
}
#endif
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename V> inline V zero();
}
namespace Common
{
namespace Detail
{
#ifdef Vc_IMPL_AVX
template <typename ValueType, size_t Size> struct IntrinsicType {
using type = typename std::conditional<
std::is_integral<ValueType>::value,
typename std::conditional<sizeof(ValueType) * Size == 16, __m128i, __m256i>::type,
typename std::conditional<
std::is_same<ValueType, double>::value,
typename std::conditional<sizeof(ValueType) * Size == 16, __m128d,
__m256d>::type,
typename std::conditional<sizeof(ValueType) * Size == 16, __m128,
__m256>::type>::type>::type;
};
#elif defined Vc_IMPL_SSE
template <typename ValueType, size_t Size> struct IntrinsicType {
using type = typename std::conditional<
std::is_integral<ValueType>::value, __m128i,
typename std::conditional<std::is_same<ValueType, double>::value, __m128d,
__m128>::type>::type;
};
#else
template <typename ValueType, size_t Size> struct IntrinsicType {
static_assert(Size == 1,
"IntrinsicType without SIMD target support may only have Size = 1");
using type = ValueType;
};
#endif
template <typename ValueType, size_t Size, size_t Bytes = sizeof(ValueType) * Size>
struct BuiltinType;
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
#define Vc_VECBUILTIN __attribute__((__vector_size__(16)))
template <size_t Size> struct BuiltinType< double , Size, 16> { typedef double type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< float , Size, 16> { typedef float type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long long, Size, 16> { typedef long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long long, Size, 16> { typedef unsigned long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long , Size, 16> { typedef long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long , Size, 16> { typedef unsigned long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< int , Size, 16> { typedef int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned int , Size, 16> { typedef unsigned int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< short , Size, 16> { typedef short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned short , Size, 16> { typedef unsigned short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< char , Size, 16> { typedef char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned char , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< signed char , Size, 16> { typedef signed char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< bool , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
#undef Vc_VECBUILTIN
#define Vc_VECBUILTIN __attribute__((__vector_size__(32)))
template <size_t Size> struct BuiltinType< double , Size, 32> { typedef double type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< float , Size, 32> { typedef float type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long long, Size, 32> { typedef long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long long, Size, 32> { typedef unsigned long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long , Size, 32> { typedef long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long , Size, 32> { typedef unsigned long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< int , Size, 32> { typedef int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned int , Size, 32> { typedef unsigned int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< short , Size, 32> { typedef short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned short , Size, 32> { typedef unsigned short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< char , Size, 32> { typedef char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned char , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< signed char , Size, 32> { typedef signed char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< bool , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
#undef Vc_VECBUILTIN
#endif
}
template <typename ValueType, size_t Size>
using IntrinsicType = typename Detail::IntrinsicType<ValueType, Size>::type;
template <typename ValueType, size_t Size>
using BuiltinType = typename Detail::BuiltinType<ValueType, Size>::type;
namespace AliasStrategy
{
struct Union {};
struct MayAlias {};
struct VectorBuiltin {};
struct UnionMembers {};
}
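// DefaultStrategy picks how Storage reads and writes single elements of a SIMD
// register: GCC/Clang vector builtins when available, union members on MSVC, union
// type-punning on ICC, and may_alias casts on other GCC-compatible compilers.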
using DefaultStrategy =
#if defined Vc_USE_BUILTIN_VECTOR_TYPES
AliasStrategy::VectorBuiltin;
#elif defined Vc_MSVC
AliasStrategy::UnionMembers;
#elif defined Vc_ICC
AliasStrategy::Union;
#elif defined __GNUC__
AliasStrategy::MayAlias;
#else
AliasStrategy::Union;
#endif
template <typename ValueType, size_t Size, typename Strategy = DefaultStrategy>
class Storage;
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::Union>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
public:
using VectorType = IntrinsicType<ValueType, Size>;
using EntryType = ValueType;
union Alias {
Vc_INTRINSIC Alias(VectorType vv) : v(vv) {}
VectorType v;
EntryType m[Size];
};
Vc_INTRINSIC Storage() : data(Vc::Detail::zero<VectorType>()) {}
Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); }
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(reinterpret_cast<VectorType>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC operator const VectorType &() const { return data; }
Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return Alias(data).m[i]; }
Vc_INTRINSIC void set(size_t i, EntryType x)
{
Alias a(data);
a.m[i] = x;
data = a.v;
}
private:
VectorType data;
};
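// MayAlias strategy: the intrinsic value is stored directly; single entries are
// read and written through aliasing_cast<EntryType> on its address.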
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::MayAlias>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
public:
using VectorType = IntrinsicType<ValueType, Size>;
using EntryType = ValueType;
Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
Vc_INTRINSIC Storage(const VectorType &x) : data(x)
{
assertCorrectAlignment(&data);
}
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(reinterpret_cast<const VectorType &>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage &operator=(const VectorType &x)
{
data = x;
return *this;
}
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC operator const VectorType &() const { return v(); }
Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const
{
return aliasing_cast<EntryType>(&data)[i];
}
Vc_INTRINSIC void set(size_t i, EntryType x)
{
aliasing_cast<EntryType>(&data)[i] = x;
}
private:
VectorType data;
};
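// VectorBuiltin strategy: the data member is a GCC/Clang vector builtin, so
// operator[] gives direct element access; v() reinterprets it as the intrinsic type.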
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::VectorBuiltin>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
using Builtin = BuiltinType<ValueType, Size>;
public:
using VectorType =
#ifdef Vc_TEMPLATES_DROP_ATTRIBUTES
MayAlias<IntrinsicType<ValueType, Size>>;
#else
IntrinsicType<ValueType, Size>;
#endif
using EntryType = ValueType;
Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC Storage(const VectorType &x)
: data(aliasing_cast<Builtin>(x))
{
assertCorrectAlignment(&data);
}
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(aliasing_cast<Builtin>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage &operator=(const VectorType &x)
{
data = aliasing_cast<Builtin>(x);
return *this;
}
Vc_INTRINSIC operator const VectorType &() const { return v(); }
Vc_INTRINSIC Vc_PURE VectorType &v() { return reinterpret_cast<VectorType &>(data); }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return reinterpret_cast<const VectorType &>(data); }
Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return data[i]; }
Vc_INTRINSIC void set(size_t i, EntryType x) { data[i] = x; }
Vc_INTRINSIC Builtin &builtin() { return data; }
Vc_INTRINSIC const Builtin &builtin() const { return data; }
private:
Builtin data;
};
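// UnionMembers strategy (MSVC): m() and ref() go through the named array members
// of the intrinsic union types; the specializations below map each element type
// to the matching member.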
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::UnionMembers>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
public:
using VectorType = IntrinsicType<ValueType, Size>;
using EntryType = ValueType;
Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
Vc_INTRINSIC Storage(const VectorType &x) : data(x)
{
assertCorrectAlignment(&data);
}
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(reinterpret_cast<const VectorType &>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage &operator=(const VectorType &x)
{
data = x;
return *this;
}
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
Vc_INTRINSIC_L Vc_PURE_L EntryType m(size_t i) const Vc_INTRINSIC_R Vc_PURE_R;
Vc_INTRINSIC void set(size_t i, EntryType x) { ref(i) = x; }
private:
Vc_INTRINSIC_L Vc_PURE_L EntryType &ref(size_t i) Vc_INTRINSIC_R Vc_PURE_R;
VectorType data;
};
#ifdef Vc_MSVC
template <> Vc_INTRINSIC Vc_PURE double Storage< double, 2, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float Storage< float , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i8[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u8[i]; }
template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 2, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m128i_i8[i]); }
template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u8[i]; }
#ifdef Vc_IMPL_AVX
template <> Vc_INTRINSIC Vc_PURE double Storage< double, 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float Storage< float , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i8[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u8[i]; }
template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m256i_i8[i]); }
template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u8[i]; }
#endif
#endif
template <typename VectorType, typename EntryType>
using VectorMemoryUnion = Storage<EntryType, sizeof(VectorType) / sizeof(EntryType)>;
}
}
#endif
#ifndef VC_SSE_CONST_DATA_H_
#define VC_SSE_CONST_DATA_H_
#ifndef VC_SSE_MACROS_H_
#define VC_SSE_MACROS_H_
#if defined(Vc_IMPL_SSE4_1) && !defined(Vc_DISABLE_PTEST)
#define Vc_USE_PTEST
#endif
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
alignas(16) extern const unsigned int _IndexesFromZero4[4];
alignas(16) extern const unsigned short _IndexesFromZero8[8];
alignas(16) extern const unsigned char _IndexesFromZero16[16];
struct c_general
{
alignas(64) static const int absMaskFloat[4];
alignas(16) static const unsigned int signMaskFloat[4];
alignas(16) static const unsigned int highMaskFloat[4];
alignas(16) static const short minShort[8];
alignas(16) static const unsigned short one16[8];
alignas(16) static const unsigned int one32[4];
alignas(16) static const float oneFloat[4];
alignas(16) static const unsigned long long highMaskDouble[2];
alignas(16) static const double oneDouble[2];
alignas(16) static const long long absMaskDouble[2];
alignas(16) static const unsigned long long signMaskDouble[2];
alignas(16) static const unsigned long long frexpMask[2];
};
template<typename T> struct c_trig
{
alignas(64) static const T data[];
};
#ifndef Vc_MSVC
template <> alignas(64) const float c_trig<float>::data[];
template <> alignas(64) const double c_trig<double>::data[];
#endif
template<typename T> struct c_log
{
enum VectorSize { Size = 16 / sizeof(T) };
static Vc_ALWAYS_INLINE Vc_CONST const float *d(int i) { return reinterpret_cast<const float *>(&data[i * Size]); }
alignas(64) static const unsigned int data[21 * Size];
};
#ifndef Vc_MSVC
template<> alignas(64) const unsigned int c_log<float>::data[21 * 4];
#endif
template<> struct c_log<double>
{
enum VectorSize { Size = 16 / sizeof(double) };
static Vc_ALWAYS_INLINE Vc_CONST const double *d(int i) { return reinterpret_cast<const double *>(&data[i * Size]); }
alignas(64) static const unsigned long long data[21 * Size];
};
}
}
#endif
#include <cstdlib>
#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
using SSE::c_general;
constexpr std::size_t VectorAlignment = 16;
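// For GCC older than 4.6 the psllw/pslld/psllq (and srl) builtins are replaced
// with inline-asm equivalents, presumably to work around miscompilation in those
// releases; defining Vc_DONT_FIX_SSE_SHIFT disables the workaround.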
#if defined(Vc_GCC) && Vc_GCC < 0x40600 && !defined(Vc_DONT_FIX_SSE_SHIFT)
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
#endif
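// Under GCC the basic float/double add/sub/mul wrappers are written with the
// vector-builtin operators instead of the intrinsics, which appears intended to
// let the optimizer treat them as plain arithmetic.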
#ifdef Vc_GCC
static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128 _mm_mul_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
static Vc_INTRINSIC Vc_CONST __m128 _mm_add_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
static Vc_INTRINSIC Vc_CONST __m128 _mm_sub_ps(__m128 a, __m128 b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
#endif
static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST __m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); }
static Vc_INTRINSIC __m128 Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
static Vc_INTRINSIC __m128 Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
static Vc_INTRINSIC __m128 Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }
static Vc_INTRINSIC __m128i Vc_CONST setmin_epi8 () { return _mm_set1_epi8(-0x80); }
static Vc_INTRINSIC __m128i Vc_CONST setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
static Vc_INTRINSIC __m128i Vc_CONST setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }
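// Unsigned integer compares: XOP provides them directly; otherwise flip the sign
// bit of both operands so the signed compare instructions yield the unsigned ordering.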
#if defined(Vc_IMPL_XOP)
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b) { return _mm_comgt_epu8(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b) { return _mm_comlt_epu32(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b) { return _mm_comgt_epu32(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu64(__m128i a, __m128i b) { return _mm_comlt_epu64(a, b); }
#else
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b)
{
return _mm_cmpgt_epi8(_mm_xor_si128(a, setmin_epi8()),
_mm_xor_si128(b, setmin_epi8()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b)
{
return _mm_cmplt_epi16(_mm_xor_si128(a, setmin_epi16()),
_mm_xor_si128(b, setmin_epi16()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b)
{
return _mm_cmpgt_epi16(_mm_xor_si128(a, setmin_epi16()),
_mm_xor_si128(b, setmin_epi16()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b)
{
return _mm_cmplt_epi32(_mm_xor_si128(a, setmin_epi32()),
_mm_xor_si128(b, setmin_epi32()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b)
{
return _mm_cmpgt_epi32(_mm_xor_si128(a, setmin_epi32()),
_mm_xor_si128(b, setmin_epi32()));
}
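// 64-bit signed greater-than: use pcmpgtq with SSE4.2, otherwise combine the
// 32-bit compare of the high halves with the (unsigned) low-half compare wherever
// the high halves are equal.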
Vc_INTRINSIC __m128i Vc_CONST cmpgt_epi64(__m128i a, __m128i b)
{
#ifdef Vc_IMPL_SSE4_2
return _mm_cmpgt_epi64(a, b);
#else
const auto aa = _mm_xor_si128(a, _mm_srli_epi64(setmin_epi32(),32));
const auto bb = _mm_xor_si128(b, _mm_srli_epi64(setmin_epi32(),32));
const auto gt = _mm_cmpgt_epi32(aa, bb);
const auto eq = _mm_cmpeq_epi32(aa, bb);
const auto gt2 =
_mm_shuffle_epi32(gt, 0xf5);
const auto lo =
_mm_shuffle_epi32(_mm_and_si128(_mm_srli_epi64(eq, 32), gt), 0xa0);
return _mm_or_si128(gt2, lo);
#endif
}
#endif
}
}
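// abs_epi8/16/32 and alignr_epi8 map to SSSE3 instructions when available; the
// fallback below emulates them with SSE2 (alignr via a shift/or switch over all
// 32 possible byte offsets).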
#ifdef Vc_IMPL_SSSE3
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
Vc_INTRINSIC Vc_CONST __m128i abs_epi8(__m128i a) { return _mm_abs_epi8(a); }
Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) { return _mm_abs_epi16(a); }
Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) { return _mm_abs_epi32(a); }
template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
{
return _mm_alignr_epi8(a, b, s & 0x1fu);
}
}
}
#else
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
Vc_INTRINSIC Vc_CONST __m128i abs_epi8 (__m128i a) {
__m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_set1_epi8(1)));
}
Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) {
__m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
}
Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) {
__m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
}
template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
{
switch (s & 0x1fu) {
case 0: return b;
case 1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b, 1));
case 2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b, 2));
case 3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b, 3));
case 4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b, 4));
case 5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b, 5));
case 6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b, 6));
case 7: return _mm_or_si128(_mm_slli_si128(a, 9), _mm_srli_si128(b, 7));
case 8: return _mm_or_si128(_mm_slli_si128(a, 8), _mm_srli_si128(b, 8));
case 9: return _mm_or_si128(_mm_slli_si128(a, 7), _mm_srli_si128(b, 9));
case 10: return _mm_or_si128(_mm_slli_si128(a, 6), _mm_srli_si128(b, 10));
case 11: return _mm_or_si128(_mm_slli_si128(a, 5), _mm_srli_si128(b, 11));
case 12: return _mm_or_si128(_mm_slli_si128(a, 4), _mm_srli_si128(b, 12));
case 13: return _mm_or_si128(_mm_slli_si128(a, 3), _mm_srli_si128(b, 13));
case 14: return _mm_or_si128(_mm_slli_si128(a, 2), _mm_srli_si128(b, 14));
case 15: return _mm_or_si128(_mm_slli_si128(a, 1), _mm_srli_si128(b, 15));
case 16: return a;
case 17: return _mm_srli_si128(a, 1);
case 18: return _mm_srli_si128(a, 2);
case 19: return _mm_srli_si128(a, 3);
case 20: return _mm_srli_si128(a, 4);
case 21: return _mm_srli_si128(a, 5);
case 22: return _mm_srli_si128(a, 6);
case 23: return _mm_srli_si128(a, 7);
case 24: return _mm_srli_si128(a, 8);
case 25: return _mm_srli_si128(a, 9);
case 26: return _mm_srli_si128(a, 10);
case 27: return _mm_srli_si128(a, 11);
case 28: return _mm_srli_si128(a, 12);
case 29: return _mm_srli_si128(a, 13);
case 30: return _mm_srli_si128(a, 14);
case 31: return _mm_srli_si128(a, 15);
}
return _mm_setzero_si128();
}
}
}
#endif
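// SSE4.1 helpers: cmpeq_epi64, extract, blends, integer min/max and zero/sign
// extension. Without SSE4.1 the #else branch emulates each of them with SSE2 sequences.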
#ifdef Vc_IMPL_SSE4_1
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b)
{
return _mm_cmpeq_epi64(a, b);
}
template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
{
return _mm_extract_epi32(v, index);
}
Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c)
{
return _mm_blendv_pd(a, b, c);
}
Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c)
{
return _mm_blendv_ps(a, b, c);
}
Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c)
{
return _mm_blendv_epi8(a, b, c);
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
{
return _mm_blend_pd(a, b, mask);
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
{
return _mm_blend_ps(a, b, mask);
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
{
return _mm_blend_epi16(a, b, mask);
}
Vc_INTRINSIC Vc_CONST __m128i max_epi8(__m128i a, __m128i b)
{
return _mm_max_epi8(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b)
{
return _mm_max_epi32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b)
{
return _mm_max_epu16(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b)
{
return _mm_max_epu32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b)
{
return _mm_min_epu16(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b)
{
return _mm_min_epu32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epi8(__m128i a, __m128i b)
{
return _mm_min_epi8(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b)
{
return _mm_min_epi32(a, b);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8)
{
return _mm_cvtepu8_epi16(epu8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8)
{
return _mm_cvtepi8_epi16(epi8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16)
{
return _mm_cvtepu16_epi32(epu16);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16)
{
return _mm_cvtepi16_epi32(epu16);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8)
{
return _mm_cvtepu8_epi32(epu8);
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8)
{
return _mm_cvtepi8_epi32(epi8);
}
}
}
#else
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) {
auto tmp = _mm_cmpeq_epi32(a, b);
return _mm_and_si128(tmp, _mm_shuffle_epi32(tmp, 1*1 + 0*4 + 3*16 + 2*64));
}
template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
{
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
typedef int int32v4 __attribute__((__vector_size__(16)));
return aliasing_cast<int32v4>(v)[index];
#else
return _mm_cvtsi128_si32(_mm_srli_si128(v, index * 4));
#endif
}
Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) {
#ifdef Vc_GCC
return reinterpret_cast<__m128d>(
(~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
(reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
#else
return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
#endif
}
Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c) {
#ifdef Vc_GCC
return reinterpret_cast<__m128>(
(~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
(reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
#else
return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
#endif
}
Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) {
#ifdef Vc_GCC
return (~c & a) | (c & b);
#else
return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
#endif
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
{
switch (mask) {
case 0x0:
return a;
case 0x1:
return _mm_shuffle_pd(b, a, 2);
case 0x2:
return _mm_shuffle_pd(a, b, 2);
case 0x3:
return b;
default:
abort();
return a;
}
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
{
__m128i c;
switch (mask) {
case 0x0:
return a;
case 0x1:
c = _mm_srli_si128(_mm_setallone_si128(), 12);
break;
case 0x2:
c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
break;
case 0x3:
c = _mm_srli_si128(_mm_setallone_si128(), 8);
break;
case 0x4:
c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
break;
case 0x5:
c = _mm_set_epi32(0, -1, 0, -1);
break;
case 0x6:
c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
break;
case 0x7:
c = _mm_srli_si128(_mm_setallone_si128(), 4);
break;
case 0x8:
c = _mm_slli_si128(_mm_setallone_si128(), 12);
break;
case 0x9:
c = _mm_set_epi32(-1, 0, 0, -1);
break;
case 0xa:
c = _mm_set_epi32(-1, 0, -1, 0);
break;
case 0xb:
c = _mm_set_epi32(-1, 0, -1, -1);
break;
case 0xc:
c = _mm_slli_si128(_mm_setallone_si128(), 8);
break;
case 0xd:
c = _mm_set_epi32(-1, -1, 0, -1);
break;
case 0xe:
c = _mm_slli_si128(_mm_setallone_si128(), 4);
break;
case 0xf:
return b;
default:
abort();
c = _mm_setzero_si128();
break;
}
__m128 _c = _mm_castsi128_ps(c);
return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
{
__m128i c;
switch (mask) {
case 0x00:
return a;
case 0x01:
c = _mm_srli_si128(_mm_setallone_si128(), 14);
break;
case 0x03:
c = _mm_srli_si128(_mm_setallone_si128(), 12);
break;
case 0x07:
c = _mm_srli_si128(_mm_setallone_si128(), 10);
break;
case 0x0f:
return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
case 0x1f:
c = _mm_srli_si128(_mm_setallone_si128(), 6);
break;
case 0x3f:
c = _mm_srli_si128(_mm_setallone_si128(), 4);
break;
case 0x7f:
c = _mm_srli_si128(_mm_setallone_si128(), 2);
break;
case 0x80:
c = _mm_slli_si128(_mm_setallone_si128(), 14);
break;
case 0xc0:
c = _mm_slli_si128(_mm_setallone_si128(), 12);
break;
case 0xe0:
c = _mm_slli_si128(_mm_setallone_si128(), 10);
break;
case 0xf0:
c = _mm_slli_si128(_mm_setallone_si128(), 8);
break;
case 0xf8:
c = _mm_slli_si128(_mm_setallone_si128(), 6);
break;
case 0xfc:
c = _mm_slli_si128(_mm_setallone_si128(), 4);
break;
case 0xfe:
c = _mm_slli_si128(_mm_setallone_si128(), 2);
break;
case 0xff:
return b;
case 0xcc:
return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
case 0x33:
return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
default:
const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
break;
}
return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
}
Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) {
return blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
}
Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) {
return blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) {
return blendv_epi8(b, a, cmpgt_epu16(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) {
return blendv_epi8(b, a, cmpgt_epu32(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) {
return blendv_epi8(a, b, cmpgt_epu16(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) {
return blendv_epi8(a, b, cmpgt_epu32(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) {
return blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
}
Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) {
return blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8) {
return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) {
return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) {
return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16) {
return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128()));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) {
return cvtepu16_epi32(cvtepu8_epi16(epu8));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) {
const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
}
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
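// Streaming (non-temporal) loads: with SSE4.1 these use MOVNTDQA
// (_mm_stream_load_si128); otherwise they fall back to ordinary aligned loads.
// All integer element types funnel through the int overload.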
static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) {
#ifdef Vc_IMPL_SSE4_1
return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
#else
return _mm_load_ps(mem);
#endif
}
static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
#ifdef Vc_IMPL_SSE4_1
return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
#else
return _mm_load_pd(mem);
#endif
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
#ifdef Vc_IMPL_SSE4_1
return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
#else
return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
#endif
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
#ifndef __x86_64__
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
}
#endif
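// AVX2 gathers: Scale is the byte multiplier applied to each 32-bit index. The
// masked overloads load only lanes whose mask sign bit is set and keep the src
// value in the remaining lanes.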
#ifdef Vc_IMPL_AVX2
template <int Scale> __m128 gather(const float *addr, __m128i idx)
{
return _mm_i32gather_ps(addr, idx, Scale);
}
template <int Scale> __m128d gather(const double *addr, __m128i idx)
{
return _mm_i32gather_pd(addr, idx, Scale);
}
template <int Scale> __m128i gather(const int *addr, __m128i idx)
{
return _mm_i32gather_epi32(addr, idx, Scale);
}
template <int Scale> __m128i gather(const unsigned *addr, __m128i idx)
{
return _mm_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
}
template <int Scale> __m128 gather(__m128 src, __m128 k, const float *addr, __m128i idx)
{
return _mm_mask_i32gather_ps(src, addr, idx, k, Scale);
}
template <int Scale>
__m128d gather(__m128d src, __m128d k, const double *addr, __m128i idx)
{
return _mm_mask_i32gather_pd(src, addr, idx, k, Scale);
}
template <int Scale> __m128i gather(__m128i src, __m128i k, const int *addr, __m128i idx)
{
return _mm_mask_i32gather_epi32(src, addr, idx, k, Scale);
}
template <int Scale>
__m128i gather(__m128i src, __m128i k, const unsigned *addr, __m128i idx)
{
return _mm_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
}
#endif
}
}
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
using namespace SseIntrinsics;
template <typename T> struct ParameterHelper
{
typedef T ByValue;
typedef T &Reference;
typedef const T &ConstRef;
};
template <typename T> struct VectorHelper
{
};
template <typename T> struct VectorTypeHelper
{
typedef __m128i Type;
};
template <> struct VectorTypeHelper<double>
{
typedef __m128d Type;
};
template <> struct VectorTypeHelper<float>
{
typedef __m128 Type;
};
template <typename T> struct DetermineGatherMask
{
typedef T Type;
};
template <typename T> struct VectorTraits
{
typedef typename VectorTypeHelper<T>::Type VectorType;
using EntryType = T;
static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
typedef Mask<T> MaskType;
typedef typename DetermineGatherMask<MaskType>::Type GatherMaskType;
typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
};
template <typename T> struct VectorHelperSize;
}
}
#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
#pragma GCC diagnostic pop
#endif
#ifndef VC_SSE_SHUFFLE_H_
#define VC_SSE_SHUFFLE_H_
namespace Vc_VERSIONED_NAMESPACE
{
enum VecPos {
X0, X1, X2, X3, X4, X5, X6, X7,
Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7,
Const0
};
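// VecPos names the source lane: X0..X7 refer to the first operand, Y0..Y7 to the
// second. The Mem:: functions take destination lanes in memory order, i.e. the
// first template argument is the element that ends up in lane 0.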
namespace Mem
{
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
static_assert(Dst0 >= X0 && Dst1 >= Y0, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= Y1, "Incorrect_Range");
return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2);
}
template <VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3>
Vc_INTRINSIC Vc_CONST __m128i shuffle(__m128i x, __m128i y)
{
return _mm_castps_si128(shuffle<Dst0, Dst1, Dst2, Dst3>(_mm_castsi128_ps(x),
_mm_castsi128_ps(y)));
}
template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
return Vc::SseIntrinsics::blend_pd<(Dst0 / Y0) + (Dst1 / Y0) * 2>(x, y);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
return Vc::SseIntrinsics::blend_ps<(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
(Dst2 / Y2) * 4 + (Dst3 / Y3) * 8>(x, y);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
return Vc::SseIntrinsics::blend_epi16<
(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
(Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 +
(Dst7 / Y7) * 128>(x, y);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE Vc_CONST __m128d permute(__m128d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
return _mm_shuffle_pd(x, x, Dst0 + Dst1 * 2); // imm8 bit 0 selects the low element, bit 1 the high element
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) {
static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
static_assert(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, "Incorrect_Range");
static_assert(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, "Incorrect_Range");
if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
}
return x;
}
}
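// The Reg:: variants take the destination lanes in the opposite (register,
// most-significant-first) order and forward to the Mem:: implementations.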
namespace Reg
{
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
}
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
return Mem::shuffle<Dst0, Dst1>(x, y);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64));
}
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
return Mem::blend<Dst0, Dst1>(x, y);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
}
}
}
#endif
#endif
#ifndef VC_SSE_VECTORHELPER_H_
#define VC_SSE_VECTORHELPER_H_
#include <limits>
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
#define Vc_OP0(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
#define Vc_OP1(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
#define Vc_OP2(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
#define Vc_OP3(name,code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }
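// VectorHelper specialized on the intrinsic register type: load/store overloads
// are selected via the Flags tag types (aligned, unaligned, streaming), plus
// allone/zero/blend helpers generated by the Vc_OP* macros above.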
template<> struct VectorHelper<__m128>
{
typedef __m128 VectorType;
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_ps(x); }
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_ps(x); }
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); }
Vc_OP0(allone, _mm_setallone_ps())
Vc_OP0(zero, _mm_setzero_ps())
Vc_OP3(blend, blendv_ps(a, b, c))
};
template<> struct VectorHelper<__m128d>
{
typedef __m128d VectorType;
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_pd(x); }
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_pd(x); }
template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); }
Vc_OP0(allone, _mm_setallone_pd())
Vc_OP0(zero, _mm_setzero_pd())
Vc_OP3(blend, blendv_pd(a, b, c))
};
template<> struct VectorHelper<__m128i>
{
typedef __m128i VectorType;
template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_si128(reinterpret_cast<const VectorType *>(x)); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x)); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_si128(reinterpret_cast<VectorType *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); }
Vc_OP0(allone, _mm_setallone_si128())
Vc_OP0(zero, _mm_setzero_si128())
Vc_OP3(blend, blendv_epi8(a, b, c))
};
#undef Vc_OP1
#undef Vc_OP2
#undef Vc_OP3
#define Vc_OP1(op) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_ , Vc_SUFFIX)(a, b); }
#define Vc_OP_(op) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op , Vc_SUFFIX)(a, b); }
#define Vc_OPx(op,op2) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); }
#define Vc_OP_CAST_(op) \
static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \
_mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \
Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \
}
#define Vc_MINMAX \
static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \
static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); }
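// Per-EntryType VectorHelper specializations. Vc_SUFFIX selects the intrinsic
// name suffix (pd, ps, epi32, ...) used by the Vc_OP* wrapper macros.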
template<> struct VectorHelper<double> {
typedef __m128d VectorType;
typedef double EntryType;
#define Vc_SUFFIX pd
Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
#ifdef Vc_IMPL_FMA4
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
v1 = _mm_macc_pd(v1, v2, v3);
}
#else
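// Emulated fma for double: split v1 and v2 into high/low halves with
// highMaskDouble so the high-half product stays (nearly) exact, then sum the
// partial products and v3 smallest-magnitude first to limit the rounding error.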
static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
asm("":"+x"(h1), "+x"(h2));
#endif
const VectorType l1 = _mm_sub_pd(v1, h1);
const VectorType l2 = _mm_sub_pd(v2, h2);
const VectorType ll = mul(l1, l2);
const VectorType lh = add(mul(l1, h2), mul(h1, l2));
const VectorType hh = mul(h1, h2);
const VectorType lh_lt_v3 = _mm_cmplt_pd(abs(lh), abs(v3));
const VectorType b = blendv_pd(v3, lh, lh_lt_v3);
const VectorType c = blendv_pd(lh, v3, lh_lt_v3);
v1 = add(add(ll, b), add(c, hh));
}
#endif
Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
Vc_OP1(sqrt)
static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
return _mm_div_pd(one(), sqrt(x));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
return _mm_div_pd(one(), x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
return _mm_cmpunord_pd(x, x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
return _mm_castsi128_pd(cmpeq_epi64(_mm_castpd_si128(abs(x)), _mm_castpd_si128(_mm_load_pd(c_log<double>::d(1)))));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_pd());
}
Vc_MINMAX
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
return _mm_cvtsd_f64(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
return _mm_cvtsd_f64(a);
}
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
return _mm_round_pd(a, _MM_FROUND_NINT);
#else
return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
#endif
}
};
template<> struct VectorHelper<float> {
typedef float EntryType;
typedef __m128 VectorType;
#define Vc_SUFFIX ps
Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST __m128 concat(__m128d a, __m128d b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }
#ifdef Vc_IMPL_FMA4
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
v1 = _mm_macc_ps(v1, v2, v3);
}
#else
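// Emulated fma for float: widen both halves to double, do the multiply-add in
// double precision (where the float products are exact), and pack the results
// back, which comes much closer to a true fma than mul+add in float.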
static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
__m128d v1_0 = _mm_cvtps_pd(v1);
__m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
__m128d v2_0 = _mm_cvtps_pd(v2);
__m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
__m128d v3_0 = _mm_cvtps_pd(v3);
__m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
v1 = _mm_movelh_ps(
_mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
_mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
}
#endif
Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
Vc_OP1(sqrt) Vc_OP1(rsqrt)
static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
return _mm_cmpunord_ps(x, x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log<float>::d(1)))));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
return _mm_rcp_ps(x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps());
}
Vc_MINMAX
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = _mm_min_ps(a, _mm_movehl_ps(a, a));
a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = _mm_max_ps(a, _mm_movehl_ps(a, a));
a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(a);
}
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
return _mm_round_ps(a, _MM_FROUND_NINT);
#else
return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
#endif
}
};
template<> struct VectorHelper<int> {
typedef int EntryType;
typedef __m128i VectorType;
#define Vc_SUFFIX si128
Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
#ifdef Vc_IMPL_SSE4_1
static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
#else
static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
const VectorType aShift = _mm_srli_si128(a, 4);
const VectorType ab02 = _mm_mul_epu32(a, b);
const VectorType bShift = _mm_srli_si128(b, 4);
const VectorType ab13 = _mm_mul_epu32(aShift, bShift);
return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
}
#endif
Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
template<> struct VectorHelper<unsigned int> {
typedef unsigned int EntryType;
typedef __m128i VectorType;
#define Vc_SUFFIX si128
Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epu32
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu32(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu32(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
return VectorHelper<int>::mul(a, b);
}
#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
template<> struct VectorHelper<signed short> {
typedef __m128i VectorType;
typedef signed short EntryType;
#define Vc_SUFFIX si128
Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
}
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
v1 = add(mul(v1, v2), v3); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); }
Vc_OPx(mul, mullo)
Vc_OP(min) Vc_OP(max)
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
template<> struct VectorHelper<unsigned short> {
typedef __m128i VectorType;
typedef unsigned short EntryType;
#define Vc_SUFFIX si128
Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#ifdef Vc_IMPL_SSE4_1
static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }
#else
static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) {
auto tmp0 = _mm_unpacklo_epi16(a, b);
auto tmp1 = _mm_unpackhi_epi16(a, b);
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
#endif
static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); }
static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epu16
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || Vc_IMPL_SSE4_1
static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); }
#endif
#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
}
static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
Vc_OPx(mul, mullo)
#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
Vc_OP(min) Vc_OP(max)
#endif
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtsi128_si32(a);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
const EntryType d, const EntryType e, const EntryType f,
const EntryType g, const EntryType h) {
return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
}
Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx
#undef Vc_OP_CAST_
#undef Vc_MINMAX
}
}
#endif
#ifndef VC_SSE_MASK_H_
#define VC_SSE_MASK_H_
#ifndef VC_SSE_DETAIL_H_
#define VC_SSE_DETAIL_H_
#ifndef VC_SSE_CASTS_H_
#define VC_SSE_CASTS_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
using uint = unsigned int;
using ushort = unsigned short;
using uchar = unsigned char;
using schar = signed char;
template <typename To, typename From> Vc_ALWAYS_INLINE Vc_CONST To sse_cast(From v)
{
return v;
}
template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128 >(__m128 v) { return _mm_castps_si128(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128d>(__m128d v) { return _mm_castpd_si128(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128d>(__m128d v) { return _mm_castpd_ps(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128i>(__m128i v) { return _mm_castsi128_ps(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128i>(__m128i v) { return _mm_castsi128_pd(v); }
template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128 >(__m128 v) { return _mm_castps_pd(v); }
template <typename From, typename To> struct ConvertTag
{
};
template <typename From, typename To>
Vc_INTRINSIC typename VectorTraits<To>::VectorType convert(
typename VectorTraits<From>::VectorType v)
{
return convert(v, ConvertTag<From, To>());
}
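// The overloads below implement the individual element-type conversions, one per
// (source, destination) pair.  Conversions involving unsigned 32-bit integers need
// extra fix-ups because SSE only provides signed integer <-> float/double
// conversion instructions.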
Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , int >) { return _mm_cvttps_epi32(v); }
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, int >) { return _mm_cvttpd_epi32(v); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , int >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , int >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , int >) {
#ifdef Vc_IMPL_SSE4_1
return _mm_cvtepi16_epi32(v);
#else
return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16);
#endif
}
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, int >) {
#ifdef Vc_IMPL_SSE4_1
return _mm_cvtepu16_epi32(v);
#else
return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16);
#endif
}
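// float/double -> uint: values >= 2^31 do not fit the signed conversion
// instructions, so 2^31 is subtracted before converting and the sign bit is patched
// back in afterwards; where needed, a blend selects between the plain and the
// corrected conversion.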
Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , uint >) {
return _mm_castps_si128(
blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(v)),
_mm_castsi128_ps(_mm_xor_si128(
_mm_cvttps_epi32(_mm_sub_ps(v, _mm_set1_ps(1u << 31))),
_mm_set1_epi32(1 << 31))),
_mm_cmpge_ps(v, _mm_set1_ps(1u << 31))));
}
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, uint >) {
#ifdef Vc_IMPL_SSE4_1
return _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(_mm_floor_pd(v), _mm_set1_pd(0x80000000u))),
_mm_cvtsi64_si128(0x8000000080000000ull));
#else
return blendv_epi8(_mm_cvttpd_epi32(v),
_mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(v, _mm_set1_pd(0x80000000u))),
_mm_cvtsi64_si128(0x8000000080000000ull)),
_mm_castpd_si128(_mm_cmpge_pd(v, _mm_set1_pd(0x80000000u))));
#endif
}
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , uint >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , uint >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , uint >) { return convert(v, ConvertTag<short, int>()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, uint >) { return convert(v, ConvertTag<ushort, int>()); }
Vc_INTRINSIC __m128 convert(__m128 v, ConvertTag<float , float >) { return v; }
Vc_INTRINSIC __m128 convert(__m128d v, ConvertTag<double, float >) { return _mm_cvtpd_ps(v); }
Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<int , float >) { return _mm_cvtepi32_ps(v); }
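// uint -> float: inputs with the sign bit set are treated as 2^31 plus the
// remaining bits; bits 30..9 and the low 9 bits are converted separately and
// re-added so the rounding of the final single-precision sum stays correct, while
// a blend keeps the plain signed conversion for small inputs.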
Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<uint , float >) {
using namespace SSE;
return blendv_ps(_mm_cvtepi32_ps(v),
_mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(v, _mm_set1_epi32(0x7ffffe00))),
_mm_add_ps(_mm_set1_ps(1u << 31), _mm_cvtepi32_ps(_mm_and_si128(
v, _mm_set1_epi32(0x000001ff))))),
_mm_castsi128_ps(_mm_cmplt_epi32(v, _mm_setzero_si128())));
}
Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<short , float >) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, float>()); }
Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<ushort, float >) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, float>()); }
Vc_INTRINSIC __m128d convert(__m128 v, ConvertTag<float , double>) { return _mm_cvtps_pd(v); }
Vc_INTRINSIC __m128d convert(__m128d v, ConvertTag<double, double>) { return v; }
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<int , double>) { return _mm_cvtepi32_pd(v); }
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<uint , double>) { return _mm_add_pd(_mm_cvtepi32_pd(_mm_xor_si128(v, setmin_epi32())), _mm_set1_pd(1u << 31)); }
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , short >) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , short >) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, short >) { return v; }
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, short >) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128());
auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128());
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128());
auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128());
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , ushort>) { return convert(_mm_cvttps_epi32(v), ConvertTag<int, ushort>()); }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , ushort>) { return v; }
Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, ushort>) { return v; }
Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, ushort>()); }
}
}
#endif
#ifdef Vc_IMPL_AVX
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
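// Tag-dispatch machinery for the load functions below: LoadTag<V, DstT> encodes the
// destination register and element type, while when_aligned, when_unaligned and
// when_streaming select a load16() overload based on the alignment/streaming
// policy carried by the Flags type.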
template <typename V, typename DstT> struct LoadTag
{
};
class when_aligned
{
public:
template <typename F> constexpr when_aligned(F, typename F::EnableIfAligned = nullptr)
{
}
};
class when_unaligned
{
public:
template <typename F>
constexpr when_unaligned(F, typename F::EnableIfUnaligned = nullptr)
{
}
};
class when_streaming
{
public:
template <typename F>
constexpr when_streaming(F, typename F::EnableIfStreaming = nullptr)
{
}
};
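// Each of the three classes above is implicitly constructible only from flag types
// that provide the matching EnableIf* typedef, so the load16() overloads below are
// chosen purely by overload resolution on the flags argument.  Sketch (assuming
// `f` is an object of an aligned-flags type, e.g. the type behind Vc::Aligned):
//     __m128 v = Detail::load16(ptr, f);  // resolves to the _mm_load_ps overload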
Vc_INTRINSIC __m128 load16(const float *mem, when_aligned)
{
return _mm_load_ps(mem);
}
Vc_INTRINSIC __m128 load16(const float *mem, when_unaligned)
{
return _mm_loadu_ps(mem);
}
Vc_INTRINSIC __m128 load16(const float *mem, when_streaming)
{
return SseIntrinsics::_mm_stream_load(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_aligned)
{
return _mm_load_pd(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_unaligned)
{
return _mm_loadu_pd(mem);
}
Vc_INTRINSIC __m128d load16(const double *mem, when_streaming)
{
return SseIntrinsics::_mm_stream_load(mem);
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_aligned)
{
static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_unaligned)
{
static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem));
}
template <class T> Vc_INTRINSIC __m128i load16(const T *mem, when_streaming)
{
static_assert(std::is_integral<T>::value, "load16<T> is only intended for integral T");
return SseIntrinsics::_mm_stream_load(mem);
}
#ifdef Vc_MSVC
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128d load(const double *mem, F f,
enable_if<(std::is_same<DstT, double>::value &&
std::is_same<V, __m128d>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128 load(const float *mem, F f,
enable_if<(std::is_same<DstT, float>::value &&
std::is_same<V, __m128>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const uint *mem, F f,
enable_if<(std::is_same<DstT, uint>::value &&
std::is_same<V, __m128i>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const int *mem, F f,
enable_if<(std::is_same<DstT, int>::value &&
std::is_same<V, __m128i>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const short *mem, F f,
enable_if<(std::is_same<DstT, short>::value &&
std::is_same<V, __m128i>::value)> = nullarg)
{
return load16(mem, f);
}
template <typename V, typename DstT, typename F>
Vc_INTRINSIC __m128i load(const ushort *mem, F f,
enable_if<(std::is_same<DstT, ushort>::value &&
std::is_same<V, __m128i>::value)> = nullarg)
{
return load16(mem, f);
}
#endif
template <typename V, typename DstT, typename SrcT, typename Flags,
typename = enable_if<
#ifdef Vc_MSVC
!std::is_same<DstT, SrcT>::value &&
#endif
(!std::is_integral<DstT>::value || !std::is_integral<SrcT>::value ||
sizeof(DstT) >= sizeof(SrcT))>>
Vc_INTRINSIC V load(const SrcT *mem, Flags flags)
{
return load(mem, flags, LoadTag<V, DstT>());
}
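// The remaining load() overloads are converting loads: they read SrcT elements from
// memory and widen/convert them to the DstT element type of the destination vector
// (e.g. uchar -> int, short -> double), using a memory access that covers one
// vector's worth of source elements.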
template <typename V, typename T, typename Flags>
Vc_INTRINSIC V
load(const T *mem, Flags, LoadTag<V, T>, enable_if<sizeof(V) == 16> = nullarg)
{
return SSE::VectorHelper<V>::template load<Flags>(mem);
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, short>)
{
return SSE::VectorHelper<__m128i>::load<Flags>(mem);
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, short>)
{
return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, short>)
{
return SSE::cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, ushort>)
{
return SSE::cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uint *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::VectorHelper<__m128i>::load<Flags>(mem);
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const short *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const schar *mem, Flags, LoadTag<__m128i, int>)
{
return SSE::cvtepi8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const ushort *mem, Flags, LoadTag<__m128i, uint>)
{
return SSE::cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128i load(const uchar *mem, Flags, LoadTag<__m128i, uint>)
{
return SSE::cvtepu8_epi32(_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const float *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<float, double>(
_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64 *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const uint *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<uint, double>(
_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const int *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<int, double>(
_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const ushort *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<ushort, double>(
_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const short *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<short, double>(
_mm_cvtsi32_si128(*aliasing_cast<int>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const uchar *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<uchar, double>(
_mm_set1_epi16(*aliasing_cast<short>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128d load(const schar *mem, Flags, LoadTag<__m128d, double>)
{
return SSE::convert<char, double>(
_mm_set1_epi16(*aliasing_cast<short>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m128 load(const double *mem, Flags, LoadTag<__m128, float>)
{
#ifdef Vc_IMPL_AVX
if (Flags::IsUnaligned) {
return _mm256_cvtpd_ps(_mm256_loadu_pd(mem));
} else if (Flags::IsStreaming) {
return _mm256_cvtpd_ps(AvxIntrinsics::stream_load<__m256d>(mem));
} else {
return _mm256_cvtpd_ps(_mm256_load_pd(mem));
}
#else
return _mm_movelh_ps(_mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[0])),
_mm_cvtpd_ps(SSE::VectorHelper<__m128d>::load<Flags>(&mem[2])));
#endif
}
template <typename Flags>
Vc_INTRINSIC __m128 load(const uint *mem, Flags f, LoadTag<__m128, float>)
{
return SSE::convert<uint, float>(load<__m128i, uint>(mem, f));
}
template <typename T, typename Flags,
typename = enable_if<!std::is_same<T, float>::value>>
Vc_INTRINSIC __m128 load(const T *mem, Flags f, LoadTag<__m128, float>)
{
return _mm_cvtepi32_ps(load<__m128i, int>(mem, f));
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<amount == 0, T> shifted(T k)
{
return k;
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount > 0), T> shifted(T k)
{
return _mm_srli_si128(k, amount);
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 16 && amount < 0), T> shifted(T k)
{
return _mm_slli_si128(k, -amount);
}
template <typename T, int Size> Vc_INTRINSIC Vc_CONST const T *IndexesFromZero()
{
if (Size == 4) {
return reinterpret_cast<const T *>(SSE::_IndexesFromZero4);
} else if (Size == 8) {
return reinterpret_cast<const T *>(SSE::_IndexesFromZero8);
} else if (Size == 16) {
return reinterpret_cast<const T *>(SSE::_IndexesFromZero16);
}
return 0;
}
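// Population-count helpers for mask bit-fields of 4, 8, 16 and 32 bits.  The
// non-POPCNT fallbacks use the usual SWAR reduction: adjacent bit groups are summed
// pairwise under a mask, doubling the group width in each step.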
Vc_INTRINSIC Vc_CONST unsigned int popcnt4(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
return _mm_popcnt_u32(n);
#else
n = (n & 0x5U) + ((n >> 1) & 0x5U);
n = (n & 0x3U) + ((n >> 2) & 0x3U);
return n;
#endif
}
Vc_INTRINSIC Vc_CONST unsigned int popcnt8(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
return _mm_popcnt_u32(n);
#else
n = (n & 0x55U) + ((n >> 1) & 0x55U);
n = (n & 0x33U) + ((n >> 2) & 0x33U);
n = (n & 0x0fU) + ((n >> 4) & 0x0fU);
return n;
#endif
}
Vc_INTRINSIC Vc_CONST unsigned int popcnt16(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
return _mm_popcnt_u32(n);
#else
n = (n & 0x5555U) + ((n >> 1) & 0x5555U);
n = (n & 0x3333U) + ((n >> 2) & 0x3333U);
n = (n & 0x0f0fU) + ((n >> 4) & 0x0f0fU);
n = (n & 0x00ffU) + ((n >> 8) & 0x00ffU);
return n;
#endif
}
Vc_INTRINSIC Vc_CONST unsigned int popcnt32(unsigned int n)
{
#ifdef Vc_IMPL_POPCNT
return _mm_popcnt_u32(n);
#else
n = (n & 0x55555555U) + ((n >> 1) & 0x55555555U);
n = (n & 0x33333333U) + ((n >> 2) & 0x33333333U);
n = (n & 0x0f0f0f0fU) + ((n >> 4) & 0x0f0f0f0fU);
n = (n & 0x00ff00ffU) + ((n >> 8) & 0x00ff00ffU);
n = (n & 0x0000ffffU) + ((n >>16) & 0x0000ffffU);
return n;
#endif
}
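// mask_cast<From, To, R> reinterprets a mask of From entries as a mask of To
// entries: increasing the entry count narrows each lane via saturating packs,
// decreasing it widens each lane by duplicating it with unpacklo.  The generic
// template handles the trivial From == To case.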
template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m128i k)
{
static_assert(From == To, "Incorrect mask cast.");
static_assert(std::is_same<R, __m128>::value, "Incorrect mask cast.");
return SSE::sse_cast<__m128>(k);
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 4, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<2, 8, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(
_mm_packs_epi16(_mm_packs_epi16(k, _mm_setzero_si128()), _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 2, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(k, k));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 2, __m128>(__m128i k)
{
const auto tmp = _mm_unpacklo_epi16(k, k);
return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 8, __m128>(__m128i k)
{
return SSE::sse_cast<__m128>(_mm_unpacklo_epi8(k, k));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 4, __m128>(__m128i k)
{
const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 8, __m128>(k));
return SSE::sse_cast<__m128>(_mm_unpacklo_epi16(tmp, tmp));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<16, 2, __m128>(__m128i k)
{
const auto tmp = SSE::sse_cast<__m128i>(mask_cast<16, 4, __m128>(k));
return SSE::sse_cast<__m128>(_mm_unpacklo_epi32(tmp, tmp));
}
template <typename V> Vc_INTRINSIC_L Vc_CONST_L V allone() Vc_INTRINSIC_R Vc_CONST_R;
template<> Vc_INTRINSIC Vc_CONST __m128 allone<__m128 >() { return SSE::_mm_setallone_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m128i allone<__m128i>() { return SSE::_mm_setallone_si128(); }
template<> Vc_INTRINSIC Vc_CONST __m128d allone<__m128d>() { return SSE::_mm_setallone_pd(); }
template <typename V> inline V zero();
template<> Vc_INTRINSIC Vc_CONST __m128 zero<__m128 >() { return _mm_setzero_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m128i zero<__m128i>() { return _mm_setzero_si128(); }
template<> Vc_INTRINSIC Vc_CONST __m128d zero<__m128d>() { return _mm_setzero_pd(); }
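// negate() flips the sign of all lanes: the floating-point overloads xor the
// sign-bit mask, the integer overloads use _mm_sign_* where SSSE3 is available and
// a subtraction from zero otherwise.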
Vc_ALWAYS_INLINE Vc_CONST __m128 negate(__m128 v, std::integral_constant<std::size_t, 4>)
{
return _mm_xor_ps(v, SSE::_mm_setsignmask_ps());
}
Vc_ALWAYS_INLINE Vc_CONST __m128d negate(__m128d v, std::integral_constant<std::size_t, 8>)
{
return _mm_xor_pd(v, SSE::_mm_setsignmask_pd());
}
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 4>)
{
#ifdef Vc_IMPL_SSSE3
return _mm_sign_epi32(v, allone<__m128i>());
#else
return _mm_sub_epi32(_mm_setzero_si128(), v);
#endif
}
Vc_ALWAYS_INLINE Vc_CONST __m128i negate(__m128i v, std::integral_constant<std::size_t, 2>)
{
#ifdef Vc_IMPL_SSSE3
return _mm_sign_epi16(v, allone<__m128i>());
#else
return _mm_sub_epi16(_mm_setzero_si128(), v);
#endif
}
Vc_INTRINSIC __m128 xor_(__m128 a, __m128 b) { return _mm_xor_ps(a, b); }
Vc_INTRINSIC __m128d xor_(__m128d a, __m128d b) { return _mm_xor_pd(a, b); }
Vc_INTRINSIC __m128i xor_(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
Vc_INTRINSIC __m128 or_(__m128 a, __m128 b) { return _mm_or_ps(a, b); }
Vc_INTRINSIC __m128d or_(__m128d a, __m128d b) { return _mm_or_pd(a, b); }
Vc_INTRINSIC __m128i or_(__m128i a, __m128i b) { return _mm_or_si128(a, b); }
Vc_INTRINSIC __m128 and_(__m128 a, __m128 b) { return _mm_and_ps(a, b); }
Vc_INTRINSIC __m128d and_(__m128d a, __m128d b) { return _mm_and_pd(a, b); }
Vc_INTRINSIC __m128i and_(__m128i a, __m128i b) { return _mm_and_si128(a, b); }
Vc_INTRINSIC __m128 andnot_(__m128 a, __m128 b) { return _mm_andnot_ps(a, b); }
Vc_INTRINSIC __m128d andnot_(__m128d a, __m128d b) { return _mm_andnot_pd(a, b); }
Vc_INTRINSIC __m128i andnot_(__m128i a, __m128i b) { return _mm_andnot_si128(a, b); }
Vc_INTRINSIC __m128 not_(__m128 a) { return andnot_(a, allone<__m128 >()); }
Vc_INTRINSIC __m128d not_(__m128d a) { return andnot_(a, allone<__m128d>()); }
Vc_INTRINSIC __m128i not_(__m128i a) { return andnot_(a, allone<__m128i>()); }
Vc_INTRINSIC __m128 add(__m128 a, __m128 b, float) { return _mm_add_ps(a, b); }
Vc_INTRINSIC __m128d add(__m128d a, __m128d b, double) { return _mm_add_pd(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, int) { return _mm_add_epi32(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uint) { return _mm_add_epi32(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, short) { return _mm_add_epi16(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, ushort) { return _mm_add_epi16(a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, schar) { return _mm_add_epi8 (a, b); }
Vc_INTRINSIC __m128i add(__m128i a, __m128i b, uchar) { return _mm_add_epi8 (a, b); }
Vc_INTRINSIC __m128 sub(__m128 a, __m128 b, float) { return _mm_sub_ps(a, b); }
Vc_INTRINSIC __m128d sub(__m128d a, __m128d b, double) { return _mm_sub_pd(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, int) { return _mm_sub_epi32(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uint) { return _mm_sub_epi32(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, short) { return _mm_sub_epi16(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, ushort) { return _mm_sub_epi16(a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, schar) { return _mm_sub_epi8 (a, b); }
Vc_INTRINSIC __m128i sub(__m128i a, __m128i b, uchar) { return _mm_sub_epi8 (a, b); }
Vc_INTRINSIC __m128 mul(__m128 a, __m128 b, float) { return _mm_mul_ps(a, b); }
Vc_INTRINSIC __m128d mul(__m128d a, __m128d b, double) { return _mm_mul_pd(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, int) {
#ifdef Vc_IMPL_SSE4_1
return _mm_mullo_epi32(a, b);
#else
const __m128i aShift = _mm_srli_si128(a, 4);
const __m128i ab02 = _mm_mul_epu32(a, b);
const __m128i bShift = _mm_srli_si128(b, 4);
const __m128i ab13 = _mm_mul_epu32(aShift, bShift);
return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
#endif
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uint) { return mul(a, b, int()); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, short) { return _mm_mullo_epi16(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, ushort) { return _mm_mullo_epi16(a, b); }
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, schar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
using B = Common::BuiltinType<schar, 16>;
const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
return reinterpret_cast<const __m128i &>(x);
#else
return or_(
and_(_mm_mullo_epi16(a, b), _mm_slli_epi16(allone<__m128i>(), 8)),
_mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
#endif
}
Vc_INTRINSIC __m128i mul(__m128i a, __m128i b, uchar) {
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
using B = Common::BuiltinType<uchar, 16>;
const auto x = aliasing_cast<B>(a) * aliasing_cast<B>(b);
return reinterpret_cast<const __m128i &>(x);
#else
return or_(
and_(_mm_mullo_epi16(a, b), _mm_slli_epi16(allone<__m128i>(), 8)),
_mm_slli_epi16(_mm_mullo_epi16(_mm_srli_si128(a, 1), _mm_srli_si128(b, 1)), 8));
#endif
}
Vc_INTRINSIC __m128 div(__m128 a, __m128 b, float) { return _mm_div_ps(a, b); }
Vc_INTRINSIC __m128d div(__m128d a, __m128d b, double) { return _mm_div_pd(a, b); }
Vc_INTRINSIC __m128 min(__m128 a, __m128 b, float) { return _mm_min_ps(a, b); }
Vc_INTRINSIC __m128d min(__m128d a, __m128d b, double) { return _mm_min_pd(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, int) { return SSE::min_epi32(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uint) { return SSE::min_epu32(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, short) { return _mm_min_epi16(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, ushort) { return SSE::min_epu16(a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, schar) { return SSE::min_epi8 (a, b); }
Vc_INTRINSIC __m128i min(__m128i a, __m128i b, uchar) { return _mm_min_epu8 (a, b); }
Vc_INTRINSIC __m128 max(__m128 a, __m128 b, float) { return _mm_max_ps(a, b); }
Vc_INTRINSIC __m128d max(__m128d a, __m128d b, double) { return _mm_max_pd(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, int) { return SSE::max_epi32(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uint) { return SSE::max_epu32(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, short) { return _mm_max_epi16(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, ushort) { return SSE::max_epu16(a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, schar) { return SSE::max_epi8 (a, b); }
Vc_INTRINSIC __m128i max(__m128i a, __m128i b, uchar) { return _mm_max_epu8 (a, b); }
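// Horizontal reductions: each step combines the vector with a shuffled/shifted copy
// of itself, halving the number of candidate lanes, until the result sits in the
// lowest lane and can be moved to a scalar register.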
Vc_INTRINSIC float add(__m128 a, float) {
a = _mm_add_ps(a, _mm_movehl_ps(a, a));
a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double add(__m128d a, double) {
a = _mm_add_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int add(__m128i a, int) {
a = add(a, _mm_srli_si128(a, 8), int());
a = add(a, _mm_srli_si128(a, 4), int());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint add(__m128i a, uint) { return add(a, int()); }
Vc_INTRINSIC short add(__m128i a, short) {
a = add(a, _mm_srli_si128(a, 8), short());
a = add(a, _mm_srli_si128(a, 4), short());
a = add(a, _mm_srli_si128(a, 2), short());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC ushort add(__m128i a, ushort) { return add(a, short()); }
Vc_INTRINSIC schar add(__m128i a, schar) {
a = add(a, _mm_srli_si128(a, 8), schar());
a = add(a, _mm_srli_si128(a, 4), schar());
a = add(a, _mm_srli_si128(a, 2), schar());
a = add(a, _mm_srli_si128(a, 1), schar());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uchar add(__m128i a, uchar) { return add(a, schar()); }
Vc_INTRINSIC float mul(__m128 a, float) {
a = _mm_mul_ps(a, _mm_movehl_ps(a, a));
a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double mul(__m128d a, double) {
a = _mm_mul_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int mul(__m128i a, int) {
a = mul(a, _mm_srli_si128(a, 8), int());
a = mul(a, _mm_srli_si128(a, 4), int());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint mul(__m128i a, uint) { return mul(a, int()); }
Vc_INTRINSIC short mul(__m128i a, short) {
a = mul(a, _mm_srli_si128(a, 8), short());
a = mul(a, _mm_srli_si128(a, 4), short());
a = mul(a, _mm_srli_si128(a, 2), short());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC ushort mul(__m128i a, ushort) { return mul(a, short()); }
Vc_INTRINSIC schar mul(__m128i a, schar) {
const __m128i s0 = _mm_srai_epi16(a, 1);
const __m128i s1 = Detail::and_(a, _mm_set1_epi32(0x0f0f0f0f));
return mul(mul(s0, s1, short()), short());
}
Vc_INTRINSIC uchar mul(__m128i a, uchar) { return mul(a, schar()); }
Vc_INTRINSIC float min(__m128 a, float) {
a = _mm_min_ps(a, _mm_movehl_ps(a, a));
a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double min(__m128d a, double) {
a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int min(__m128i a, int) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint min(__m128i a, uint) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC short min(__m128i a, short) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC ushort min(__m128i a, ushort) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC schar min(__m128i a, schar) {
a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
return std::min(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
}
Vc_INTRINSIC uchar min(__m128i a, uchar) {
    // use the unsigned byte minimum for the pairwise steps (the uchar tag selects
    // _mm_min_epu8); a signed compare would give wrong results for values > 127
    a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), uchar());
    return std::min((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
}
Vc_INTRINSIC float max(__m128 a, float) {
a = _mm_max_ps(a, _mm_movehl_ps(a, a));
a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(a);
}
Vc_INTRINSIC double max(__m128d a, double) {
a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
return _mm_cvtsd_f64(a);
}
Vc_INTRINSIC int max(__m128i a, int) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), int());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC uint max(__m128i a, uint) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uint());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC short max(__m128i a, short) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), short());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), short());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC ushort max(__m128i a, ushort) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), ushort());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), ushort());
return _mm_cvtsi128_si32(a);
}
Vc_INTRINSIC schar max(__m128i a, schar) {
a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), schar());
a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), schar());
return std::max(schar(_mm_cvtsi128_si32(a) >> 8), schar(_mm_cvtsi128_si32(a)));
}
Vc_INTRINSIC uchar max(__m128i a, uchar) {
    // use the unsigned byte maximum for the pairwise steps (the uchar tag selects
    // _mm_max_epu8); a signed compare would give wrong results for values > 127
    a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)), uchar());
    a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)), uchar());
    return std::max((_mm_cvtsi128_si32(a) >> 8) & 0xff, _mm_cvtsi128_si32(a) & 0xff);
}
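// sorted() dispatches at compile time to the sorting routine built for the closest
// matching instruction-set level (SSE2 or SSE4.1 family), selected from
// CurrentImplementation.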
template <Vc::Implementation, typename T>
Vc_CONST_L SSE::Vector<T> Vc_VDECL sorted(SSE::Vector<T> x) Vc_CONST_R;
template <typename T> Vc_INTRINSIC Vc_CONST SSE::Vector<T> sorted(SSE::Vector<T> x)
{
static_assert(!CurrentImplementation::is(ScalarImpl),
"Detail::sorted can only be instantiated if a non-Scalar "
"implementation is selected.");
    return sorted<CurrentImplementation::is_between(SSE2Impl, SSSE3Impl)
                      ? SSE2Impl
                      : CurrentImplementation::is_between(SSE41Impl, SSE42Impl)
                            ? SSE41Impl
                            : CurrentImplementation::current()>(x);
}
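// rotated() rotates the N elements of a 16-byte vector by `amount` positions using
// _mm_alignr_epi8 on the vector concatenated with itself; sanitize() keeps the
// byte offset a valid immediate for the intrinsic.  Note that _mm_alignr_epi8
// requires SSSE3.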
template <typename V> constexpr int sanitize(int n)
{
return (n >= int(sizeof(V)) || n <= -int(sizeof(V))) ? 0 : n;
}
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> rotated(V v, int amount)
{
using namespace SSE;
switch (static_cast<unsigned int>(amount) % N) {
case 0:
return v;
case 1:
return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(1 * sizeof(T))));
case 2:
return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(2 * sizeof(T))));
case 3:
return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(3 * sizeof(T))));
case 4:
return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(4 * sizeof(T))));
case 5:
return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(5 * sizeof(T))));
case 6:
return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(6 * sizeof(T))));
case 7:
return sse_cast<V>(_mm_alignr_epi8(v, v, sanitize<V>(7 * sizeof(T))));
}
return sse_cast<V>(_mm_setzero_si128());
}
template<typename V, size_t Size, size_t VSize> struct InterleaveImpl;
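// InterleaveImpl<V, 8, 16>: (de)interleaving for vectors with eight 16-bit entries.
// interleave() scatters 2..8 member vectors into an array-of-structures layout at
// the given indexes; the SuccessiveEntries<N> overloads handle the contiguous case
// with plain unaligned vector stores.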
template<typename V> struct InterleaveImpl<V, 8, 16> {
template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
#ifdef __x86_64__
const long long tmp00 = _mm_cvtsi128_si64(tmp0);
const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0));
const long long tmp10 = _mm_cvtsi128_si64(tmp1);
const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1));
aliasing_cast<int>(data[i[0]]) = tmp00;
aliasing_cast<int>(data[i[1]]) = tmp00 >> 32;
aliasing_cast<int>(data[i[2]]) = tmp01;
aliasing_cast<int>(data[i[3]]) = tmp01 >> 32;
aliasing_cast<int>(data[i[4]]) = tmp10;
aliasing_cast<int>(data[i[5]]) = tmp10 >> 32;
aliasing_cast<int>(data[i[6]]) = tmp11;
aliasing_cast<int>(data[i[7]]) = tmp11 >> 32;
#elif defined(Vc_IMPL_SSE4_1)
using namespace SseIntrinsics;
aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
aliasing_cast<int>(data[i[1]]) = extract_epi32<1>(tmp0);
aliasing_cast<int>(data[i[2]]) = extract_epi32<2>(tmp0);
aliasing_cast<int>(data[i[3]]) = extract_epi32<3>(tmp0);
aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
aliasing_cast<int>(data[i[5]]) = extract_epi32<1>(tmp1);
aliasing_cast<int>(data[i[6]]) = extract_epi32<2>(tmp1);
aliasing_cast<int>(data[i[7]]) = extract_epi32<3>(tmp1);
#else
aliasing_cast<int>(data[i[0]]) = _mm_cvtsi128_si32(tmp0);
aliasing_cast<int>(data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4));
aliasing_cast<int>(data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8));
aliasing_cast<int>(data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12));
aliasing_cast<int>(data[i[4]]) = _mm_cvtsi128_si32(tmp1);
aliasing_cast<int>(data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4));
aliasing_cast<int>(data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8));
aliasing_cast<int>(data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12));
#endif
}
static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
V(tmp0).store(&data[i[0]], Vc::Unaligned);
V(tmp1).store(&data[i[4]], Vc::Unaligned);
}
template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
{
#if defined Vc_USE_MASKMOV_SCATTER && !defined Vc_MSVC
const __m64 mask = _mm_set_pi16(0, -1, -1, -1);
const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data());
const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data());
const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
_mm_maskmove_si64(_mm_movepi64_pi64(tmp4), mask, reinterpret_cast<char *>(&data[i[0]]));
_mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp4, 8)), mask, reinterpret_cast<char *>(&data[i[1]]));
_mm_maskmove_si64(_mm_movepi64_pi64(tmp5), mask, reinterpret_cast<char *>(&data[i[2]]));
_mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp5, 8)), mask, reinterpret_cast<char *>(&data[i[3]]));
_mm_maskmove_si64(_mm_movepi64_pi64(tmp6), mask, reinterpret_cast<char *>(&data[i[4]]));
_mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp6, 8)), mask, reinterpret_cast<char *>(&data[i[5]]));
_mm_maskmove_si64(_mm_movepi64_pi64(tmp7), mask, reinterpret_cast<char *>(&data[i[6]]));
_mm_maskmove_si64(_mm_movepi64_pi64(_mm_srli_si128(tmp7, 8)), mask, reinterpret_cast<char *>(&data[i[7]]));
_mm_empty();
#else
interleave(data, i, v0, v1);
v2.scatter(data + 2, i);
#endif
}
template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());
const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
_mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4);
_mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5);
_mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6);
_mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7);
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4));
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5));
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6));
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7));
}
static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());
const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
V(tmp4).store(&data[i[0]], ::Vc::Unaligned);
V(tmp5).store(&data[i[2]], ::Vc::Unaligned);
V(tmp6).store(&data[i[4]], ::Vc::Unaligned);
V(tmp7).store(&data[i[6]], ::Vc::Unaligned);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4)
{
interleave(data, i, v0, v1, v2, v3);
v4.scatter(data + 4, i);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6, const typename V::AsArg v7)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6, v7);
}
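    // The deinterleave() overloads below perform the inverse operation: they load
    // the interleaved structure members at the given indexes and transpose them
    // back into separate vectors via unpacklo/unpackhi stages.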
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1)
{
const __m128i a = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[0]]));
const __m128i b = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[1]]));
const __m128i c = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[2]]));
const __m128i d = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[3]]));
const __m128i e = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[4]]));
const __m128i f = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[5]]));
const __m128i g = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[6]]));
const __m128i h = _mm_cvtsi32_si128(*aliasing_cast<int>(&data[i[7]]));
const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2)
{
const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));
const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3)
{
const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[0]]));
const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[1]]));
const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[2]]));
const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[3]]));
const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[4]]));
const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[5]]));
const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[6]]));
const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&data[i[7]]));
const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
{
const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
{
const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
{
const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11);
const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13);
v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
{
const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]]));
const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]]));
const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]]));
const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]]));
const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]]));
const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]]));
const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]]));
const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]]));
const __m128i tmp2 = _mm_unpacklo_epi16(a, e);
const __m128i tmp4 = _mm_unpacklo_epi16(b, f);
const __m128i tmp3 = _mm_unpacklo_epi16(c, g);
const __m128i tmp5 = _mm_unpacklo_epi16(d, h);
const __m128i tmp10 = _mm_unpackhi_epi16(a, e);
const __m128i tmp11 = _mm_unpackhi_epi16(c, g);
const __m128i tmp12 = _mm_unpackhi_epi16(b, f);
const __m128i tmp13 = _mm_unpackhi_epi16(d, h);
const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3);
const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5);
const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3);
const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5);
const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11);
const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13);
const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11);
const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13);
v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
v7.data() = _mm_unpackhi_epi16(tmp14, tmp15);
}
};
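// Specialization for vectors with four 4-byte entries per 16-byte register
// (float_v, int_v, uint_v). (De)interleaving is built from _mm_unpack{lo,hi}_ps
// and _mm_move{lh,hl}_ps; the overloads for five or more streams reuse the
// four-stream variant and handle the remainder via scatter/gather.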
template<typename V> struct InterleaveImpl<V, 4, 16> {
static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
_mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), tmp0);
_mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), tmp1);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
_mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
_mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2)
{
#ifdef Vc_USE_MASKMOV_SCATTER
const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
_mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
_mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
_mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
_mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
#else
const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
_mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
_mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
v2.scatter(data + 2, i);
#endif
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
_mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2));
_mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0));
_mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3));
_mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1));
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4)
{
interleave(data, i, v0, v1, v2, v3);
v4.scatter(data + 4, i);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6, const typename V::AsArg v7)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6, v7);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1)
{
const __m128 a = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[0]])));
const __m128 b = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[1]])));
const __m128 c = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[2]])));
const __m128 d = _mm_castpd_ps(_mm_load_sd(aliasing_cast<double>(&data[i[3]])));
const __m128 tmp0 = _mm_unpacklo_ps(a, b);
const __m128 tmp1 = _mm_unpacklo_ps(c, d);
v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2)
{
const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
const __m128 tmp0 = _mm_unpacklo_ps(a, b);
const __m128 tmp1 = _mm_unpacklo_ps(c, d);
const __m128 tmp2 = _mm_unpackhi_ps(a, b);
const __m128 tmp3 = _mm_unpackhi_ps(c, d);
v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3)
{
const __m128 a = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
const __m128 b = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
const __m128 c = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
const __m128 d = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
const __m128 tmp0 = _mm_unpacklo_ps(a, b);
const __m128 tmp1 = _mm_unpacklo_ps(c, d);
const __m128 tmp2 = _mm_unpackhi_ps(a, b);
const __m128 tmp3 = _mm_unpackhi_ps(c, d);
v0.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp0, tmp1));
v1.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp1, tmp0));
v2.data() = SSE::sse_cast<typename V::VectorType>(_mm_movelh_ps(tmp2, tmp3));
v3.data() = SSE::sse_cast<typename V::VectorType>(_mm_movehl_ps(tmp3, tmp2));
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
{
deinterleave(data, i, v0, v1, v2, v3);
v4.gather(data + 4, i);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
{
deinterleave(data, i, v0, v1, v2, v3);
deinterleave(data + 4, i, v4, v5);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
{
deinterleave(data, i, v0, v1, v2, v3);
deinterleave(data + 4, i, v4, v5, v6);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
{
deinterleave(data, i, v0, v1, v2, v3);
deinterleave(data + 4, i, v4, v5, v6, v7);
}
};
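// Specialization for vectors with two 8-byte entries per 16-byte register
// (double_v). Pairs of streams are handled with _mm_unpack{lo,hi}_pd; odd
// stream counts fall back to scatter/gather for the trailing vector.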
template<typename V> struct InterleaveImpl<V, 2, 16> {
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data());
const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data());
_mm_storeu_pd(&data[i[0]], tmp0);
_mm_storeu_pd(&data[i[1]], tmp1);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2)
{
interleave(data, i, v0, v1);
v2.scatter(data + 2, i);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
interleave(data, i, v0, v1);
interleave(data + 2, i, v2, v3);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4)
{
interleave(data, i, v0, v1, v2, v3);
v4.scatter(data + 4, i);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6, const typename V::AsArg v7)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6, v7);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1)
{
const __m128d a = _mm_loadu_pd(&data[i[0]]);
const __m128d b = _mm_loadu_pd(&data[i[1]]);
v0.data() = _mm_unpacklo_pd(a, b);
v1.data() = _mm_unpackhi_pd(a, b);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2)
{
v2.gather(data + 2, i);
deinterleave(data, i, v0, v1);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3)
{
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
{
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
v4.gather(data + 4, i);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
{
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
deinterleave(data + 4, i, v4, v5);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
{
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
deinterleave(data + 4, i, v4, v5);
v6.gather(data + 6, i);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
{
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
deinterleave(data + 4, i, v4, v5);
deinterleave(data + 6, i, v6, v7);
}
};
}
}
#endif
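// SSE mask type: one boolean per vector lane, stored as all-ones/all-zero lanes
// in a 128-bit register. The Detail::mask_* templates declared below are
// specialized per lane count further down.
//
// Usage sketch (illustrative only, not part of this header):
//   Vc::SSE::float_v v = ...;
//   Vc::SSE::float_m m = v > Vc::SSE::float_v::Zero();
//   if (m.isFull()) { /* all lanes are true */ }
//   int set = m.count();   // number of true lanes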
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L int mask_count(__m128i) Vc_INTRINSIC_R Vc_CONST_R;
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m128i) Vc_INTRINSIC_R Vc_CONST_R;
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L bool is_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R;
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L bool is_not_equal(__m128, __m128) Vc_INTRINSIC_R Vc_CONST_R;
}
using SSE::sse_cast;
template <typename T> class Mask<T, VectorAbi::Sse>
{
using abi = VectorAbi::Sse;
friend class Mask< double, abi>;
friend class Mask< float, abi>;
friend class Mask< int32_t, abi>;
friend class Mask<uint32_t, abi>;
friend class Mask< int16_t, abi>;
friend class Mask<uint16_t, abi>;
typedef Common::MaskBool<sizeof(T)> MaskBool;
typedef Common::Storage<T, SSE::VectorTraits<T>::Size> Storage;
public:
typedef bool EntryType;
using value_type = EntryType;
using EntryReference = Detail::ElementReference<Mask>;
using reference = EntryReference;
typedef MaskBool VectorEntryType;
using VectorType = typename Storage::VectorType;
using Vector = SSE::Vector<T>;
public:
Vc_FREE_STORE_OPERATORS_ALIGNED(16);
static constexpr size_t Size = SSE::VectorTraits<T>::Size;
static constexpr size_t MemoryAlignment = Size;
static constexpr std::size_t size() { return Size; }
#if defined Vc_MSVC && defined _WIN32
typedef const Mask &Argument;
#else
typedef Mask Argument;
#endif
Vc_INTRINSIC Mask() = default;
Vc_INTRINSIC Mask(const Mask &) = default;
Vc_INTRINSIC Mask &operator=(const Mask &) = default;
Vc_INTRINSIC Mask(const __m128 &x) : d(sse_cast<VectorType>(x)) {}
Vc_INTRINSIC Mask(const __m128d &x) : d(sse_cast<VectorType>(x)) {}
Vc_INTRINSIC Mask(const __m128i &x) : d(sse_cast<VectorType>(x)) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : Mask(_mm_setzero_ps()) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : Mask(SSE::_mm_setallone_ps()) {}
Vc_INTRINSIC explicit Mask(bool b) : Mask(b ? SSE::_mm_setallone_ps() : _mm_setzero_ps()) {}
Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }
template <typename U>
Vc_INTRINSIC Mask(
U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
: d(sse_cast<VectorType>(
Detail::mask_cast<Traits::simd_vector_size<U>::value, Size, __m128>(
rhs.dataI())))
{
}
#if Vc_IS_VERSION_1
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"mask types") Vc_INTRINSIC
explicit Mask(U &&rhs,
Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
#endif
Vc_ALWAYS_INLINE explicit Mask(const bool *mem) { load(mem); }
template<typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags f) { load(mem, f); }
Vc_ALWAYS_INLINE_L void load(const bool *mem) Vc_ALWAYS_INLINE_R;
template<typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags) { load(mem); }
Vc_ALWAYS_INLINE_L void store(bool *) const Vc_ALWAYS_INLINE_R;
template<typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags) const { store(mem); }
Vc_ALWAYS_INLINE Vc_PURE bool operator==(const Mask &rhs) const
{
return Detail::is_equal<Size>(dataF(), rhs.dataF());
}
Vc_ALWAYS_INLINE Vc_PURE bool operator!=(const Mask &rhs) const
{
return Detail::is_not_equal<Size>(dataF(), rhs.dataF());
}
Vc_ALWAYS_INLINE Vc_PURE Mask operator!() const
{
#ifdef Vc_GCC
return ~dataI();
#else
return _mm_andnot_si128(dataI(), SSE::_mm_setallone_si128());
#endif
}
Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_and_ps(dataF(), rhs.dataF())); return *this; }
Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_or_ps (dataF(), rhs.dataF())); return *this; }
Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { d.v() = SSE::sse_cast<VectorType>(_mm_xor_ps(dataF(), rhs.dataF())); return *this; }
Vc_ALWAYS_INLINE Vc_PURE Mask operator&(const Mask &rhs) const { return _mm_and_ps(dataF(), rhs.dataF()); }
Vc_ALWAYS_INLINE Vc_PURE Mask operator|(const Mask &rhs) const { return _mm_or_ps (dataF(), rhs.dataF()); }
Vc_ALWAYS_INLINE Vc_PURE Mask operator^(const Mask &rhs) const { return _mm_xor_ps(dataF(), rhs.dataF()); }
Vc_ALWAYS_INLINE Vc_PURE Mask operator&&(const Mask &rhs) const { return _mm_and_ps(dataF(), rhs.dataF()); }
Vc_ALWAYS_INLINE Vc_PURE Mask operator||(const Mask &rhs) const { return _mm_or_ps (dataF(), rhs.dataF()); }
Vc_ALWAYS_INLINE Vc_PURE bool isFull () const { return
#ifdef Vc_USE_PTEST
_mm_testc_si128(dataI(), SSE::_mm_setallone_si128());
#else
_mm_movemask_epi8(dataI()) == 0xffff;
#endif
}
Vc_ALWAYS_INLINE Vc_PURE bool isNotEmpty() const { return
#ifdef Vc_USE_PTEST
0 == _mm_testz_si128(dataI(), dataI());
#else
_mm_movemask_epi8(dataI()) != 0x0000;
#endif
}
Vc_ALWAYS_INLINE Vc_PURE bool isEmpty() const { return
#ifdef Vc_USE_PTEST
0 != _mm_testz_si128(dataI(), dataI());
#else
_mm_movemask_epi8(dataI()) == 0x0000;
#endif
}
Vc_ALWAYS_INLINE Vc_PURE bool isMix() const {
#ifdef Vc_USE_PTEST
return _mm_test_mix_ones_zeros(dataI(), SSE::_mm_setallone_si128());
#else
const int tmp = _mm_movemask_epi8(dataI());
return tmp != 0 && (tmp ^ 0xffff) != 0;
#endif
}
Vc_ALWAYS_INLINE Vc_PURE int shiftMask() const { return _mm_movemask_epi8(dataI()); }
Vc_ALWAYS_INLINE Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }
Vc_ALWAYS_INLINE Vc_PURE VectorType data() const { return d.v(); }
Vc_ALWAYS_INLINE Vc_PURE __m128 dataF() const { return SSE::sse_cast<__m128 >(d.v()); }
Vc_ALWAYS_INLINE Vc_PURE __m128i dataI() const { return SSE::sse_cast<__m128i>(d.v()); }
Vc_ALWAYS_INLINE Vc_PURE __m128d dataD() const { return SSE::sse_cast<__m128d>(d.v()); }
private:
friend reference;
static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
{
return m.toInt() & (1 << i);
}
template <typename U>
static Vc_INTRINSIC void set(Mask &m, int i,
U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
{
m.d.set(i, MaskBool(std::forward<U>(v)));
}
public:
Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
{
return {*this, int(index)};
}
Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
{
return get(*this, index);
}
Vc_ALWAYS_INLINE Vc_PURE int count() const
{
return Detail::mask_count<Size>(dataI());
}
Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;
private:
#ifdef Vc_COMPILE_BENCHMARKS
public:
#endif
Storage d;
};
template <typename T> constexpr size_t Mask<T, VectorAbi::Sse>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Sse>::MemoryAlignment;
}
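// Per-lane-count implementations of the helpers declared above: mask population
// count, movemask conversion, bool<->mask load/store, and mask (in)equality.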
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template<> Vc_INTRINSIC Vc_CONST int mask_count<2>(__m128i k)
{
int mask = _mm_movemask_pd(_mm_castsi128_pd(k));
return (mask & 1) + (mask >> 1);
}
template<> Vc_INTRINSIC Vc_CONST int mask_count<4>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
return _mm_popcnt_u32(_mm_movemask_ps(_mm_castsi128_ps(k)));
#else
auto x = _mm_srli_epi32(k, 31);
x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_cvtsi128_si32(x);
#endif
}
template<> Vc_INTRINSIC Vc_CONST int mask_count<8>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
return _mm_popcnt_u32(_mm_movemask_epi8(k)) / 2;
#else
auto x = _mm_srli_epi16(k, 15);
x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)));
x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)));
return _mm_extract_epi16(x, 0);
#endif
}
template<> Vc_INTRINSIC Vc_CONST int mask_count<16>(__m128i k)
{
return Detail::popcnt16(_mm_movemask_epi8(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<2>(__m128i k)
{
return _mm_movemask_pd(_mm_castsi128_pd(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m128i k)
{
return _mm_movemask_ps(_mm_castsi128_ps(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m128i k)
{
return _mm_movemask_epi8(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m128i k)
{
return _mm_movemask_epi8(k);
}
template <size_t> Vc_ALWAYS_INLINE void mask_store(__m128i k, bool *mem);
template <> Vc_ALWAYS_INLINE void mask_store<16>(__m128i k, bool *mem)
{
_mm_store_si128(reinterpret_cast<__m128i *>(mem), _mm_and_si128(k, _mm_set1_epi8(1)));
}
template <> Vc_ALWAYS_INLINE void mask_store<8>(__m128i k, bool *mem)
{
k = _mm_srli_epi16(k, 15);
const auto k2 = _mm_packs_epi16(k, _mm_setzero_si128());
#ifdef __x86_64__
*aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k2);
#else
_mm_store_sd(aliasing_cast<double>(mem), _mm_castsi128_pd(k2));
#endif
}
template <> Vc_ALWAYS_INLINE void mask_store<4>(__m128i k, bool *mem)
{
*aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(
_mm_packs_epi16(_mm_srli_epi16(_mm_packs_epi32(k, _mm_setzero_si128()), 15),
_mm_setzero_si128()));
}
template <> Vc_ALWAYS_INLINE void mask_store<2>(__m128i k, bool *mem)
{
mem[0] = -SseIntrinsics::extract_epi32<1>(k);
mem[1] = -SseIntrinsics::extract_epi32<3>(k);
}
template<size_t> Vc_ALWAYS_INLINE __m128 mask_load(const bool *mem);
template<> Vc_ALWAYS_INLINE __m128 mask_load<16>(const bool *mem)
{
return sse_cast<__m128>(_mm_cmpgt_epi8(
_mm_load_si128(reinterpret_cast<const __m128i *>(mem)), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<8>(const bool *mem)
{
#ifdef __x86_64__
__m128i k = _mm_cvtsi64_si128(*reinterpret_cast<const int64_t *>(mem));
#else
__m128i k = _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(mem)));
#endif
return sse_cast<__m128>(_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<4>(const bool *mem)
{
__m128i k = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem));
k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
return sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<2>(const bool *mem)
{
return sse_cast<__m128>(
_mm_set_epi32(-int(mem[1]), -int(mem[1]), -int(mem[0]), -int(mem[0])));
}
template <> Vc_INTRINSIC Vc_CONST bool is_equal<2>(__m128 k1, __m128 k2)
{
return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<2>(__m128 k1, __m128 k2)
{
return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_equal<4>(__m128 k1, __m128 k2)
{
return _mm_movemask_ps(k1) == _mm_movemask_ps(k2);
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<4>(__m128 k1, __m128 k2)
{
return _mm_movemask_ps(k1) != _mm_movemask_ps(k2);
}
template <> Vc_INTRINSIC Vc_CONST bool is_equal<8>(__m128 k1, __m128 k2)
{
return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
_mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<8>(__m128 k1, __m128 k2)
{
return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
_mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_equal<16>(__m128 k1, __m128 k2)
{
return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
_mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<16>(__m128 k1, __m128 k2)
{
return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
_mm_movemask_epi8(_mm_castps_si128(k2));
}
}
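// Out-of-line Mask member definitions: bool load/store, element access for the
// 16-bit masks, firstOne(), generate(), and shifted().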
template<> Vc_ALWAYS_INLINE void SSE::double_m::store(bool *mem) const
{
*aliasing_cast<uint16_t>(mem) = _mm_movemask_epi8(dataI()) & 0x0101;
}
template<typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::store(bool *mem) const
{
Detail::mask_store<Size>(dataI(), mem);
}
template<> Vc_ALWAYS_INLINE void SSE::double_m::load(const bool *mem)
{
d.set(0, MaskBool(mem[0]));
d.set(1, MaskBool(mem[1]));
}
template <typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::load(const bool *mem)
{
d.v() = sse_cast<VectorType>(Detail::mask_load<Size>(mem));
}
template <>
Vc_INTRINSIC Vc_PURE bool SSE::short_m::get(const SSE::short_m &m, int index) noexcept
{
return m.shiftMask() & (1 << 2 * index);
}
template <>
Vc_INTRINSIC Vc_PURE bool SSE::ushort_m::get(const SSE::ushort_m &m, int index) noexcept
{
return m.shiftMask() & (1 << 2 * index);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE int Mask<T, VectorAbi::Sse>::firstOne() const
{
const int mask = toInt();
#ifdef _MSC_VER
unsigned long bit;
_BitScanForward(&bit, mask);
#else
int bit;
__asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask));
#endif
return bit;
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 2>)
{
return _mm_set_epi64x(gen(1) ? 0xffffffffffffffffull : 0,
gen(0) ? 0xffffffffffffffffull : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4>)
{
return _mm_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8>)
{
return _mm_setr_epi16(gen(0) ? 0xffffu : 0, gen(1) ? 0xffffu : 0,
gen(2) ? 0xffffu : 0, gen(3) ? 0xffffu : 0,
gen(4) ? 0xffffu : 0, gen(5) ? 0xffffu : 0,
gen(6) ? 0xffffu : 0, gen(7) ? 0xffffu : 0);
}
template <typename T>
template <typename G>
Vc_INTRINSIC Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::generate(G &&gen)
{
return generate_impl<Mask<T, VectorAbi::Sse>>(std::forward<G>(gen),
std::integral_constant<int, Size>());
}
template <typename T> Vc_INTRINSIC Vc_PURE Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::shifted(int amount) const
{
switch (amount * int(sizeof(VectorEntryType))) {
case 0: return *this;
case 1: return Detail::shifted< 1>(dataI());
case 2: return Detail::shifted< 2>(dataI());
case 3: return Detail::shifted< 3>(dataI());
case 4: return Detail::shifted< 4>(dataI());
case 5: return Detail::shifted< 5>(dataI());
case 6: return Detail::shifted< 6>(dataI());
case 7: return Detail::shifted< 7>(dataI());
case 8: return Detail::shifted< 8>(dataI());
case 9: return Detail::shifted< 9>(dataI());
case 10: return Detail::shifted< 10>(dataI());
case 11: return Detail::shifted< 11>(dataI());
case 12: return Detail::shifted< 12>(dataI());
case 13: return Detail::shifted< 13>(dataI());
case 14: return Detail::shifted< 14>(dataI());
case 15: return Detail::shifted< 15>(dataI());
case 16: return Detail::shifted< 16>(dataI());
case -1: return Detail::shifted< -1>(dataI());
case -2: return Detail::shifted< -2>(dataI());
case -3: return Detail::shifted< -3>(dataI());
case -4: return Detail::shifted< -4>(dataI());
case -5: return Detail::shifted< -5>(dataI());
case -6: return Detail::shifted< -6>(dataI());
case -7: return Detail::shifted< -7>(dataI());
case -8: return Detail::shifted< -8>(dataI());
case -9: return Detail::shifted< -9>(dataI());
case -10: return Detail::shifted<-10>(dataI());
case -11: return Detail::shifted<-11>(dataI());
case -12: return Detail::shifted<-12>(dataI());
case -13: return Detail::shifted<-13>(dataI());
case -14: return Detail::shifted<-14>(dataI());
case -15: return Detail::shifted<-15>(dataI());
case -16: return Detail::shifted<-16>(dataI());
}
return Zero();
}
}
#endif
#include <algorithm>
#include <cmath>
#ifdef isfinite
#undef isfinite
#endif
#ifdef isnan
#undef isnan
#endif
namespace Vc_VERSIONED_NAMESPACE
{
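// SSE vector class: wraps one __m128/__m128d/__m128i register and provides
// loads/stores, gather/scatter, element access, shifts, reductions
// (min/max/sum/product) and the apply/call helpers. The gather/scatter members
// further below are pulled in from the common interface via the
// Vc_CURRENT_CLASS_NAME macro.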
#define Vc_CURRENT_CLASS_NAME Vector
template <typename T> class Vector<T, VectorAbi::Sse>
{
static_assert(std::is_arithmetic<T>::value,
"Vector<T> only accepts arithmetic builtin types as template parameter T.");
protected:
#ifdef Vc_COMPILE_BENCHMARKS
public:
#endif
typedef typename SSE::VectorTraits<T>::StorageType StorageType;
StorageType d;
typedef typename SSE::VectorTraits<T>::GatherMaskType GatherMask;
typedef SSE::VectorHelper<typename SSE::VectorTraits<T>::VectorType> HV;
typedef SSE::VectorHelper<T> HT;
public:
Vc_FREE_STORE_OPERATORS_ALIGNED(16);
typedef typename SSE::VectorTraits<T>::VectorType VectorType;
using vector_type = VectorType;
static constexpr size_t Size = SSE::VectorTraits<T>::Size;
static constexpr size_t MemoryAlignment = alignof(VectorType);
typedef typename SSE::VectorTraits<T>::EntryType EntryType;
using value_type = EntryType;
using VectorEntryType = EntryType;
using IndexType = fixed_size_simd<int, Size>;
using index_type = IndexType;
typedef typename SSE::VectorTraits<T>::MaskType Mask;
using MaskType = Mask;
using mask_type = Mask;
typedef typename Mask::Argument MaskArg;
typedef typename Mask::Argument MaskArgument;
typedef const Vector AsArg;
using abi = VectorAbi::Sse;
using WriteMaskedVector = Common::WriteMaskedVector<Vector, Mask>;
template <typename U> using V = Vector<U, abi>;
using reference = Detail::ElementReference<Vector>;
public:
Vc_INTRINSIC Vector() = default;
static constexpr std::size_t size() { return Size; }
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
{
return Vector(Vc::IndexesFromZero);
}
template <class G, int = 0,
class = typename std::enable_if<std::is_convertible<
decltype(std::declval<G>()(size_t())), value_type>::value>::type>
explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
{
}
static Vc_INTRINSIC_L Vector Random() Vc_INTRINSIC_R;
Vc_ALWAYS_INLINE Vector(VectorType x) : d(x) {}
template <typename U>
Vc_INTRINSIC Vector(
V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: d(SSE::convert<U, T>(x.data()))
{
}
#if Vc_IS_VERSION_1
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") Vc_INTRINSIC
explicit Vector(
V<U> x,
typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: d(SSE::convert<U, T>(x.data()))
{
}
#endif
Vc_INTRINSIC Vector(EntryType a) : d(HT::set(a)) {}
template <typename U>
Vc_INTRINSIC Vector(U a,
typename std::enable_if<std::is_same<U, int>::value &&
!std::is_same<U, EntryType>::value,
void *>::type = nullptr)
: Vector(static_cast<EntryType>(a))
{
}
Vc_INTRINSIC explicit Vector(reference a) : Vector(static_cast<EntryType>(a)) {}
explicit Vc_INTRINSIC Vector(const EntryType *mem)
{
load(mem);
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
{
load(mem, flags);
}
template <typename U, typename Flags = DefaultLoadTag,
typename = enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
{
load<U, Flags>(x, flags);
}
Vc_INTRINSIC void load(const EntryType *mem)
{
load(mem, DefaultLoadTag());
}
template <typename Flags>
Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
load(const EntryType *mem, Flags flags)
{
load<EntryType, Flags>(mem, flags);
}
private:
template <typename U, typename Flags>
struct load_concept : public std::enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
{};
public:
template <typename U, typename Flags = DefaultLoadTag>
Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
Vc_INTRINSIC void store(EntryType *mem) const
{
store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
{
store<EntryType, Flags>(mem, flags);
}
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
{
store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
{
store<EntryType, Flags>(mem, mask, flags);
}
Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setQnan(const Mask &k) Vc_INTRINSIC_R;
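// Gather/scatter constructor and member interface shared across the vector
// implementations; Vc_CURRENT_CLASS_NAME names the class whose constructors
// are declared here (see the #error check below).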
#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif
private:
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
MaskArgument mask);
public:
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<MT, EntryType>::value, \
"The memory pointer needs to point to a type that can be converted to the " \
"EntryType of this SIMD vector type."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT, typename IT,
typename = enable_if<Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
private:
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes) const;
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
public:
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<EntryType, MT>::value, \
"The memory pointer needs to point to a type that the EntryType of this " \
"SIMD vector type can be converted to."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes));
}
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes), mask);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
{
scatter(args.address, args.indexes);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
{
scatter(args.address, args.indexes, mask);
}
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
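// AVX2-accelerated gather specializations: entries of 4 bytes or more use the
// hardware gather directly; 16-bit entries are gathered as 32-bit values and
// then narrowed (sign-extended or masked with 0xff for 8-bit sources).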
#if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
template <class U, class A, int Scale, int N = Vector<U, A>::size(),
class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<T, Vector<U, A>, Scale> &args)
{
d.v() = SSE::gather<sizeof(T) * Scale>(
args.address, simd_cast<SSE::int_v>(args.indexes).data());
}
template <class U, class A, int Scale, int N = Vector<U, A>::size(),
class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<T, Vector<U, A>, Scale> &args, MaskArgument k)
{
d.v() = SSE::gather<sizeof(T) * Scale>(
d.v(), k.data(), args.address,
simd_cast<SSE::int_v>(args.indexes).data());
}
template <
class MT, class U, class A, int Scale,
class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
{
using AVX2::int_v;
const auto idx = simd_cast<int_v>(args.indexes).data();
*this = simd_cast<Vector>(int_v(
AVX::gather<sizeof(MT) * Scale>(aliasing_cast<int>(args.address), idx)));
if (sizeof(MT) == 1) {
if (std::is_signed<MT>::value) {
d.v() = _mm_srai_epi16(_mm_slli_epi16(d.v(), 8), 8);
} else {
*this &= 0xff;
}
}
}
template <
class MT, class U, class A, int Scale,
class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A>, Scale> &args, MaskArgument k)
{
using AVX2::int_v;
auto v = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
_mm256_setzero_si256(), simd_cast<AVX2::int_m>(k).data(),
aliasing_cast<int>(args.address),
simd_cast<int_v>(args.indexes).data())));
if (sizeof(MT) == 1) {
if (std::is_signed<MT>::value) {
v.data() = _mm_srai_epi16(_mm_slli_epi16(v.data(), 8), 8);
} else {
v &= 0xff;
}
}
assign(v, k);
}
template <class MT, class U, class A, int Scale>
Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
Traits::is_valid_vector_argument<MT>::value &&
!std::is_same<MT, T>::value &&
Vector<U, A>::size() >= size()),
void>
gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
{
*this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
}
template <class MT, class U, class A, int Scale>
Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
Traits::is_valid_vector_argument<MT>::value &&
!std::is_same<MT, T>::value &&
Vector<U, A>::size() >= size()),
void>
gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args,
MaskArgument k)
{
assign(simd_cast<Vector>(fixed_size_simd<MT, Size>(args, k)), k);
}
#endif
Vc_INTRINSIC Vector &operator++() { data() = HT::add(data(), HT::one()); return *this; }
Vc_INTRINSIC Vector &operator--() { data() = HT::sub(data(), HT::one()); return *this; }
Vc_INTRINSIC Vector operator++(int) { const Vector r = *this; data() = HT::add(data(), HT::one()); return r; }
Vc_INTRINSIC Vector operator--(int) { const Vector r = *this; data() = HT::sub(data(), HT::one()); return r; }
private:
friend reference;
Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
{
return o.d.m(i);
}
template <typename U>
Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
noexcept(std::declval<value_type &>() = v))
{
o.d.set(i, v);
}
public:
Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
{
static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
return {*this, int(index)};
}
Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
{
return d.m(index);
}
Vc_INTRINSIC_L Vector Vc_VDECL operator[](const SSE::int_v &perm) const Vc_INTRINSIC_R;
Vc_INTRINSIC Vc_PURE Mask operator!() const
{
return *this == Zero();
}
Vc_INTRINSIC Vc_PURE Vector operator~() const
{
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
static_assert(std::is_integral<T>::value,
"bit-complement can only be used with Vectors of integral type");
#endif
return Detail::andnot_(data(), HV::allone());
}
Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }
Vc_ALWAYS_INLINE Vector Vc_VDECL operator<< (AsArg shift) const { return generate([&](int i) { return get(*this, i) << get(shift, i); }); }
Vc_ALWAYS_INLINE Vector Vc_VDECL operator>> (AsArg shift) const { return generate([&](int i) { return get(*this, i) >> get(shift, i); }); }
Vc_ALWAYS_INLINE Vector &Vc_VDECL operator<<=(AsArg shift) { return *this = *this << shift; }
Vc_ALWAYS_INLINE Vector &Vc_VDECL operator>>=(AsArg shift) { return *this = *this >> shift; }
Vc_INTRINSIC_L Vector &Vc_VDECL operator<<=( int shift) Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector Vc_VDECL operator<< ( int shift) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector &Vc_VDECL operator>>=( int shift) Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector Vc_VDECL operator>> ( int shift) const Vc_INTRINSIC_R;
Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
isNegative() const
{
return Vc::isnegative(*this);
}
Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &mask)
{
data() = HV::blend(data(), v.data(), mask.data());
}
template <typename V2>
Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast")
Vc_ALWAYS_INLINE Vc_PURE V2 staticCast() const
{
return SSE::convert<T, typename V2::EntryType>(data());
}
template <typename V2>
Vc_DEPRECATED("use reinterpret_components_cast instead")
Vc_ALWAYS_INLINE Vc_PURE V2 reinterpretCast() const
{
return SSE::sse_cast<typename V2::VectorType>(data());
}
Vc_INTRINSIC WriteMaskedVector operator()(const Mask &k) { return {*this, k}; }
Vc_ALWAYS_INLINE Vc_PURE VectorType &data() { return d.v(); }
Vc_ALWAYS_INLINE Vc_PURE const VectorType &data() const { return d.v(); }
template<int Index>
Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;
Vc_INTRINSIC EntryType min() const { return HT::min(data()); }
Vc_INTRINSIC EntryType max() const { return HT::max(data()); }
Vc_INTRINSIC EntryType product() const { return HT::mul(data()); }
Vc_INTRINSIC EntryType sum() const { return HT::add(data()); }
Vc_INTRINSIC_L Vector partialSum() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L EntryType min(MaskArg m) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L EntryType max(MaskArg m) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L EntryType product(MaskArg m) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L EntryType sum(MaskArg m) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
template <typename F> void callWithValuesSorted(F &&f)
{
EntryType value = d.m(0);
f(value);
for (std::size_t i = 1; i < Size; ++i) {
if (d.m(i) != value) {
value = d.m(i);
f(value);
}
}
}
template <typename F> Vc_INTRINSIC void call(F &&f) const
{
Common::for_all_vector_entries<Size>([&](size_t i) { f(EntryType(d.m(i))); });
}
template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
{
for(size_t i : where(mask)) {
f(EntryType(d.m(i)));
}
}
template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
{
Vector r;
Common::for_all_vector_entries<Size>(
[&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); });
return r;
}
template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
{
Vector r(*this);
for (size_t i : where(mask)) {
r.d.set(i, f(EntryType(r.d.m(i))));
}
return r;
}
template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f(i)); });
}
Vc_INTRINSIC void fill(EntryType (&f)()) {
Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f()); });
}
template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;
Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
copySign(AsArg x) const
{
return Vc::copysign(*this, x);
}
Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
{
return Vc::exponent(*this);
}
Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
};
#undef Vc_CURRENT_CLASS_NAME
template <typename T> constexpr size_t Vector<T, VectorAbi::Sse>::Size;
template <typename T> constexpr size_t Vector<T, VectorAbi::Sse>::MemoryAlignment;
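// Element-wise min/max for each SSE vector type. The epi32/epu32/epu16 variants
// go through Vc's SSE namespace wrappers because plain SSE2 lacks those
// intrinsics; the remaining types map directly to the native instructions.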
static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v min(const SSE::int_v &x, const SSE::int_v &y) { return SSE::min_epi32(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v min(const SSE::uint_v &x, const SSE::uint_v &y) { return SSE::min_epu32(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v min(const SSE::short_v &x, const SSE::short_v &y) { return _mm_min_epi16(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v min(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::min_epu16(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v min(const SSE::float_v &x, const SSE::float_v &y) { return _mm_min_ps(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v min(const SSE::double_v &x, const SSE::double_v &y) { return _mm_min_pd(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v max(const SSE::int_v &x, const SSE::int_v &y) { return SSE::max_epi32(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v max(const SSE::uint_v &x, const SSE::uint_v &y) { return SSE::max_epu32(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v max(const SSE::short_v &x, const SSE::short_v &y) { return _mm_max_epi16(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v max(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::max_epu16(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v max(const SSE::float_v &x, const SSE::float_v &y) { return _mm_max_ps(x.data(), y.data()); }
static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v max(const SSE::double_v &x, const SSE::double_v &y) { return _mm_max_pd(x.data(), y.data()); }
template <typename T,
typename = enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value ||
std::is_same<T, short>::value ||
std::is_same<T, int>::value>>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> abs(Vector<T, VectorAbi::Sse> x)
{
return SSE::VectorHelper<T>::abs(x.data());
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> sqrt (const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::sqrt(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> rsqrt(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::rsqrt(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> reciprocal(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::reciprocal(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> round(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::round(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isfinite(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::isFinite(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isinf(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::isInfinite(x.data()); }
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::Mask isnan(const Vector<T, VectorAbi::Sse> &x) { return SSE::VectorHelper<T>::isNaN(x.data()); }
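// conditional_assign: maps Operator tags onto write-masked compound assignment
// (lhs(mask) op= rhs) and masked increment/decrement, as used by the where()
// expression machinery.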
#define Vc_CONDITIONAL_ASSIGN(name_,op_) \
template <Operator O, typename T, typename M, typename U> \
Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
Vector<T, VectorAbi::Sse> &lhs, M &&mask, U &&rhs) \
{ \
lhs(mask) op_ rhs; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN( Assign, =);
Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN
#define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
template <Operator O, typename T, typename M> \
Vc_INTRINSIC enable_if<O == Operator::name_, Vector<T, VectorAbi::Sse>> \
conditional_assign(Vector<T, VectorAbi::Sse> &lhs, M &&mask) \
{ \
return expr_; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN
}
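// Software prefetch helpers wrapping _mm_prefetch with the NTA/T0/T1/T2 hints;
// the handle*Prefetches helpers translate the load/store flag types into the
// appropriate prefetch distances.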
#ifndef VC_COMMON_X86_PREFETCHES_H_
#define VC_COMMON_X86_PREFETCHES_H_
#include <xmmintrin.h>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
static constexpr int exclusive_hint = 0;
template <typename ExclusiveOrShared = Vc::Shared>
Vc_INTRINSIC void prefetchForOneRead(const void *addr)
{
if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_NTA);
} else {
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
static_cast<decltype(_MM_HINT_NTA)>(_MM_HINT_NTA | exclusive_hint));
}
}
template <typename ExclusiveOrShared = Vc::Shared>
Vc_INTRINSIC void prefetchClose(const void *addr)
{
if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
} else {
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
static_cast<decltype(_MM_HINT_T0)>(_MM_HINT_T0 | exclusive_hint));
}
}
template <typename ExclusiveOrShared = Vc::Shared>
Vc_INTRINSIC void prefetchMid(const void *addr)
{
if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T1);
} else {
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
static_cast<decltype(_MM_HINT_T1)>(_MM_HINT_T1 | exclusive_hint));
}
}
template <typename ExclusiveOrShared = Vc::Shared>
Vc_INTRINSIC void prefetchFar(const void *addr)
{
if (std::is_same<ExclusiveOrShared, Vc::Shared>::value) {
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T2);
} else {
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)),
static_cast<decltype(_MM_HINT_T2)>(_MM_HINT_T2 | exclusive_hint));
}
}
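// The four helpers above map the Vc prefetch locality levels onto the x86 hint constants:
// prefetchForOneRead -> _MM_HINT_NTA, prefetchClose -> _MM_HINT_T0, prefetchMid -> _MM_HINT_T1,
// prefetchFar -> _MM_HINT_T2.  Since exclusive_hint is 0 here, the Vc::Exclusive branch
// currently issues the same hint as Vc::Shared.  Usage sketch (assuming a float array `data`
// defined elsewhere):
//   prefetchClose(&data[i + 1024]);  // ask for the cache line well ahead of its use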
namespace
{
template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 != 0 && L2 != 0, void *>::type = nullptr)
{
const char *addr = static_cast<const char *>(addr_);
prefetchClose<typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L1);
prefetchMid <typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L2);
}
template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 == 0 && L2 != 0, void *>::type = nullptr)
{
const char *addr = static_cast<const char *>(addr_);
prefetchMid <typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L2);
}
template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *addr_, typename std::enable_if<L1 != 0 && L2 == 0, void *>::type = nullptr)
{
const char *addr = static_cast<const char *>(addr_);
prefetchClose<typename std::conditional<UseExclusivePrefetch, Vc::Exclusive, Vc::Shared>::type>(addr + L1);
}
template<size_t L1, size_t L2, bool UseExclusivePrefetch> Vc_INTRINSIC void handlePrefetch(const void *, typename std::enable_if<L1 == 0 && L2 == 0, void *>::type = nullptr)
{
}
template<typename Flags> Vc_INTRINSIC void handleLoadPrefetches(const void * , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {}
template<typename Flags> Vc_INTRINSIC void handleLoadPrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch = nullptr)
{
handlePrefetch<Flags::L1Stride, Flags::L2Stride, Flags::IsExclusivePrefetch>(addr);
}
template<typename Flags> Vc_INTRINSIC void handleStorePrefetches(const void * , Flags, typename Flags::EnableIfNotPrefetch = nullptr) {}
template<typename Flags> Vc_INTRINSIC void handleStorePrefetches(const void *addr, Flags, typename Flags::EnableIfPrefetch = nullptr)
{
handlePrefetch<Flags::L1Stride, Flags::L2Stride, !Flags::IsSharedPrefetch>(addr);
}
}
}
using Common::prefetchForOneRead;
using Common::prefetchClose;
using Common::prefetchMid;
using Common::prefetchFar;
}
#endif
#ifndef VC_SSE_LIMITS_H_
#define VC_SSE_LIMITS_H_
namespace std
{
template<> struct numeric_limits< ::Vc::SSE::ushort_v> : public numeric_limits<unsigned short>
{
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v max() Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v min() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v lowest() Vc_NOEXCEPT { return min(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::ushort_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::ushort_v::Zero(); }
};
template<> struct numeric_limits< ::Vc::SSE::short_v> : public numeric_limits<short>
{
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v max() Vc_NOEXCEPT { return _mm_srli_epi16(::Vc::SSE::_mm_setallone_si128(), 1); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v min() Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi16(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v lowest() Vc_NOEXCEPT { return min(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::short_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::short_v::Zero(); }
};
template<> struct numeric_limits< ::Vc::SSE::uint_v> : public numeric_limits<unsigned int>
{
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v max() Vc_NOEXCEPT { return ::Vc::SSE::_mm_setallone_si128(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v min() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v lowest() Vc_NOEXCEPT { return min(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::uint_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::uint_v::Zero(); }
};
template<> struct numeric_limits< ::Vc::SSE::int_v> : public numeric_limits<int>
{
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v max() Vc_NOEXCEPT { return _mm_srli_epi32(::Vc::SSE::_mm_setallone_si128(), 1); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v min() Vc_NOEXCEPT { return ::Vc::SSE::setmin_epi32(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v lowest() Vc_NOEXCEPT { return min(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v epsilon() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v round_error() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v infinity() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v quiet_NaN() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v signaling_NaN() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
static Vc_INTRINSIC Vc_CONST ::Vc::SSE::int_v denorm_min() Vc_NOEXCEPT { return ::Vc::SSE::int_v::Zero(); }
};
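// These specializations only provide meaningful values for max(), min() and lowest(); the
// floating-point-only members (epsilon, infinity, NaNs, denorm_min) return Zero() because
// they have no integer counterpart.  Sketch of intended use:
//   auto m = std::numeric_limits<Vc::SSE::int_v>::max();  // vector with INT_MAX in every lane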
}
#endif
#ifndef VC_COMMON_BITSCANINTRINSICS_H_
#define VC_COMMON_BITSCANINTRINSICS_H_
#if defined(Vc_GCC) || defined(Vc_CLANG) || defined(Vc_APPLECLANG)
#include <x86intrin.h>
# ifndef _bit_scan_forward
#define _bit_scan_forward(x) __builtin_ctz(x)
static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) {
int r;
__asm__("bsr %1,%0" : "=r"(r) : "X"(x));
return r;
}
#define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x)
# endif
#elif defined(_WIN32)
#include <intrin.h>
static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) {
unsigned long index;
_BitScanForward(&index, x);
return index;
}
static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) {
unsigned long index;
_BitScanReverse(&index, x);
return index;
}
#elif defined(Vc_ICC)
#else
#endif
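// _bit_scan_forward(x) returns the index of the lowest set bit and _bit_scan_reverse(x) the
// index of the highest set bit; both assume x != 0.  They are used below to walk the set
// bits of a mask, e.g. (sketch):
//   int bits = mask.toInt();
//   while (bits) { int i = _bit_scan_forward(bits); bits &= bits - 1; /* lane i is active */ }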
#endif
#ifndef VC_COMMON_SET_H_
#define VC_COMMON_SET_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
static Vc_INTRINSIC Vc_CONST __m128i set(unsigned short x0, unsigned short x1, unsigned short x2, unsigned short x3,
unsigned short x4, unsigned short x5, unsigned short x6, unsigned short x7)
{
#if defined(Vc_GNU_ASM)
#if 0
__m128i r;
unsigned long long tmp0 = x3; tmp0 = (tmp0 << 16) | x2;
unsigned long long tmp1 = x1; tmp1 = (tmp1 << 16) | x0;
asm("vmovq %1,%0" : "=x"(r) : "r"((tmp0 << 32) | tmp1));
unsigned long long tmp2 = x7; tmp2 = (tmp2 << 16) | x6;
unsigned long long tmp3 = x5; tmp3 = (tmp3 << 16) | x4;
asm("vpinsrq $1,%1,%0,%0" : "+x"(r) : "r"((tmp2 << 32) | tmp3));
return r;
#elif defined(Vc_USE_VEX_CODING)
__m128i r0, r1;
unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
asm("vmovd %1,%0" : "=x"(r0) : "r"(tmp0));
asm("vpinsrd $1,%1,%0,%0" : "+x"(r0) : "r"(tmp1));
asm("vmovd %1,%0" : "=x"(r1) : "r"(tmp2));
asm("vpinsrd $1,%1,%0,%0" : "+x"(r1) : "r"(tmp3));
asm("vpunpcklqdq %1,%0,%0" : "+x"(r0) : "x"(r1));
return r0;
#else
__m128i r0, r1;
unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
asm("movd %1,%0" : "=x"(r0) : "r"(tmp0));
asm("pinsrd $1,%1,%0" : "+x"(r0) : "r"(tmp1));
asm("movd %1,%0" : "=x"(r1) : "r"(tmp2));
asm("pinsrd $1,%1,%0" : "+x"(r1) : "r"(tmp3));
asm("punpcklqdq %1,%0" : "+x"(r0) : "x"(r1));
return r0;
#endif
#else
unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
#endif
}
static Vc_INTRINSIC Vc_CONST __m128i set(short x0, short x1, short x2, short x3, short x4, short x5, short x6, short x7)
{
return set(static_cast<unsigned short>(x0), static_cast<unsigned short>(x1), static_cast<unsigned short>(x2),
static_cast<unsigned short>(x3), static_cast<unsigned short>(x4), static_cast<unsigned short>(x5),
static_cast<unsigned short>(x6), static_cast<unsigned short>(x7));
}
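// set() packs eight 16-bit values into one __m128i by first combining pairs into 32-bit
// words (x1:x0, x3:x2, ...) and then either assembling them with inline asm or falling back
// to _mm_setr_epi32.  The result is equivalent to _mm_setr_epi16(x0, x1, ..., x7), i.e. x0
// ends up in the lowest lane.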
}
}
#endif
#ifndef VC_COMMON_GATHERIMPLEMENTATION_H_
#define VC_COMMON_GATHERIMPLEMENTATION_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
enum class GatherScatterImplementation : int {
SimpleLoop,
SetIndexZero,
BitScanLoop,
PopcntSwitch
};
using SimpleLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SimpleLoop>;
using SetIndexZeroT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SetIndexZero>;
using BitScanLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::BitScanLoop>;
using PopcntSwitchT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::PopcntSwitch>;
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT,
V &v,
const MT *mem,
IT &&indexes_,
typename V::MaskArgument mask)
{
auto indexes = std::forward<IT>(indexes_);
indexes.setZeroInverted(static_cast<decltype(!indexes)>(mask));
const V tmp(mem, indexes);
where(mask) | v = tmp;
}
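// SetIndexZero strategy: indexes of inactive lanes are forced to 0, a full (unmasked) gather
// is done from mem, and the result is merged into v only where the mask is set.  Reading
// mem[0] for the inactive lanes is harmless because those lanes are discarded by the masked
// assignment.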
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, V &v, const MT *mem, const IT &indexes,
const typename V::MaskArgument mask)
{
if (Vc_IS_UNLIKELY(mask.isEmpty())) {
return;
}
#if defined Vc_GCC && Vc_GCC >= 0x40900
constexpr std::size_t Sizeof = sizeof(V);
using Builtin [[gnu::vector_size(Sizeof)]] = typename V::value_type;
Builtin tmp = reinterpret_cast<Builtin>(v.data());
Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
if (mask[i]) {
tmp[i] = mem[indexes[i]];
}
});
v.data() = reinterpret_cast<typename V::VectorType>(tmp);
#else
Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
if (mask[i])
v[i] = mem[indexes[i]];
});
#endif
}
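// SimpleLoop strategy: a fully unrolled scalar loop over all lanes.  The GCC >= 4.9 path
// copies the vector into a gnu::vector_size builtin temporary, updates the active lanes
// there and copies it back, presumably so the per-lane writes do not force the vector
// through memory.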
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(BitScanLoopT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask)
{
#ifdef Vc_GNU_ASM
size_t bits = mask.toInt();
while (Vc_IS_LIKELY(bits > 0)) {
size_t i, j;
asm("bsf %[bits],%[i]\n\t"
"bsr %[bits],%[j]\n\t"
"btr %[i],%[bits]\n\t"
"btr %[j],%[bits]\n\t"
: [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
v[i] = mem[indexes[i]];
v[j] = mem[indexes[j]];
}
#else
int bits = mask.toInt();
while (bits) {
const int i = _bit_scan_forward(bits);
bits &= bits - 1;
v[i] = mem[indexes[i]];
}
#endif
}
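// BitScanLoop strategy: the mask is converted to an integer bit pattern and the set bits are
// consumed from both ends per iteration (bsf for the lowest, bsr for the highest, btr to
// clear them), so the asm path needs at most ceil(popcount / 2) iterations.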
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 16> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt16(bits)) {
case 16:
v.gather(mem, indexes);
break;
case 15:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
v[low] = mem[indexes[low]];
case 14:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
case 13:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
case 12:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
case 11:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
case 10:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
case 9:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
case 8:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
case 7:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
case 6:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
case 4:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
case 2:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
case 0:
break;
}
}
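// PopcntSwitch strategy: the switch dispatches on the number of active lanes and then falls
// through the following case labels on purpose, alternating between picking the lowest (bsf)
// and the highest (bsr) remaining set bit until all active lanes are handled.  The same
// pattern is repeated below for vector sizes 8, 4 and 2.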
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 8> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt8(bits)) {
case 8:
v.gather(mem, indexes);
break;
case 7:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
v[low] = mem[indexes[low]];
case 6:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
case 4:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
case 2:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 4> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt4(bits)) {
case 4:
v.gather(mem, indexes);
break;
case 3:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
v[low] = mem[indexes[low]];
case 2:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 2> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low;
switch (Vc::Detail::popcnt4(bits)) {
case 2:
v.gather(mem, indexes);
break;
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
case 0:
break;
}
}
}
}
#endif
#ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_
#define VC_COMMON_SCATTERIMPLEMENTATION_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT,
V &v,
MT *mem,
IT indexes,
typename V::MaskArgument mask)
{
indexes.setZeroInverted(static_cast<typename IT::Mask>(mask));
const V tmp(mem, indexes);
where(mask) | v = tmp;
}
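// Note: as written, this SetIndexZero scatter path mirrors the gather variant above: it
// builds a gathered temporary from mem and merges it into v under the mask rather than
// writing v out to mem.  The remaining scatter strategies below store v[i] to
// mem[indexes[i]] for every active lane.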
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask)
{
if (Vc_IS_UNLIKELY(mask.isEmpty())) {
return;
}
Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
if (mask[i])
mem[indexes[i]] = v[i];
});
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask)
{
size_t bits = mask.toInt();
while (Vc_IS_LIKELY(bits > 0)) {
size_t i, j;
asm("bsf %[bits],%[i]\n\t"
"bsr %[bits],%[j]\n\t"
"btr %[i],%[bits]\n\t"
"btr %[j],%[bits]\n\t"
: [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
mem[indexes[i]] = v[i];
mem[indexes[j]] = v[j];
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 16> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt16(bits)) {
case 16:
v.scatter(mem, indexes);
break;
case 15:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
mem[indexes[low]] = v[low];
case 14:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 13:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 12:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 11:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 10:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 9:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 8:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 7:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 6:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 4:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 2:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 8> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt8(bits)) {
case 8:
v.scatter(mem, indexes);
break;
case 7:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
mem[indexes[low]] = v[low];
case 6:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 4:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 2:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 4> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt4(bits)) {
case 4:
v.scatter(mem, indexes);
break;
case 3:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
mem[indexes[low]] = v[low];
case 2:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 2> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low;
switch (Vc::Detail::popcnt4(bits)) {
case 2:
v.scatter(mem, indexes);
break;
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
Vc_INTRINSIC SSE::double_m operator==(SSE::double_v a, SSE::double_v b) { return _mm_cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator==(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator==(SSE:: int_v a, SSE:: int_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: uint_m operator==(SSE:: uint_v a, SSE:: uint_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: short_m operator==(SSE:: short_v a, SSE:: short_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator==(SSE::ushort_v a, SSE::ushort_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::double_m operator!=(SSE::double_v a, SSE::double_v b) { return _mm_cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator!=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator!=(SSE:: int_v a, SSE:: int_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE:: uint_m operator!=(SSE:: uint_v a, SSE:: uint_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE:: short_m operator!=(SSE:: short_v a, SSE:: short_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC SSE::ushort_m operator!=(SSE::ushort_v a, SSE::ushort_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC SSE::double_m operator> (SSE::double_v a, SSE::double_v b) { return _mm_cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator> (SSE:: float_v a, SSE:: float_v b) { return _mm_cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator> (SSE:: int_v a, SSE:: int_v b) { return _mm_cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: uint_m operator> (SSE:: uint_v a, SSE:: uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
return SSE::cmpgt_epu32(a.data(), b.data());
#else
return _mm_cmpgt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator> (SSE:: short_v a, SSE:: short_v b) { return _mm_cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator> (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
return SSE::cmpgt_epu16(a.data(), b.data());
#else
return _mm_cmpgt_epi16(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE::double_m operator< (SSE::double_v a, SSE::double_v b) { return _mm_cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator< (SSE:: float_v a, SSE:: float_v b) { return _mm_cmplt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator< (SSE:: int_v a, SSE:: int_v b) { return _mm_cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: uint_m operator< (SSE:: uint_v a, SSE:: uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
return SSE::cmplt_epu32(a.data(), b.data());
#else
return _mm_cmplt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator< (SSE:: short_v a, SSE:: short_v b) { return _mm_cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator< (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
return SSE::cmplt_epu16(a.data(), b.data());
#else
return _mm_cmplt_epi16(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE::double_m operator>=(SSE::double_v a, SSE::double_v b) { return _mm_cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator>=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator>=(SSE:: int_v a, SSE:: int_v b) { return !(a < b); }
Vc_INTRINSIC SSE:: uint_m operator>=(SSE:: uint_v a, SSE:: uint_v b) { return !(a < b); }
Vc_INTRINSIC SSE:: short_m operator>=(SSE:: short_v a, SSE:: short_v b) { return !(a < b); }
Vc_INTRINSIC SSE::ushort_m operator>=(SSE::ushort_v a, SSE::ushort_v b) { return !(a < b); }
Vc_INTRINSIC SSE::double_m operator<=(SSE::double_v a, SSE::double_v b) { return _mm_cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator<=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE:: int_m operator<=(SSE:: int_v a, SSE:: int_v b) { return !(a > b); }
Vc_INTRINSIC SSE:: uint_m operator<=(SSE:: uint_v a, SSE:: uint_v b) { return !(a > b); }
Vc_INTRINSIC SSE:: short_m operator<=(SSE:: short_v a, SSE:: short_v b) { return !(a > b); }
Vc_INTRINSIC SSE::ushort_m operator<=(SSE::ushort_v a, SSE::ushort_v b) { return !(a > b); }
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator^(SSE::Vector<T> a, SSE::Vector<T> b)
{
return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator&(SSE::Vector<T> a, SSE::Vector<T> b)
{
return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator|(SSE::Vector<T> a, SSE::Vector<T> b)
{
return or_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator+(SSE::Vector<T> a, SSE::Vector<T> b)
{
return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator-(SSE::Vector<T> a, SSE::Vector<T> b)
{
return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator*(SSE::Vector<T> a, SSE::Vector<T> b)
{
return mul(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, SSE::Vector<T>> operator/(
SSE::Vector<T> a, SSE::Vector<T> b)
{
return div(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC
enable_if<std::is_same<int, T>::value || std::is_same<uint, T>::value, SSE::Vector<T>>
operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
return SSE::Vector<T>::generate([&](int i) { return a[i] / b[i]; });
}
template <typename T>
Vc_INTRINSIC enable_if<std::is_same<short, T>::value || std::is_same<ushort, T>::value,
SSE::Vector<T>>
operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
using HT = SSE::VectorHelper<T>;
__m128 lo = _mm_cvtepi32_ps(HT::expand0(a.data()));
__m128 hi = _mm_cvtepi32_ps(HT::expand1(a.data()));
lo = _mm_div_ps(lo, _mm_cvtepi32_ps(HT::expand0(b.data())));
hi = _mm_div_ps(hi, _mm_cvtepi32_ps(HT::expand1(b.data())));
return HT::concat(_mm_cvttps_epi32(lo), _mm_cvttps_epi32(hi));
}
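// 16-bit division: both operands are widened to two __m128 halves of floats (exact for
// 16-bit values), divided with _mm_div_ps, truncated back with _mm_cvttps_epi32 and
// re-packed.  Per-lane effect (sketch):
//   result[i] = static_cast<T>(std::trunc(float(a[i]) / float(b[i])));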
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, SSE::Vector<T>> operator%(
SSE::Vector<T> a, SSE::Vector<T> b)
{
return a - a / b * b;
}
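// Integer modulo is derived from the division overloads above via the identity
// a % b == a - (a / b) * b.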
}
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerZero)
: d(HV::zero())
{
}
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerOne)
: d(HT::one())
{
}
template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
: d(Detail::load16(Detail::IndexesFromZero<EntryType, Size>(), Aligned))
{
#if defined Vc_GCC && Vc_GCC < 0x40903 && defined Vc_IMPL_AVX2
if (std::is_same<T, short>::value) {
asm("" ::"x"(d.v()));
}
#endif
}
template <>
Vc_INTRINSIC Vector<float, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
: d(SSE::convert<int, float>(SSE::int_v::IndexesFromZero().data()))
{
}
template <>
Vc_INTRINSIC Vector<double, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
: d(SSE::convert<int, double>(SSE::int_v::IndexesFromZero().data()))
{
}
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Sse>::
#ifndef Vc_MSVC
template
#endif
load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Sse>::load(const SrcT *mem, Flags flags)
{
Common::handleLoadPrefetches(mem, flags);
d.v() = Detail::load<VectorType, DstT>(mem, flags);
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero()
{
data() = HV::zero();
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero(const Mask &k)
{
data() = Detail::andnot_(k.data(), data());
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZeroInverted(const Mask &k)
{
data() = Detail::and_(k.data(), data());
}
template<> Vc_INTRINSIC void SSE::double_v::setQnan()
{
data() = SSE::_mm_setallone_pd();
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Sse>::setQnan(const Mask &k)
{
data() = _mm_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void SSE::float_v::setQnan()
{
data() = SSE::_mm_setallone_ps();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Sse>::setQnan(const Mask &k)
{
data() = _mm_or_ps(data(), k.dataF());
}
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Flags flags) const
{
Common::handleStorePrefetches(mem, flags);
HV::template store<Flags>(mem, data());
}
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Mask mask, Flags flags) const
{
Common::handleStorePrefetches(mem, flags);
HV::template store<Flags>(mem, data(), mask.data());
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator-() const
{
return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
#ifdef Vc_IMPL_XOP
template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator<<(const SSE::int_v shift) const { return _mm_sha_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator<<(const SSE::uint_v shift) const { return _mm_shl_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator<<(const SSE::short_v shift) const { return _mm_sha_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator<<(const SSE::ushort_v shift) const { return _mm_shl_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator>>(const SSE::int_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator>>(const SSE::uint_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator>>(const SSE::short_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator>>(const SSE::ushort_v shift) const { return operator<<(-shift); }
#elif defined Vc_IMPL_AVX2
template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator<<(const SSE::Vector< int> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator<<(const SSE::Vector< uint> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator>>(const SSE::Vector< int> x) const { return _mm_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator>>(const SSE::Vector< uint> x) const { return _mm_srlv_epi32(d.v(), x.d.v()); }
#endif
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator>>=(int shift) {
d.v() = HT::shiftRight(d.v(), shift);
return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator>>(int shift) const {
return HT::shiftRight(d.v(), shift);
}
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator<<=(int shift) {
d.v() = HT::shiftLeft(d.v(), shift);
return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator<<(int shift) const {
return HT::shiftLeft(d.v(), shift);
}
Vc_INTRINSIC Vc_CONST SSE::float_m isnegative(SSE::float_v x)
{
return sse_cast<__m128>(_mm_srai_epi32(
sse_cast<__m128i>(_mm_and_ps(SSE::_mm_setsignmask_ps(), x.data())), 31));
}
Vc_INTRINSIC Vc_CONST SSE::double_m isnegative(SSE::double_v x)
{
return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(_mm_srai_epi32(
sse_cast<__m128i>(_mm_and_pd(SSE::_mm_setsignmask_pd(), x.data())), 31)));
}
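// isnegative extracts the IEEE sign bit: mask off everything but the sign with
// _mm_setsignmask_*, then an arithmetic right shift by 31 broadcasts it over the whole
// 32-bit lane, yielding all-ones for negative inputs (including -0.0) and all-zeros
// otherwise.  The double variant additionally duplicates the high 32 bits of each 64-bit
// lane via the permute so the mask covers the full lane.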
#define Vc_GATHER_IMPL(V_) \
template <> \
template <class MT, class IT, int Scale> \
inline void SSE::V_::gatherImplementation( \
const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm_setr_pd(Vc_M(0), Vc_M(1)); }
Vc_GATHER_IMPL(float_v) { d.v() = _mm_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(int_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(uint_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(short_v)
{
d.v() =
Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(ushort_v)
{
d.v() =
Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
#undef Vc_M
#undef Vc_GATHER_IMPL
template <typename T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Sse>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
const auto *mem = args.address;
const auto indexes = Scale * args.indexes;
    using Selector = std::integral_constant<Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
        Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
        Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
        Common::GatherScatterImplementation::PopcntSwitch
#else
        Common::GatherScatterImplementation::SimpleLoop
#endif
        >;
Common::executeGather(Selector(), *this, mem, indexes, mask);
}
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes) const
{
Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant<Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
        Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
        Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
        Common::GatherScatterImplementation::PopcntSwitch
#else
        Common::GatherScatterImplementation::SimpleLoop
#endif
        >;
Common::executeScatter(Selector(), *this, mem, indexes, mask);
}
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::partialSum() const
{
Vector<T, VectorAbi::Sse> tmp = *this;
if (Size > 1) tmp += tmp.shifted(-1);
if (Size > 2) tmp += tmp.shifted(-2);
if (Size > 4) tmp += tmp.shifted(-4);
if (Size > 8) tmp += tmp.shifted(-8);
if (Size > 16) tmp += tmp.shifted(-16);
return tmp;
}
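// partialSum computes an inclusive prefix sum in log2(Size) shift-and-add steps, e.g. for
// {1, 2, 3, 4} the intermediate result is {1, 3, 5, 7} and the final result {1, 3, 6, 10}.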
#ifndef Vc_IMPL_SSE4_1
template<> Vc_INTRINSIC Vc_PURE int SSE::int_v::product() const
{
return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
template<> Vc_INTRINSIC Vc_PURE unsigned int SSE::uint_v::product() const
{
return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
#endif
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::min(MaskArg m) const
{
Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::max();
tmp(m) = *this;
return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::max(MaskArg m) const
{
Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::min();
tmp(m) = *this;
return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::product(MaskArg m) const
{
Vector<T, VectorAbi::Sse> tmp(Vc::One);
tmp(m) = *this;
return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::sum(MaskArg m) const
{
Vector<T, VectorAbi::Sse> tmp(Vc::Zero);
tmp(m) = *this;
return tmp.sum();
}
namespace Detail
{
Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v)
{
__m128i tmp = _mm_srli_epi32(_mm_castps_si128(v), 23);
tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f));
return _mm_cvtepi32_ps(tmp);
}
Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v)
{
__m128i tmp = _mm_srli_epi64(_mm_castpd_si128(v), 52);
tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff));
return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08));
}
}
Vc_INTRINSIC Vc_CONST SSE::float_v exponent(SSE::float_v x)
{
using Detail::operator>=;
Vc_ASSERT((x >= x.Zero()).isFull());
return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST SSE::double_v exponent(SSE::double_v x)
{
using Detail::operator>=;
Vc_ASSERT((x >= x.Zero()).isFull());
return Detail::exponent(x.data());
}
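// exponent() returns the unbiased binary exponent as a floating-point vector: the raw
// exponent field is shifted out of the bit pattern and the bias (127 for float, 1023 for
// double) is subtracted, so e.g. exponent(SSE::float_v(8.f)) yields 3.f in every lane.
// The assertion requires non-negative input.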
static void _doRandomStep(SSE::uint_v &state0,
SSE::uint_v &state1)
{
using SSE::uint_v;
using Detail::operator+;
using Detail::operator*;
state0.load(&Common::RandomState[0]);
state1.load(&Common::RandomState[uint_v::Size]);
(state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
uint_v(_mm_xor_si128((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm_srli_epi32(state1.data(), 16)))
.store(&Common::RandomState[0]);
}
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::Random()
{
SSE::uint_v state0, state1;
_doRandomStep(state0, state1);
return state0.data();
}
template<> Vc_ALWAYS_INLINE SSE::float_v SSE::float_v::Random()
{
SSE::uint_v state0, state1;
_doRandomStep(state0, state1);
return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}
template<> Vc_ALWAYS_INLINE SSE::double_v SSE::double_v::Random()
{
typedef unsigned long long uint64 Vc_MAY_ALIAS;
uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
uint64 state1 = *reinterpret_cast<const uint64 *>(&Common::RandomState[10]);
const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Common::RandomState[8]));
*reinterpret_cast<uint64 *>(&Common::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
*reinterpret_cast<uint64 *>(&Common::RandomState[10]) = (state1 * 0x5deece66dull + 11);
return _mm_sub_pd(_mm_or_pd(_mm_castsi128_pd(_mm_srli_epi64(state, 12)), HT::one()), HT::one());
}
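// Random() draws from a shared lane-wise linear congruential generator kept in
// Common::RandomState (multiplier 0xdeece66d, increment 11; the double_v variant uses a
// 64-bit state with multiplier 0x5deece66d).  Integer vectors return the state bits
// directly; the float/double variants splice the state into the mantissa of 1.0 and
// subtract 1.0, yielding values uniformly distributed in [0, 1).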
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount) const
{
enum {
EntryTypeSizeof = sizeof(EntryType)
};
switch (amount) {
case 0: return *this;
case 1: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
case 2: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
case 3: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
case 4: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
case 5: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
case 6: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
case 7: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
case 8: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
case -1: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
case -2: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
case -3: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
case -4: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
case -5: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
case -6: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
case -7: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
case -8: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
}
return Zero();
}
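// shifted(amount) moves whole elements across the register with zero fill, implemented as a
// byte shift of the underlying __m128i: v.shifted(n)[i] == v[i + n] whenever i + n is a
// valid lane index, and 0 otherwise.  Amounts not covered by the switch return Zero().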
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount, Vector shiftIn) const
{
if (amount >= -int(size())) {
constexpr int VectorWidth = int(size());
constexpr int EntryTypeSizeof = sizeof(EntryType);
const __m128i v0 = sse_cast<__m128i>(d.v());
const __m128i v1 = sse_cast<__m128i>(shiftIn.d.v());
auto &&fixup = sse_cast<VectorType, __m128i>;
switch (amount) {
case 0: return *this;
case -1: return fixup(SSE::alignr_epi8<(VectorWidth - 1) * EntryTypeSizeof>(v0, v1));
case -2: return fixup(SSE::alignr_epi8<(VectorWidth - 2) * EntryTypeSizeof>(v0, v1));
case -3: return fixup(SSE::alignr_epi8<(VectorWidth - 3) * EntryTypeSizeof>(v0, v1));
case -4: return fixup(SSE::alignr_epi8<(VectorWidth - 4) * EntryTypeSizeof>(v0, v1));
case -5: return fixup(SSE::alignr_epi8<(VectorWidth - 5) * EntryTypeSizeof>(v0, v1));
case -6: return fixup(SSE::alignr_epi8<(VectorWidth - 6) * EntryTypeSizeof>(v0, v1));
case -7: return fixup(SSE::alignr_epi8<(VectorWidth - 7) * EntryTypeSizeof>(v0, v1));
case -8: return fixup(SSE::alignr_epi8<(VectorWidth - 8) * EntryTypeSizeof>(v0, v1));
case -9: return fixup(SSE::alignr_epi8<(VectorWidth - 9) * EntryTypeSizeof>(v0, v1));
        case -10: return fixup(SSE::alignr_epi8<(VectorWidth - 10) * EntryTypeSizeof>(v0, v1));
        case -11: return fixup(SSE::alignr_epi8<(VectorWidth - 11) * EntryTypeSizeof>(v0, v1));
        case -12: return fixup(SSE::alignr_epi8<(VectorWidth - 12) * EntryTypeSizeof>(v0, v1));
        case -13: return fixup(SSE::alignr_epi8<(VectorWidth - 13) * EntryTypeSizeof>(v0, v1));
        case -14: return fixup(SSE::alignr_epi8<(VectorWidth - 14) * EntryTypeSizeof>(v0, v1));
        case -15: return fixup(SSE::alignr_epi8<(VectorWidth - 15) * EntryTypeSizeof>(v0, v1));
case 1: return fixup(SSE::alignr_epi8< 1 * EntryTypeSizeof>(v1, v0));
case 2: return fixup(SSE::alignr_epi8< 2 * EntryTypeSizeof>(v1, v0));
case 3: return fixup(SSE::alignr_epi8< 3 * EntryTypeSizeof>(v1, v0));
case 4: return fixup(SSE::alignr_epi8< 4 * EntryTypeSizeof>(v1, v0));
case 5: return fixup(SSE::alignr_epi8< 5 * EntryTypeSizeof>(v1, v0));
case 6: return fixup(SSE::alignr_epi8< 6 * EntryTypeSizeof>(v1, v0));
case 7: return fixup(SSE::alignr_epi8< 7 * EntryTypeSizeof>(v1, v0));
case 8: return fixup(SSE::alignr_epi8< 8 * EntryTypeSizeof>(v1, v0));
case 9: return fixup(SSE::alignr_epi8< 9 * EntryTypeSizeof>(v1, v0));
case 10: return fixup(SSE::alignr_epi8<10 * EntryTypeSizeof>(v1, v0));
case 11: return fixup(SSE::alignr_epi8<11 * EntryTypeSizeof>(v1, v0));
case 12: return fixup(SSE::alignr_epi8<12 * EntryTypeSizeof>(v1, v0));
case 13: return fixup(SSE::alignr_epi8<13 * EntryTypeSizeof>(v1, v0));
case 14: return fixup(SSE::alignr_epi8<14 * EntryTypeSizeof>(v1, v0));
case 15: return fixup(SSE::alignr_epi8<15 * EntryTypeSizeof>(v1, v0));
}
}
return shiftIn.shifted(int(size()) + amount);
}
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::rotated(int amount) const
{
enum {
EntryTypeSizeof = sizeof(EntryType)
};
const __m128i v = SSE::sse_cast<__m128i>(d.v());
switch (static_cast<unsigned int>(amount) % Size) {
case 0: return *this;
case 1: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<1 * EntryTypeSizeof>(v, v));
case 2: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<2 * EntryTypeSizeof>(v, v));
case 3: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<3 * EntryTypeSizeof>(v, v));
case 4: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<4 * EntryTypeSizeof>(v, v));
case 5: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<5 * EntryTypeSizeof>(v, v));
case 6: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<6 * EntryTypeSizeof>(v, v));
case 7: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<7 * EntryTypeSizeof>(v, v));
}
return Zero();
}
namespace Detail
{
inline Vc_CONST SSE::double_v sorted(SSE::double_v x_)
{
const __m128d x = x_.data();
const __m128d y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1));
return _mm_unpacklo_pd(_mm_min_sd(x, y), _mm_max_sd(x, y));
}
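// Sorting a double_v (two lanes) needs a single compare/exchange: y is x with the lanes
// swapped, and unpacklo_pd(min_sd(x, y), max_sd(x, y)) places the smaller value in lane 0
// and the larger one in lane 1.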
}
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::sorted()
const
{
return Detail::sorted(*this);
}
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveLow (SSE::double_v x) const { return _mm_unpacklo_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveHigh(SSE::double_v x) const { return _mm_unpackhi_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveLow ( SSE::float_v x) const { return _mm_unpacklo_ps(data(), x.data()); }
template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveHigh( SSE::float_v x) const { return _mm_unpackhi_ps(data(), x.data()); }
template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveLow ( SSE::int_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveHigh( SSE::int_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveLow ( SSE::uint_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveHigh( SSE::uint_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveLow ( SSE::short_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveHigh( SSE::short_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveLow (SSE::ushort_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveHigh(SSE::ushort_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
template <> template <typename G> Vc_INTRINSIC SSE::double_v SSE::double_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
return _mm_setr_pd(tmp0, tmp1);
}
template <> template <typename G> Vc_INTRINSIC SSE::float_v SSE::float_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
return _mm_setr_ps(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::int_v SSE::int_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::uint_v SSE::uint_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::short_v SSE::short_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> Vc_INTRINSIC Vc_PURE SSE::double_v SSE::double_v::reversed() const
{
return Mem::permute<X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::float_v SSE::float_v::reversed() const
{
return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::int_v SSE::int_v::reversed() const
{
return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::uint_v SSE::uint_v::reversed() const
{
return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::short_v SSE::short_v::reversed() const
{
return sse_cast<__m128i>(
Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
template <> Vc_INTRINSIC Vc_PURE SSE::ushort_v SSE::ushort_v::reversed() const
{
return sse_cast<__m128i>(
Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
template <>
Vc_INTRINSIC SSE::float_v SSE::float_v::operator[](const SSE::int_v &
#ifdef Vc_IMPL_AVX
perm
#endif
) const
{
#ifdef Vc_IMPL_AVX
return _mm_permutevar_ps(d.v(), perm.data());
#else
return *this;
#endif
}
template <> template <int Index> Vc_INTRINSIC SSE::float_v SSE::float_v::broadcast() const
{
constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
return Mem::permute<Inner, Inner, Inner, Inner>(d.v());
}
template <> template <int Index> Vc_INTRINSIC SSE::double_v SSE::double_v::broadcast() const
{
constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
return Mem::permute<Inner, Inner>(d.v());
}
namespace Common
{
Vc_ALWAYS_INLINE void transpose_impl(
TransposeTag<4, 4>, SSE::float_v *Vc_RESTRICT r[],
const TransposeProxy<SSE::float_v, SSE::float_v, SSE::float_v, SSE::float_v> &proxy)
{
const auto in0 = std::get<0>(proxy.in).data();
const auto in1 = std::get<1>(proxy.in).data();
const auto in2 = std::get<2>(proxy.in).data();
const auto in3 = std::get<3>(proxy.in).data();
const auto tmp0 = _mm_unpacklo_ps(in0, in2);
const auto tmp1 = _mm_unpacklo_ps(in1, in3);
const auto tmp2 = _mm_unpackhi_ps(in0, in2);
const auto tmp3 = _mm_unpackhi_ps(in1, in3);
*r[0] = _mm_unpacklo_ps(tmp0, tmp1);
*r[1] = _mm_unpackhi_ps(tmp0, tmp1);
*r[2] = _mm_unpacklo_ps(tmp2, tmp3);
*r[3] = _mm_unpackhi_ps(tmp2, tmp3);
}
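// Standard 4x4 float transpose via two rounds of unpacklo/unpackhi interleaves.  Treating
// the inputs as rows, *r[j] receives column j of the input matrix, e.g. *r[0] ends up as
// {in0[0], in1[0], in2[0], in3[0]}.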
}
}
#ifndef VC_SSE_SIMD_CAST_H_
#define VC_SSE_SIMD_CAST_H_
#ifdef Vc_IMPL_AVX
#ifndef VC_AVX_CASTS_H_
#define VC_AVX_CASTS_H_
#ifndef VC_AVX_SHUFFLE_H_
#define VC_AVX_SHUFFLE_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <int... Dst> struct Permutation {};
template <uint8_t... Sel> struct Mask {};
#ifdef Vc_IMPL_AVX2
template <uint8_t Sel0, uint8_t Sel1, uint8_t Sel2, uint8_t Sel3, uint8_t Sel4,
uint8_t Sel5, uint8_t Sel6, uint8_t Sel7, uint8_t Sel8, uint8_t Sel9,
uint8_t Sel10, uint8_t Sel11, uint8_t Sel12, uint8_t Sel13, uint8_t Sel14,
uint8_t Sel15>
Vc_INTRINSIC Vc_CONST __m256i
blend(__m256i a, __m256i b, Mask<Sel0, Sel1, Sel2, Sel3, Sel4, Sel5, Sel6, Sel7, Sel8,
Sel9, Sel10, Sel11, Sel12, Sel13, Sel14, Sel15>)
{
static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) &&
(Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) &&
(Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) &&
(Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) &&
(Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) &&
(Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) &&
(Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) &&
(Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1),
"Selectors must be 0 or 1 to select the value from a or b");
constexpr uint8_t mask = static_cast<uint8_t>(
(Sel0 << 0 ) | (Sel1 << 1 ) | (Sel2 << 2 ) | (Sel3 << 3 ) |
(Sel4 << 4 ) | (Sel5 << 5 ) | (Sel6 << 6 ) | (Sel7 << 7 ) |
(Sel8 << 8 ) | (Sel9 << 9 ) | (Sel10 << 10) | (Sel11 << 11) |
(Sel12 << 12) | (Sel13 << 13) | (Sel14 << 14) | (Sel15 << 15));
return _mm256_blend_epi16(a, b, mask);
}
#endif
}
namespace Mem
{
#ifdef Vc_IMPL_AVX2
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) {
static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
}
#endif
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) {
static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
return _mm256_permute2f128_ps(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) {
static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
return _mm256_permute2f128_pd(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) {
static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
return _mm256_permute2x128_si256(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#else
return _mm256_permute2f128_si256(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#endif
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
return _mm256_permute2x128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
return _mm256_permute2f128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) {
return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
}
#ifdef Vc_IMPL_AVX2
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
#endif
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) {
static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
return _mm256_blend_ps(x, y,
(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
(Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
(Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
(Dst6 / Y6) * 64 + (Dst7 / Y7) *128
);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) {
return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range");
static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range");
static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range");
static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range");
static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range");
static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range");
static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range");
static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range");
if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
return permute<Dst0, Dst1, Dst2, Dst3>(x);
}
const __m128 loIn = _mm256_castps256_ps128(x);
const __m128 hiIn = _mm256_extractf128_ps(x, 1);
__m128 lo, hi;
if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
} else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
} else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
} else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
} else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
lo = _mm_unpacklo_ps(loIn, hiIn);
} else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
lo = _mm_unpacklo_ps(hiIn, loIn);
} else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
lo = _mm_unpackhi_ps(loIn, hiIn);
} else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
lo = _mm_unpackhi_ps(hiIn, loIn);
} else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
}
if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
} else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
} else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
} else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
} else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
hi = _mm_unpacklo_ps(loIn, hiIn);
} else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
hi = _mm_unpacklo_ps(hiIn, loIn);
} else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
hi = _mm_unpackhi_ps(loIn, hiIn);
} else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
hi = _mm_unpackhi_ps(hiIn, loIn);
} else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
}
return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
}
}
}
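// Reg: the same shuffle wrappers, but with the VecPos template parameters
// listed from the highest element down (presumably to mirror the order used
// in the intrinsics' immediate-operand notation).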
namespace Vc_VERSIONED_NAMESPACE
{
namespace Reg
{
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
return _mm256_permute2x128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
return _mm256_permute2f128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
return _mm_permute_pd(x, Dst0 + Dst1 * 2);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
}
}
#endif
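// avx_cast: bitwise reinterpretation between the 128-bit and 256-bit vector
// types. Widening casts (__m128* -> __m256*) leave the upper half unspecified
// (see zeroExtend below for a zero-filling variant); narrowing casts keep the
// low 128 bits. E.g. avx_cast<__m256i>(x) reinterprets a float register as
// integers without generating any instructions.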
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
namespace Casts
{
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128 v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256 v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC __m128 avx_cast(__m128 v) { return v; }
template<> Vc_INTRINSIC __m128 avx_cast(__m128i v) { return _mm_castsi128_ps(v); }
template<> Vc_INTRINSIC __m128 avx_cast(__m128d v) { return _mm_castpd_ps(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m128 v) { return _mm_castps_si128(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m128i v) { return v; }
template<> Vc_INTRINSIC __m128i avx_cast(__m128d v) { return _mm_castpd_si128(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128 v) { return _mm_castps_pd(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128i v) { return _mm_castsi128_pd(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128d v) { return v; }
template<> Vc_INTRINSIC __m256 avx_cast(__m128 v) { return _mm256_castps128_ps256(v); }
template<> Vc_INTRINSIC __m256 avx_cast(__m128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); }
template<> Vc_INTRINSIC __m256 avx_cast(__m128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128i v) { return _mm256_castsi128_si256(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128d v) { return _mm256_castpd128_pd256(v); }
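// zeroExtend guarantees a zeroed upper 128-bit half. On MSVC and (Apple)Clang
// an explicit permute2f128 with the zeroing selector (0x80) is used instead,
// presumably because the plain cast intrinsics are not guaranteed to zero the
// upper lane on those compilers.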
#if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG
static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); }
static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); }
static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); }
#else
static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); }
static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); }
static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); }
#endif
template<> Vc_INTRINSIC __m128 avx_cast(__m256 v) { return _mm256_castps256_ps128(v); }
template<> Vc_INTRINSIC __m128 avx_cast(__m256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); }
template<> Vc_INTRINSIC __m128 avx_cast(__m256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256i v) { return _mm256_castsi256_si128(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256d v) { return _mm256_castpd256_pd128(v); }
template<> Vc_INTRINSIC __m256 avx_cast(__m256 v) { return v; }
template<> Vc_INTRINSIC __m256 avx_cast(__m256i v) { return _mm256_castsi256_ps(v); }
template<> Vc_INTRINSIC __m256 avx_cast(__m256d v) { return _mm256_castpd_ps(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m256 v) { return _mm256_castps_si256(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m256i v) { return v; }
template<> Vc_INTRINSIC __m256i avx_cast(__m256d v) { return _mm256_castpd_si256(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256 v) { return _mm256_castps_pd(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256i v) { return _mm256_castsi256_pd(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256d v) { return v; }
Vc_INTRINSIC Vc_CONST __m128 lo128(__m256 v) { return avx_cast<__m128>(v); }
Vc_INTRINSIC Vc_CONST __m128d lo128(__m256d v) { return avx_cast<__m128d>(v); }
Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v) { return avx_cast<__m128i>(v); }
Vc_INTRINSIC Vc_CONST __m128 hi128(__m256 v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m256 concat(__m128 a, __m128 b) { return insert128<1>(avx_cast<__m256 >(a), b); }
Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b) { return insert128<1>(avx_cast<__m256d>(a), b); }
Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b) { return insert128<1>(avx_cast<__m256i>(a), b); }
}
using namespace Casts;
}
namespace AVX2
{
using namespace AVX::Casts;
}
namespace AVX
{
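// convert: element-type conversions dispatched on a <From, To> tag. Each
// overload converts a full register of From values to To values; the register
// width changes between __m128 and __m256 for narrowing/widening conversions.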
template <typename From, typename To> struct ConvertTag {};
Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , int>) { return _mm256_cvttps_epi32(v); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, int>) { return _mm256_cvttpd_epi32(v); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , int>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepi16_epi32(v);
#else
return AVX::srai_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, int>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepu16_epi32(v);
#else
return AVX::srli_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
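// float -> uint: values >= 2^31 are out of range for the signed cvttps
// conversion, so 2^31 is subtracted before converting and added back
// afterwards as an unsigned bias; the two results are merged per element
// with a blend on v >= 2^31.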
Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , uint>) {
using namespace AVX;
return _mm256_castps_si256(_mm256_blendv_ps(
_mm256_castsi256_ps(_mm256_cvttps_epi32(v)),
_mm256_castsi256_ps(add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, set2power31_ps())),
set2power31_epu32())),
cmpge_ps(v, set2power31_ps())));
}
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, uint>) {
using namespace AVX;
return _mm_xor_si128(
_mm256_cvttpd_epi32(_mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u))),
_mm_set2power31_epu32());
}
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , uint>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , uint>) { return v; }
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , uint>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepi16_epi32(v);
#else
return AVX::srai_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, uint>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepu16_epi32(v);
#else
return AVX::srli_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256 convert(__m256 v, ConvertTag<float , float>) { return v; }
Vc_INTRINSIC __m128 convert(__m256d v, ConvertTag<double, float>) { return _mm256_cvtpd_ps(v); }
Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<int , float>) { return _mm256_cvtepi32_ps(v); }
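// uint -> float: inputs with the top bit set would be mis-converted by the
// signed cvtepi32 path; they are split into bits 30..9 and bits 8..0,
// converted separately, and summed together with an explicit 2^31 offset.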
Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<uint , float>) {
using namespace AVX;
return _mm256_blendv_ps(
_mm256_cvtepi32_ps(v),
_mm256_add_ps(_mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00))),
_mm256_add_ps(set2power31_ps(), _mm256_cvtepi32_ps(and_si256(
v, set1_epi32(0x000001ff))))),
_mm256_castsi256_ps(cmplt_epi32(v, _mm256_setzero_si256())));
}
Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<short , float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag< short, int>())); }
Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<ushort, float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag<ushort, int>())); }
Vc_INTRINSIC __m256d convert(__m128 v, ConvertTag<float , double>) { return _mm256_cvtps_pd(v); }
Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag<double, double>) { return v; }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<int , double>) { return _mm256_cvtepi32_pd(v); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<uint , double>) {
using namespace AVX;
return _mm256_add_pd(
_mm256_cvtepi32_pd(_mm_xor_si128(v, _mm_setmin_epi32())),
set1_pd(1u << 31)); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, SSE::ConvertTag< short, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, SSE::ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
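// int -> short: truncate each 32-bit element to its low 16 bits. With AVX2
// this is a byte shuffle plus a cross-lane permute; otherwise the unpack
// sequence below gathers the low halves of all eight elements.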
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , short>) {
#ifdef Vc_IMPL_AVX2
auto a = _mm256_shuffle_epi8(
v, _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
-0x80, -0x80, -0x80, 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
-0x80, -0x80, -0x80, -0x80, -0x80, -0x80));
return lo128(_mm256_permute4x64_epi64(a, 0xf8));
#else
const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
#endif
}
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , short>) { return convert(v, ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , short>) { return convert(convert(v, ConvertTag<float, int>()), ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, short>) { return convert(convert(v, ConvertTag<double, int>()), SSE::ConvertTag<int, short>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , short>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, short>) { return v; }
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , ushort>) { return convert(convert(v, ConvertTag<float, uint>()), ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, uint>()), SSE::ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , ushort>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, ushort>) { return v; }
template <typename From, typename To>
Vc_INTRINSIC auto convert(
typename std::conditional<(sizeof(From) < sizeof(To)),
typename SSE::VectorTraits<From>::VectorType,
typename AVX::VectorTypeHelper<From>::Type>::type v)
-> decltype(convert(v, ConvertTag<From, To>()))
{
return convert(v, ConvertTag<From, To>());
}
template <typename From, typename To, typename = enable_if<(sizeof(From) < sizeof(To))>>
Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper<From>::Type v)
-> decltype(convert(lo128(v), ConvertTag<From, To>()))
{
return convert(lo128(v), ConvertTag<From, To>());
}
}
}
#endif
#endif
#ifndef VC_SSE_VECTOR_H_
#error "Vc/sse/vector.h needs to be included before Vc/sse/simd_cast.h"
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
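// Declare simd_cast overloads taking 1, 2, 4 or 8 source vectors of type
// from_; the requested destination type is selected via enable_if on To.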
#define Vc_SIMD_CAST_1(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_2(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_4(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, from_ x2, from_ x3, \
enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_8(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7, \
enable_if<std::is_same<To, to_>::value> = nullarg)
Vc_SIMD_CAST_1( float_v, int_v);
Vc_SIMD_CAST_1(double_v, int_v);
Vc_SIMD_CAST_1( uint_v, int_v);
Vc_SIMD_CAST_1( short_v, int_v);
Vc_SIMD_CAST_1(ushort_v, int_v);
Vc_SIMD_CAST_1( float_v, uint_v);
Vc_SIMD_CAST_1(double_v, uint_v);
Vc_SIMD_CAST_1( int_v, uint_v);
Vc_SIMD_CAST_1( short_v, uint_v);
Vc_SIMD_CAST_1(ushort_v, uint_v);
Vc_SIMD_CAST_1(double_v, float_v);
Vc_SIMD_CAST_1( int_v, float_v);
Vc_SIMD_CAST_1( uint_v, float_v);
Vc_SIMD_CAST_1( short_v, float_v);
Vc_SIMD_CAST_1(ushort_v, float_v);
Vc_SIMD_CAST_1( float_v, double_v);
Vc_SIMD_CAST_1( int_v, double_v);
Vc_SIMD_CAST_1( uint_v, double_v);
Vc_SIMD_CAST_1( short_v, double_v);
Vc_SIMD_CAST_1(ushort_v, double_v);
Vc_SIMD_CAST_1( int_v, short_v);
Vc_SIMD_CAST_1( uint_v, short_v);
Vc_SIMD_CAST_1( float_v, short_v);
Vc_SIMD_CAST_1(double_v, short_v);
Vc_SIMD_CAST_1(ushort_v, short_v);
Vc_SIMD_CAST_1( int_v, ushort_v);
Vc_SIMD_CAST_1( uint_v, ushort_v);
Vc_SIMD_CAST_1( float_v, ushort_v);
Vc_SIMD_CAST_1(double_v, ushort_v);
Vc_SIMD_CAST_1( short_v, ushort_v);
Vc_SIMD_CAST_2(double_v, int_v);
Vc_SIMD_CAST_2(double_v, uint_v);
Vc_SIMD_CAST_2(double_v, float_v);
Vc_SIMD_CAST_2( int_v, short_v);
Vc_SIMD_CAST_2( uint_v, short_v);
Vc_SIMD_CAST_2( float_v, short_v);
Vc_SIMD_CAST_2(double_v, short_v);
Vc_SIMD_CAST_2( int_v, ushort_v);
Vc_SIMD_CAST_2( uint_v, ushort_v);
Vc_SIMD_CAST_2( float_v, ushort_v);
Vc_SIMD_CAST_2(double_v, ushort_v);
#define Vc_CAST_(To_) \
template <typename Return> \
Vc_INTRINSIC Vc_CONST enable_if<std::is_same<Return, To_>::value, Return>
Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c);
Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c);
Vc_SIMD_CAST_4(double_v, short_v);
Vc_SIMD_CAST_4(double_v, ushort_v);
}
using SSE::simd_cast;
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::float_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, SSE::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, SSE::ushort_v>::value> = nullarg);
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To
simd_cast(SSE::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value> = nullarg);
#undef Vc_SIMD_CAST_1
#undef Vc_SIMD_CAST_2
#undef Vc_SIMD_CAST_4
#undef Vc_SIMD_CAST_8
#define Vc_SIMD_CAST_1(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if<std::is_same<To, to_>::value>)
#define Vc_SIMD_CAST_2(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, \
enable_if<std::is_same<To, to_>::value>)
#define Vc_SIMD_CAST_4(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, \
enable_if<std::is_same<To, to_>::value>)
#define Vc_SIMD_CAST_8(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
from_ x5, from_ x6, from_ x7, \
enable_if<std::is_same<To, to_>::value>)
namespace SSE
{
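// Truncate eight 32-bit integers (a0..a3, b0..b3) to 16 bits each and pack
// them as [a0 a1 a2 a3 b0 b1 b2 b3] (plain truncation, no saturation).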
Vc_INTRINSIC __m128i convert_int32_to_int16(__m128i a, __m128i b)
{
auto tmp0 = _mm_unpacklo_epi16(a, b);
auto tmp1 = _mm_unpackhi_epi16(a, b);
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_SIMD_CAST_1( float_v, int_v) { return convert< float, int>(x.data()); }
Vc_SIMD_CAST_1(double_v, int_v) { return convert<double, int>(x.data()); }
Vc_SIMD_CAST_1( uint_v, int_v) { return convert< uint, int>(x.data()); }
Vc_SIMD_CAST_1( short_v, int_v) { return convert< short, int>(x.data()); }
Vc_SIMD_CAST_1(ushort_v, int_v) { return convert<ushort, int>(x.data()); }
Vc_SIMD_CAST_1( float_v, uint_v) { return convert< float, uint>(x.data()); }
Vc_SIMD_CAST_1(double_v, uint_v) { return convert<double, uint>(x.data()); }
Vc_SIMD_CAST_1( int_v, uint_v) { return convert< int, uint>(x.data()); }
Vc_SIMD_CAST_1( short_v, uint_v) { return convert< short, uint>(x.data()); }
Vc_SIMD_CAST_1(ushort_v, uint_v) { return convert<ushort, uint>(x.data()); }
Vc_SIMD_CAST_1(double_v, float_v) { return convert<double, float>(x.data()); }
Vc_SIMD_CAST_1( int_v, float_v) { return convert< int, float>(x.data()); }
Vc_SIMD_CAST_1( uint_v, float_v) { return convert< uint, float>(x.data()); }
Vc_SIMD_CAST_1( short_v, float_v) { return convert< short, float>(x.data()); }
Vc_SIMD_CAST_1(ushort_v, float_v) { return convert<ushort, float>(x.data()); }
Vc_SIMD_CAST_1( float_v, double_v) { return convert< float, double>(x.data()); }
Vc_SIMD_CAST_1( int_v, double_v) { return convert< int, double>(x.data()); }
Vc_SIMD_CAST_1( uint_v, double_v) { return convert< uint, double>(x.data()); }
Vc_SIMD_CAST_1( short_v, double_v) { return convert< short, double>(x.data()); }
Vc_SIMD_CAST_1(ushort_v, double_v) { return convert<ushort, double>(x.data()); }
Vc_SIMD_CAST_1( int_v, short_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
Vc_SIMD_CAST_1( uint_v, short_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
Vc_SIMD_CAST_1( float_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x).data(), _mm_setzero_si128()); }
Vc_SIMD_CAST_1(double_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x).data(), _mm_setzero_si128()); }
Vc_SIMD_CAST_1(ushort_v, short_v) { return x.data(); }
Vc_SIMD_CAST_1( int_v, ushort_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
Vc_SIMD_CAST_1( uint_v, ushort_v) { return SSE::convert_int32_to_int16(x.data(), _mm_setzero_si128()); }
Vc_SIMD_CAST_1( float_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x)); }
Vc_SIMD_CAST_1(double_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x)); }
Vc_SIMD_CAST_1( short_v, ushort_v) { return x.data(); }
Vc_SIMD_CAST_2(double_v, int_v) {
#ifdef Vc_IMPL_AVX
return AVX::convert<double, int>(AVX::concat(x0.data(), x1.data()));
#else
return _mm_unpacklo_epi64(convert<double, int>(x0.data()), convert<double, int>(x1.data()));
#endif
}
Vc_SIMD_CAST_2(double_v, uint_v) {
#ifdef Vc_IMPL_AVX
return AVX::convert<double, uint>(AVX::concat(x0.data(), x1.data()));
#else
return _mm_unpacklo_epi64(convert<double, uint>(x0.data()), convert<double, uint>(x1.data()));
#endif
}
Vc_SIMD_CAST_2(double_v, float_v) {
#ifdef Vc_IMPL_AVX
return _mm256_cvtpd_ps(AVX::concat(x0.data(), x1.data()));
#else
return _mm_movelh_ps(_mm_cvtpd_ps(x0.data()), _mm_cvtpd_ps(x1.data()));
#endif
}
Vc_SIMD_CAST_2( int_v, short_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
Vc_SIMD_CAST_2( uint_v, short_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
Vc_SIMD_CAST_2( float_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x0).data(), simd_cast<SSE::int_v>(x1).data()); }
Vc_SIMD_CAST_2(double_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x0, x1).data(), _mm_setzero_si128()); }
Vc_SIMD_CAST_2( int_v, ushort_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
Vc_SIMD_CAST_2( uint_v, ushort_v) { return SSE::convert_int32_to_int16(x0.data(), x1.data()); }
Vc_SIMD_CAST_2( float_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x0), simd_cast<SSE::int_v>(x1)); }
Vc_SIMD_CAST_2(double_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x0, x1)); }
Vc_CAST_(short_v) simd_cast(double_v a, double_v b, double_v c)
{
return simd_cast<short_v>(simd_cast<int_v>(a, b), simd_cast<int_v>(c));
}
Vc_CAST_(ushort_v) simd_cast(double_v a, double_v b, double_v c)
{
return simd_cast<ushort_v>(simd_cast<int_v>(a, b), simd_cast<int_v>(c));
}
#undef Vc_CAST_
Vc_SIMD_CAST_4(double_v, short_v) { return _mm_packs_epi32(simd_cast<SSE::int_v>(x0, x1).data(), simd_cast<SSE::int_v>(x2, x3).data()); }
Vc_SIMD_CAST_4(double_v, ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE::int_v>(x0, x1), simd_cast<SSE::int_v>(x2, x3)); }
}
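// The following definitions build an SSE vector from one to eight scalar
// Vc vectors, placing the scalar values in the low lanes and zero-filling
// the remaining lanes.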
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::double_v>::value> )
{
return _mm_setr_pd(x.data(), 0.);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::float_v>::value> )
{
return _mm_setr_ps(x.data(), 0.f, 0.f, 0.f);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::int_v>::value> )
{
return _mm_setr_epi32(x.data(), 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::uint_v>::value> )
{
return _mm_setr_epi32(uint(x.data()), 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::short_v>::value> )
{
return _mm_setr_epi16(
x.data(), 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, SSE::ushort_v>::value> )
{
return _mm_setr_epi16(
x.data(), 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::double_v>::value> )
{
return _mm_setr_pd(x0.data(), x1.data());
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::float_v>::value> )
{
return _mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::int_v>::value> )
{
return _mm_setr_epi32(x0.data(), x1.data(), 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::uint_v>::value> )
{
return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::short_v>::value> )
{
return _mm_setr_epi16(
x0.data(), x1.data(), 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
enable_if<std::is_same<Return, SSE::ushort_v>::value> )
{
return _mm_setr_epi16(
x0.data(), x1.data(), 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::float_v>::value>)
{
return _mm_setr_ps(x0.data(), x1.data(), x2.data(), 0.f);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::int_v>::value>)
{
return _mm_setr_epi32(x0.data(), x1.data(), x2.data(), 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::uint_v>::value>)
{
return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::short_v>::value>)
{
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::float_v>::value> )
{
return _mm_setr_ps(
x0.data(), x1.data(), x2.data(), x3.data());
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::int_v>::value> )
{
return _mm_setr_epi32(
x0.data(), x1.data(), x2.data(), x3.data());
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::uint_v>::value> )
{
return _mm_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
uint(x3.data()));
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::short_v>::value> )
{
return _mm_setr_epi16(
x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, SSE::ushort_v>::value> )
{
return _mm_setr_epi16(
x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, SSE::short_v>::value>)
{
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, SSE::short_v>::value>)
{
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, enable_if<std::is_same<Return, SSE::short_v>::value>)
{
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, enable_if<std::is_same<Return, SSE::ushort_v>::value>)
{
return _mm_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
Scalar::Vector<T> x4,
Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
Scalar::Vector<T> x7,
enable_if<std::is_same<Return, SSE::short_v>::value> )
{
return _mm_setr_epi16(x0.data(),
x1.data(),
x2.data(),
x3.data(),
x4.data(),
x5.data(),
x6.data(),
x7.data());
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0,
Scalar::Vector<T> x1,
Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
Scalar::Vector<T> x4,
Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
Scalar::Vector<T> x7,
enable_if<std::is_same<Return, SSE::ushort_v>::value> )
{
return _mm_setr_epi16(x0.data(),
x1.data(),
x2.data(),
x3.data(),
x4.data(),
x5.data(),
x6.data(),
x7.data());
}
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To
simd_cast(SSE::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value> )
{
return static_cast<To>(x[0]);
}
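// Mask conversions: SSE -> SSE masks go through mask_cast or a packs-based
// narrowing; Scalar -> SSE masks set the corresponding low elements of an
// all-false mask; SSE -> Scalar takes element 0.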
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(SSE::Mask<T> x, enable_if<SSE::is_mask<Return>::value> = nullarg)
{
using M = SSE::Mask<T>;
return {Detail::mask_cast<M::Size, Return::Size, __m128>(x.dataI())};
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
SSE::Mask<T> x0,
SSE::Mask<T> x1,
enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 2 == Return::Size> = nullarg)
{
return SSE::sse_cast<__m128>(_mm_packs_epi16(x0.dataI(), x1.dataI()));
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
SSE::Mask<T> x0,
SSE::Mask<T> x1,
enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 4 == Return::Size> = nullarg)
{
return SSE::sse_cast<__m128>(
_mm_packs_epi16(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_setzero_si128()));
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
SSE::Mask<T> x0,
SSE::Mask<T> x1,
SSE::Mask<T> x2,
SSE::Mask<T> x3,
enable_if<SSE::is_mask<Return>::value && Mask<T, VectorAbi::Sse>::Size * 4 == Return::Size> = nullarg)
{
return SSE::sse_cast<__m128>(_mm_packs_epi16(_mm_packs_epi16(x0.dataI(), x1.dataI()),
_mm_packs_epi16(x2.dataI(), x3.dataI())));
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> x, enable_if<SSE::is_mask<Return>::value> = nullarg)
{
Return m(false);
m[0] = x[0];
return m;
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> x0, Scalar::Mask<T> x1, enable_if<SSE::is_mask<Return>::value> = nullarg)
{
Return m(false);
m[0] = x0[0];
m[1] = x1[0];
return m;
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x0,
Scalar::Mask<T> x1,
Scalar::Mask<T> x2,
Scalar::Mask<T> x3,
enable_if<SSE::is_mask<Return>::value> = nullarg)
{
Return m(false);
m[0] = x0[0];
m[1] = x1[0];
if (Return::Size >= 4) {
m[2] = x2[0];
m[3] = x3[0];
}
return m;
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(Scalar::Mask<T> x0,
Scalar::Mask<T> x1,
Scalar::Mask<T> x2,
Scalar::Mask<T> x3,
Scalar::Mask<T> x4,
Scalar::Mask<T> x5,
Scalar::Mask<T> x6,
Scalar::Mask<T> x7,
enable_if<SSE::is_mask<Return>::value> = nullarg)
{
Return m(false);
m[0] = x0[0];
m[1] = x1[0];
if (Return::Size >= 4) {
m[2] = x2[0];
m[3] = x3[0];
}
if (Return::Size >= 8) {
m[4] = x4[0];
m[5] = x5[0];
m[6] = x6[0];
m[7] = x7[0];
}
return m;
}
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To
simd_cast(SSE::Mask<FromT> x, enable_if<Scalar::is_mask<To>::value> = nullarg)
{
return static_cast<To>(x[0]);
}
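// simd_cast with a compile-time offset: offset 0 forwards to the plain cast;
// a non-zero offset first shifts the source register right by
// offset * Return::Size source elements (as a byte shift) and then converts.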
template <typename Return, int offset, typename V>
Vc_INTRINSIC Vc_CONST Return
simd_cast(V &&x, enable_if<offset == 0 && ((SSE::is_vector<Traits::decay<V>>::value &&
SSE::is_vector<Return>::value) ||
(SSE::is_mask<Traits::decay<V>>::value &&
SSE::is_mask<Return>::value))> = nullarg)
{
return simd_cast<Return>(x);
}
template <typename Return, int offset, typename V>
Vc_INTRINSIC Vc_CONST Return
simd_cast(V &&x,
enable_if<offset == 0 && ((Scalar::is_vector<Traits::decay<V>>::value &&
SSE::is_vector<Return>::value) ||
(Scalar::is_mask<Traits::decay<V>>::value &&
SSE::is_mask<Return>::value))> = nullarg)
{
return simd_cast<Return>(x);
}
template <typename Return, int offset, typename V>
Vc_INTRINSIC Vc_CONST Return simd_cast(
V x,
enable_if<offset != 0 && (SSE::is_vector<Return>::value && SSE::is_vector<V>::value)> = nullarg)
{
constexpr int shift = (sizeof(V) / V::Size) * offset * Return::Size;
static_assert(shift > 0 && shift < 16, "");
return simd_cast<Return>(V{SSE::sse_cast<typename V::VectorType>(
_mm_srli_si128(SSE::sse_cast<__m128i>(x.data()), shift & 0xff))});
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(SSE::Vector<T> x,
enable_if<offset != 0 && Scalar::is_vector<Return>::value> = nullarg)
{
return static_cast<typename Return::EntryType>(x[offset]);
}
template <typename Return, int offset, typename V>
Vc_INTRINSIC Vc_CONST Return simd_cast(
V x,
enable_if<offset != 0 && (SSE::is_mask<Return>::value && SSE::is_mask<V>::value)> = nullarg)
{
constexpr int shift = (sizeof(V) / V::Size) * offset * Return::Size;
static_assert(shift > 0 && shift < 16, "");
return simd_cast<Return>(V{SSE::sse_cast<typename V::VectorType>(
_mm_srli_si128(SSE::sse_cast<__m128i>(x.data()), shift & 0xff))});
}
#undef Vc_SIMD_CAST_1
#undef Vc_SIMD_CAST_2
#undef Vc_SIMD_CAST_4
#undef Vc_SIMD_CAST_8
}
#endif
#endif
#endif
#ifdef Vc_IMPL_AVX
#ifndef VC_AVX_VECTOR_H_
#define VC_AVX_VECTOR_H_
#ifndef VC_AVX_VECTORHELPER_H_
#define VC_AVX_VECTORHELPER_H_
#include <limits>
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
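// VectorHelper<__m256{,d,i}>: store overloads selected at compile time via
// the Flags type -- aligned, unaligned, streaming (non-temporal) and masked
// stores.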
template<> struct VectorHelper<__m256>
{
typedef __m256 VectorType;
typedef const VectorType VTArg;
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_ps()); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
template<> struct VectorHelper<__m256d>
{
typedef __m256d VectorType;
typedef const VectorType VTArg;
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_pd()); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
template<> struct VectorHelper<__m256i>
{
typedef __m256i VectorType;
typedef const VectorType VTArg;
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
#define Vc_OP1(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); }
#define Vc_OP_(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op , Vc_SUFFIX)(a, b); }
#define Vc_OPx(op,op2) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); }
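// Arithmetic helpers per entry type. Without FMA4, fma() for double is
// emulated with what appears to be a Dekker-style operand split
// (highMaskDouble) so that the multiplication's rounding error can be
// compensated; the float version falls back to double-precision arithmetic.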
template<> struct VectorHelper<double> {
typedef __m256d VectorType;
typedef const VectorType VTArg;
typedef double EntryType;
#define Vc_SUFFIX pd
static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); }
static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d);
}
static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }
static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
v1 = _mm256_macc_pd(v1, v2, v3);
#else
VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
asm("":"+x"(h1), "+x"(h2));
#endif
const VectorType l1 = _mm256_sub_pd(v1, h1);
const VectorType l2 = _mm256_sub_pd(v2, h2);
const VectorType ll = mul(l1, l2);
const VectorType lh = add(mul(l1, h2), mul(h1, l2));
const VectorType hh = mul(h1, h2);
const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3));
const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
v1 = add(add(ll, b), add(c, hh));
#endif
}
static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); }
static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); }
static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); }
Vc_OP1(sqrt)
static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
return _mm256_div_pd(one(), sqrt(x));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
return _mm256_div_pd(one(), x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd());
}
static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
__m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
__m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
__m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
__m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_hadd_pd(b, b);
return _mm_cvtsd_f64(b);
}
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
return _mm256_round_pd(a, _MM_FROUND_NINT);
}
};
template<> struct VectorHelper<float> {
typedef float EntryType;
typedef __m256 VectorType;
typedef const VectorType VTArg;
#define Vc_SUFFIX ps
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
const float e, const float f, const float g, const float h) {
return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }
static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
v1 = _mm256_macc_ps(v1, v2, v3);
#else
__m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
__m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
__m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
__m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
__m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
__m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
v1 = AVX::concat(
_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
}
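// The float fallback above widens each 128-bit half to double: the product of
// two floats is exactly representable in double, so only the addition and the
// conversion back to float round, which keeps the result close to a true fused
// multiply-add.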
static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); }
Vc_OP1(sqrt) Vc_OP1(rsqrt)
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
return _mm256_rcp_ps(x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps());
}
static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
__m128 b = _mm_min_ps(lo128(a), hi128(a));
b = _mm_min_ps(b, _mm_movehl_ps(b, b));
b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
__m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_max_ps(b, _mm_movehl_ps(b, b));
b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
__m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
__m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(b);
}
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
return _mm256_round_ps(a, _MM_FROUND_NINT);
}
};
#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx
}
}
#endif
#ifndef VC_AVX_MASK_H_
#define VC_AVX_MASK_H_
#include <array>
#ifndef VC_AVX_DETAIL_H_
#define VC_AVX_DETAIL_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
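// The load() overloads in this namespace are selected by tag dispatch: the
// LoadTag<VectorType, EntryType> parameter names the destination, and SFINAE on
// Flags::EnableIfAligned / EnableIfUnaligned / EnableIfStreaming picks the
// aligned, unaligned or non-temporal variant.  A call-site sketch (the flag
// objects such as Vc::Aligned are defined elsewhere in this header):
//   __m256 v = Detail::load(ptr, Vc::Aligned, LoadTag<__m256, float>());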
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
typename Flags::EnableIfAligned = nullptr)
{
return _mm256_load_ps(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
typename Flags::EnableIfUnaligned = nullptr)
{
return _mm256_loadu_ps(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
typename Flags::EnableIfStreaming = nullptr)
{
return AvxIntrinsics::stream_load<__m256>(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
typename Flags::EnableIfAligned = nullptr)
{
return _mm256_load_pd(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
typename Flags::EnableIfUnaligned = nullptr)
{
return _mm256_loadu_pd(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
typename Flags::EnableIfStreaming = nullptr)
{
return AvxIntrinsics::stream_load<__m256d>(x);
}
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(x));
}
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(x));
}
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr)
{
return AvxIntrinsics::stream_load<__m256i>(x);
}
Vc_INTRINSIC __m256 load32(const float *mem, when_aligned)
{
return _mm256_load_ps(mem);
}
Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned)
{
return _mm256_loadu_ps(mem);
}
Vc_INTRINSIC __m256 load32(const float *mem, when_streaming)
{
return AvxIntrinsics::stream_load<__m256>(mem);
}
Vc_INTRINSIC __m256d load32(const double *mem, when_aligned)
{
return _mm256_load_pd(mem);
}
Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned)
{
return _mm256_loadu_pd(mem);
}
Vc_INTRINSIC __m256d load32(const double *mem, when_streaming)
{
return AvxIntrinsics::stream_load<__m256d>(mem);
}
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_aligned)
{
static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned)
{
static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_streaming)
{
static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
return AvxIntrinsics::stream_load<__m256i>(mem);
}
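// load32() is the 32-byte building block used by the converting loads further
// down: a full-width aligned, unaligned or streaming load selected by the
// when_aligned / when_unaligned / when_streaming tag types.  The Vc_MSVC block
// that follows restates several load() overloads with explicit enable_if
// constraints, presumably to sidestep overload-resolution quirks in MSVC.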
#ifdef Vc_MSVC
Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>)
{
return _mm256_loadu_pd(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_aligned,
enable_if<(std::is_same<DstT, float>::value &&
std::is_same<V, __m256>::value)> = nullarg)
{
return _mm256_load_ps(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_unaligned,
enable_if<(std::is_same<DstT, float>::value &&
std::is_same<V, __m256>::value)> = nullarg)
{
return _mm256_loadu_ps(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_streaming,
enable_if<(std::is_same<DstT, float>::value &&
std::is_same<V, __m256>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256>(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_aligned,
enable_if<(std::is_same<DstT, double>::value &&
std::is_same<V, __m256d>::value)> = nullarg)
{
return _mm256_load_pd(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_unaligned,
enable_if<(std::is_same<DstT, double>::value &&
std::is_same<V, __m256d>::value)> = nullarg)
{
return _mm256_loadu_pd(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_streaming,
enable_if<(std::is_same<DstT, double>::value &&
std::is_same<V, __m256d>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256d>(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_aligned,
enable_if<(std::is_same<DstT, uint>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned,
enable_if<(std::is_same<DstT, uint>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_streaming,
enable_if<(std::is_same<DstT, uint>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256i>(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_unaligned,
enable_if<(std::is_same<DstT, int>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_aligned,
enable_if<(std::is_same<DstT, int>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_streaming,
enable_if<(std::is_same<DstT, int>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256i>(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_unaligned,
enable_if<(std::is_same<DstT, short>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_aligned,
enable_if<(std::is_same<DstT, short>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_streaming,
enable_if<(std::is_same<DstT, short>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256i>(mem);
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned,
enable_if<(std::is_same<DstT, ushort>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned,
enable_if<(std::is_same<DstT, ushort>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming,
enable_if<(std::is_same<DstT, ushort>::value &&
std::is_same<V, __m256i>::value)> = nullarg)
{
return AvxIntrinsics::stream_load<__m256i>(mem);
}
#endif
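// The load() overloads below are converting loads: memory of a narrower (or
// different) type is loaded at its natural width and then widened to the
// vector's entry type, e.g. 16 uchar values become a vector of 16 shorts via
// cvtepu8_epi16, and 8 ints become a __m256 of 8 floats.  Sketch of the
// uchar -> short case, matching the signature below:
//   __m256i v = load(static_cast<const uchar *>(mem), flags, LoadTag<__m256i, short>());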
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>)
{
return load32(mem, f);
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>)
{
return AVX::cvtepu8_epi16(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>)
{
return AVX::cvtepi8_epi16(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>)
{
return AVX::cvtepu8_epi16(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>)
{
return load32(mem, f);
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>)
{
return AVX::cvtepu16_epi32(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>)
{
return AVX::cvtepi16_epi32(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>)
{
return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>)
{
return AVX::cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>)
{
return AVX::cvtepu16_epi32(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>)
{
return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<float, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<uint, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<int, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const ushort *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<int, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const short *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<int, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const uchar *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<int, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const schar *mem, Flags f, LoadTag<__m256d, double>)
{
return AVX::convert<int, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>)
{
return AVX::concat(_mm256_cvtpd_ps(load32(&mem[0], f)),
_mm256_cvtpd_ps(load32(&mem[4], f)));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>)
{
const auto v = load32(mem, f);
return _mm256_blendv_ps(
_mm256_cvtepi32_ps(v),
_mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, AVX::set2power31_epu32())),
AVX::set2power31_ps()),
_mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256())));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>)
{
return AVX::convert<int, float>(load32(mem, f));
}
template <typename T, typename Flags,
typename = enable_if<!std::is_same<T, float>::value>>
Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>)
{
return _mm256_cvtepi32_ps(load<__m256i, int>(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>)
{
return AVX::convert<ushort, float>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>)
{
return AVX::convert<short, float>(load16(mem, f));
}
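// shifted<amount>(k) shifts the whole 256-bit register by `amount` bytes,
// positive amounts moving data towards element 0 and negative amounts towards
// the top, with zero fill.  There is no single cross-lane byte-shift
// instruction, so the specializations below combine 128-bit byte shifts and
// alignr with a lane permute to carry bytes across the 128-bit boundary.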
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k)
{
return AVX::avx_cast<T>(AVX::zeroExtend(
_mm_srli_si128(AVX::hi128(AVX::avx_cast<__m256i>(k)), amount - 16)));
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T>
shifted(T k)
{
return AVX::avx_cast<T>(
AVX::alignr<amount>(Mem::permute128<X1, Const0>(AVX::avx_cast<__m256i>(k)),
AVX::avx_cast<__m256i>(k)));
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k)
{
return AVX::avx_cast<T>(Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(
_mm_slli_si128(AVX::lo128(AVX::avx_cast<__m256i>(k)), -16 - amount))));
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T>
shifted(T k)
{
return AVX::avx_cast<T>(
AVX::alignr<16 + amount>(AVX::avx_cast<__m256i>(k),
Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(k))));
}
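// mask_cast<From, To, R>(k) reinterprets a mask with From entries as a mask
// register laid out for To entries of type R, packing or unpacking the
// per-entry all-ones patterns as needed.  The unspecialized template only
// permits the trivial From == To, __m256i -> __m256 case; every real
// conversion is one of the explicit specializations below, e.g.
//   mask_cast<8, 4, __m256>(k)  // widen the low four 32-bit entries to four 64-bit entries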
template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k)
{
static_assert(From == To, "Incorrect mask cast.");
static_assert(std::is_same<R, __m256>::value, "Incorrect mask cast.");
return AVX::avx_cast<__m256>(k);
}
template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k)
{
return AVX::avx_cast<__m128>(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)));
}
template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k)
{
const auto kk = _mm_castsi128_ps(k);
return AVX::concat(_mm_unpacklo_ps(kk, kk), _mm_unpackhi_ps(kk, kk));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k)
{
return AVX::avx_cast<__m256>(AVX::concat(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)),
_mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k)
{
return AVX::avx_cast<__m128>(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128()));
}
template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k)
{
return AVX::zeroExtend(AVX::avx_cast<__m128>(k));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k)
{
return AVX::zeroExtend(mask_cast<4, 8, __m128>(k));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k)
{
const auto lo = AVX::lo128(AVX::avx_cast<__m256>(k));
return AVX::concat(_mm_unpacklo_ps(lo, lo),
_mm_unpackhi_ps(lo, lo));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k)
{
return AVX::avx_cast<__m128>(AVX::lo128(k));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k)
{
const auto tmp = _mm_unpacklo_epi16(k, k);
return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(tmp, tmp),
_mm_unpackhi_epi32(tmp, tmp)));
}
template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k)
{
return AVX::avx_cast<__m128>(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 8, __m256>(__m128i k)
{
return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(k, k),
_mm_unpackhi_epi16(k, k)));
}
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k)
{
return AVX::zeroExtend(mask_cast<8, 8, __m128>(k));
}
#ifdef Vc_IMPL_AVX2
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k)
{
const auto flipped = Mem::permute4x64<X0, X2, X1, X3>(k);
return _mm256_castsi256_ps(AVX::unpacklo_epi16(flipped, flipped));
}
#endif
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k)
{
const auto tmp = _mm_unpacklo_epi16(AVX::lo128(k), AVX::lo128(k));
return _mm256_castsi256_ps(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)));
}
template<> Vc_INTRINSIC Vc_CONST __m256 allone<__m256 >() { return AVX::setallone_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>() { return AVX::setallone_si256(); }
template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>() { return AVX::setallone_pd(); }
template<> Vc_INTRINSIC Vc_CONST __m256 zero<__m256 >() { return _mm256_setzero_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>() { return _mm256_setzero_si256(); }
template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>() { return _mm256_setzero_pd(); }
Vc_INTRINSIC Vc_CONST __m256 one( float) { return AVX::setone_ps (); }
Vc_INTRINSIC Vc_CONST __m256d one(double) { return AVX::setone_pd (); }
Vc_INTRINSIC Vc_CONST __m256i one( int) { return AVX::setone_epi32(); }
Vc_INTRINSIC Vc_CONST __m256i one( uint) { return AVX::setone_epu32(); }
Vc_INTRINSIC Vc_CONST __m256i one( short) { return AVX::setone_epi16(); }
Vc_INTRINSIC Vc_CONST __m256i one(ushort) { return AVX::setone_epu16(); }
Vc_INTRINSIC Vc_CONST __m256i one( schar) { return AVX::setone_epi8 (); }
Vc_INTRINSIC Vc_CONST __m256i one( uchar) { return AVX::setone_epu8 (); }
Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant<std::size_t, 4>)
{
return _mm256_xor_ps(v, AVX::setsignmask_ps());
}
Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant<std::size_t, 8>)
{
return _mm256_xor_pd(v, AVX::setsignmask_pd());
}
Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 4>)
{
return AVX::sign_epi32(v, Detail::allone<__m256i>());
}
Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 2>)
{
return AVX::sign_epi16(v, Detail::allone<__m256i>());
}
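// negate() dispatches on the entry size: the floating-point overloads flip the
// sign bit with an XOR against the sign mask, the integer overloads use
// sign_epi16/sign_epi32 against an all-ones vector, i.e. a multiply by -1.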
Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b) { return _mm256_xor_ps(a, b); }
Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b) { return _mm256_xor_pd(a, b); }
Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
return _mm256_xor_si256(a, b);
#else
return _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
Vc_INTRINSIC __m256 or_(__m256 a, __m256 b) { return _mm256_or_ps(a, b); }
Vc_INTRINSIC __m256d or_(__m256d a, __m256d b) { return _mm256_or_pd(a, b); }
Vc_INTRINSIC __m256i or_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
return _mm256_or_si256(a, b);
#else
return _mm256_castps_si256(
_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
Vc_INTRINSIC __m256 and_(__m256 a, __m256 b) { return _mm256_and_ps(a, b); }
Vc_INTRINSIC __m256d and_(__m256d a, __m256d b) { return _mm256_and_pd(a, b); }
Vc_INTRINSIC __m256i and_(__m256i a, __m256i b) {
#ifdef Vc_IMPL_AVX2
return _mm256_and_si256(a, b);
#else
return _mm256_castps_si256(
_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b) { return _mm256_andnot_ps(a, b); }
Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b) { return _mm256_andnot_pd(a, b); }
Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
return _mm256_andnot_si256(a, b);
#else
return _mm256_castps_si256(
_mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
Vc_INTRINSIC __m256 not_(__m256 a) { return andnot_(a, allone<__m256 >()); }
Vc_INTRINSIC __m256d not_(__m256d a) { return andnot_(a, allone<__m256d>()); }
Vc_INTRINSIC __m256i not_(__m256i a) { return andnot_(a, allone<__m256i>()); }
Vc_INTRINSIC __m256 blend(__m256 a, __m256 b, __m256 c) { return _mm256_blendv_ps(a, b, c); }
Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c) { return _mm256_blendv_pd(a, b, c); }
Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c) { return AVX::blendv_epi8(a, b, c); }
Vc_INTRINSIC __m256 abs(__m256 a, float) { return and_(a, AVX::setabsmask_ps()); }
Vc_INTRINSIC __m256d abs(__m256d a, double) { return and_(a, AVX::setabsmask_pd()); }
Vc_INTRINSIC __m256i abs(__m256i a, int) { return AVX::abs_epi32(a); }
Vc_INTRINSIC __m256i abs(__m256i a, uint) { return a; }
Vc_INTRINSIC __m256i abs(__m256i a, short) { return AVX::abs_epi16(a); }
Vc_INTRINSIC __m256i abs(__m256i a, ushort) { return a; }
Vc_INTRINSIC __m256i abs(__m256i a, schar) { return AVX::abs_epi8 (a); }
Vc_INTRINSIC __m256i abs(__m256i a, uchar) { return a; }
Vc_INTRINSIC __m256 add(__m256 a, __m256 b, float) { return _mm256_add_ps(a, b); }
Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double) { return _mm256_add_pd(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int) { return AVX::add_epi32(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint) { return AVX::add_epi32(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short) { return AVX::add_epi16(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); }
Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float) { return _mm256_sub_ps(a, b); }
Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double) { return _mm256_sub_pd(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int) { return AVX::sub_epi32(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uint) { return AVX::sub_epi32(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short) { return AVX::sub_epi16(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); }
Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float) { return _mm256_mul_ps(a, b); }
Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double) { return _mm256_mul_pd(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, int) { return AVX::mullo_epi32(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, uint) { return AVX::mullo_epi32(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, short) { return AVX::mullo_epi16(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort) { return AVX::mullo_epi16(a, b); }
Vc_INTRINSIC __m256 div(__m256 a, __m256 b, float) { return _mm256_div_ps(a, b); }
Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double) { return _mm256_div_pd(a, b); }
Vc_INTRINSIC __m256i div(__m256i a, __m256i b, int) {
using namespace AVX;
const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
return concat(_mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
_mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)));
}
Vc_INTRINSIC __m256i div(__m256i a, __m256i b, uint) {
using namespace AVX;
const __m256i aa = add_epi32(a, set1_epi32(-2147483648));
const __m256i bb = add_epi32(b, set1_epi32(-2147483648));
const __m256d loa = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(aa)), set1_pd(2147483648.));
const __m256d hia = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(aa)), set1_pd(2147483648.));
const __m256d lob = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(bb)), set1_pd(2147483648.));
const __m256d hib = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(bb)), set1_pd(2147483648.));
return avx_cast<__m256i>(_mm256_blendv_ps(
avx_cast<__m256>(concat(_mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
_mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)))),
avx_cast<__m256>(a),
avx_cast<__m256>(cmpeq_epi32(b, setone_epi32()))));
}
Vc_INTRINSIC __m256i div(__m256i a, __m256i b, short) {
using namespace AVX;
const __m256 lo =
_mm256_div_ps(convert<short, float>(lo128(a)), convert<short, float>(lo128(b)));
const __m256 hi =
_mm256_div_ps(convert<short, float>(hi128(a)), convert<short, float>(hi128(b)));
return concat(convert<float, short>(lo), convert<float, short>(hi));
}
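// AVX has no integer division instruction, so the div() overloads above go
// through floating point: int entries are converted to double (exactly),
// divided and truncated back, and short entries go through float.  The
// unsigned int variant first biases both operands by 2^31 so the signed
// int -> double conversion can be reused, and the final blend special-cases a
// divisor of 1 (returning the dividend unchanged), presumably because a / 1
// can exceed the range of the truncating double -> int32 conversion.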
template <typename T> Vc_INTRINSIC T add(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
return {add(add(AVX::lo128(a), AVX::hi128(a), T()), T())};
}
template <typename T> Vc_INTRINSIC T mul(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
return {mul(mul(AVX::lo128(a), AVX::hi128(a), T()), T())};
}
template <typename T> Vc_INTRINSIC T min(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
return {min(min(AVX::lo128(a), AVX::hi128(a), T()), T())};
}
template <typename T> Vc_INTRINSIC T max(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
return {max(max(AVX::lo128(a), AVX::hi128(a), T()), T())};
}
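// The templated add/mul/min/max overloads above implement horizontal
// reductions of a 32-byte vector: the two 128-bit halves are first combined
// element-wise and the 16-byte result is then reduced, both steps relying on
// the corresponding 128-bit overloads that are expected to be provided by the
// SSE detail code included earlier.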
Vc_INTRINSIC __m256 cmpeq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpeq_ps(a, b); }
Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, int) { return AvxIntrinsics::cmpeq_epi32(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uint) { return AvxIntrinsics::cmpeq_epi32(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, short) { return AvxIntrinsics::cmpeq_epi16(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); }
Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpneq_ps(a, b); }
Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpneq_pd(a, b); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, int) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uint) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, short) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, schar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uchar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
Vc_INTRINSIC __m256 cmpgt(__m256 a, __m256 b, float) { return AVX::cmpgt_ps(a, b); }
Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double) { return AVX::cmpgt_pd(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (a, b); }
Vc_INTRINSIC __m256 cmpge(__m256 a, __m256 b, float) { return AVX::cmpge_ps(a, b); }
Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double) { return AVX::cmpge_pd(a, b); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (b, a)); }
Vc_INTRINSIC __m256 cmple(__m256 a, __m256 b, float) { return AVX::cmple_ps(a, b); }
Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double) { return AVX::cmple_pd(a, b); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (a, b)); }
Vc_INTRINSIC __m256 cmplt(__m256 a, __m256 b, float) { return AVX::cmplt_ps(a, b); }
Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double) { return AVX::cmplt_pd(a, b); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (b, a); }
Vc_INTRINSIC __m256 fma(__m256 a, __m256 b, __m256 c, float) {
#ifdef Vc_IMPL_FMA4
return _mm256_macc_ps(a, b, c);
#elif defined Vc_IMPL_FMA
return _mm256_fmadd_ps(a, b, c);
#else
using namespace AVX;
__m256d v1_0 = _mm256_cvtps_pd(lo128(a));
__m256d v1_1 = _mm256_cvtps_pd(hi128(a));
__m256d v2_0 = _mm256_cvtps_pd(lo128(b));
__m256d v2_1 = _mm256_cvtps_pd(hi128(b));
__m256d v3_0 = _mm256_cvtps_pd(lo128(c));
__m256d v3_1 = _mm256_cvtps_pd(hi128(c));
return concat(_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
}
Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double)
{
#ifdef Vc_IMPL_FMA4
return _mm256_macc_pd(a, b, c);
#elif defined Vc_IMPL_FMA
return _mm256_fmadd_pd(a, b, c);
#else
using namespace AVX;
__m256d h1 = and_(a, _mm256_broadcast_sd(reinterpret_cast<const double *>(
&c_general::highMaskDouble)));
__m256d h2 = and_(b, _mm256_broadcast_sd(reinterpret_cast<const double *>(
&c_general::highMaskDouble)));
const __m256d l1 = _mm256_sub_pd(a, h1);
const __m256d l2 = _mm256_sub_pd(b, h2);
const __m256d ll = mul(l1, l2, double());
const __m256d lh = add(mul(l1, h2, double()), mul(h1, l2, double()), double());
const __m256d hh = mul(h1, h2, double());
const __m256d lh_lt_v3 = cmplt(abs(lh, double()), abs(c, double()), double());
const __m256d x = _mm256_blendv_pd(c, lh, lh_lt_v3);
const __m256d y = _mm256_blendv_pd(lh, c, lh_lt_v3);
return add(add(ll, x, double()), add(y, hh, double()), double());
#endif
}
template <typename T> Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T)
{
return add(mul(a, b, T()), c, T());
}
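// Detail::fma() dispatches on the available instruction set: _mm256_macc_*
// when FMA4 is enabled, _mm256_fmadd_* with FMA3, and otherwise the same
// splitting/widening emulations as in the VectorHelper fallbacks above.  The
// integer overload is a plain multiply-add, since integer arithmetic has no
// intermediate rounding that fusing could avoid.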
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, int) { return AVX::srai_epi32<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, uint) { return AVX::srli_epi32<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, short) { return AVX::srai_epi16<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort) { return AVX::srli_epi16<shift>(a); }
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, int) { return AVX::sra_epi32(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uint) { return AVX::srl_epi32(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, short) { return AVX::sra_epi16(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort) { return AVX::srl_epi16(a, _mm_cvtsi32_si128(shift)); }
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, int) { return AVX::slli_epi32<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, uint) { return AVX::slli_epi32<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, short) { return AVX::slli_epi16<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort) { return AVX::slli_epi16<shift>(a); }
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, int) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uint) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, short) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m256 x) { return x; }
Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x) { return x; }
Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x) { return x; }
Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m128 x) { return AVX::zeroExtend(x); }
Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x) { return AVX::zeroExtend(x); }
Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x) { return AVX::zeroExtend(x); }
Vc_INTRINSIC __m256 avx_broadcast( float x) { return _mm256_set1_ps(x); }
Vc_INTRINSIC __m256d avx_broadcast(double x) { return _mm256_set1_pd(x); }
Vc_INTRINSIC __m256i avx_broadcast( int x) { return _mm256_set1_epi32(x); }
Vc_INTRINSIC __m256i avx_broadcast( uint x) { return _mm256_set1_epi32(x); }
Vc_INTRINSIC __m256i avx_broadcast( short x) { return _mm256_set1_epi16(x); }
Vc_INTRINSIC __m256i avx_broadcast(ushort x) { return _mm256_set1_epi16(x); }
Vc_INTRINSIC __m256i avx_broadcast( char x) { return _mm256_set1_epi8(x); }
Vc_INTRINSIC __m256i avx_broadcast( schar x) { return _mm256_set1_epi8(x); }
Vc_INTRINSIC __m256i avx_broadcast( uchar x) { return _mm256_set1_epi8(x); }
template <Vc::Implementation Impl, typename T,
typename = enable_if<(Impl >= AVXImpl && Impl <= AVX2Impl)>>
Vc_CONST_L AVX2::Vector<T> Vc_VDECL sorted(AVX2::Vector<T> x) Vc_CONST_R;
template <typename T> Vc_INTRINSIC Vc_CONST AVX2::Vector<T> sorted(AVX2::Vector<T> x)
{
return sorted<CurrentImplementation::current()>(x);
}
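// The runtime shifted(v, amount) overloads below map a run-time element count
// onto the compile-time byte shifts above; the underlying shift/alignr
// instructions need immediate operands, so every supported amount gets its own
// case, and amounts at or beyond the vector width fall through to a zero
// vector.  Intended semantics as a sketch:
//   shifted<int>(v, 1)  // entry i of the result is entry i + 1 of v, the last entry becomes 0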
template <typename T, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount)
{
using namespace AVX;
constexpr int S = sizeof(T);
switch (amount) {
case 0: return v;
case 1: return shifted<sanitize<V>( 1 * S)>(v);
case 2: return shifted<sanitize<V>( 2 * S)>(v);
case 3: return shifted<sanitize<V>( 3 * S)>(v);
case -1: return shifted<sanitize<V>(-1 * S)>(v);
case -2: return shifted<sanitize<V>(-2 * S)>(v);
case -3: return shifted<sanitize<V>(-3 * S)>(v);
}
if (sizeof(T) <= 4) {
switch (amount) {
case 4: return shifted<sanitize<V>( 4 * S)>(v);
case 5: return shifted<sanitize<V>( 5 * S)>(v);
case 6: return shifted<sanitize<V>( 6 * S)>(v);
case 7: return shifted<sanitize<V>( 7 * S)>(v);
case -4: return shifted<sanitize<V>(-4 * S)>(v);
case -5: return shifted<sanitize<V>(-5 * S)>(v);
case -6: return shifted<sanitize<V>(-6 * S)>(v);
case -7: return shifted<sanitize<V>(-7 * S)>(v);
}
if (sizeof(T) <= 2) {
switch (amount) {
case 8: return shifted<sanitize<V>( 8 * S)>(v);
case 9: return shifted<sanitize<V>( 9 * S)>(v);
case 10: return shifted<sanitize<V>( 10 * S)>(v);
case 11: return shifted<sanitize<V>( 11 * S)>(v);
case 12: return shifted<sanitize<V>( 12 * S)>(v);
case 13: return shifted<sanitize<V>( 13 * S)>(v);
case 14: return shifted<sanitize<V>( 14 * S)>(v);
case 15: return shifted<sanitize<V>( 15 * S)>(v);
case -8: return shifted<sanitize<V>(- 8 * S)>(v);
case -9: return shifted<sanitize<V>(- 9 * S)>(v);
case -10: return shifted<sanitize<V>(-10 * S)>(v);
case -11: return shifted<sanitize<V>(-11 * S)>(v);
case -12: return shifted<sanitize<V>(-12 * S)>(v);
case -13: return shifted<sanitize<V>(-13 * S)>(v);
case -14: return shifted<sanitize<V>(-14 * S)>(v);
case -15: return shifted<sanitize<V>(-15 * S)>(v);
}
if (sizeof(T) == 1) {
switch (amount) {
case 16: return shifted<sanitize<V>( 16)>(v);
case 17: return shifted<sanitize<V>( 17)>(v);
case 18: return shifted<sanitize<V>( 18)>(v);
case 19: return shifted<sanitize<V>( 19)>(v);
case 20: return shifted<sanitize<V>( 20)>(v);
case 21: return shifted<sanitize<V>( 21)>(v);
case 22: return shifted<sanitize<V>( 22)>(v);
case 23: return shifted<sanitize<V>( 23)>(v);
case 24: return shifted<sanitize<V>( 24)>(v);
case 25: return shifted<sanitize<V>( 25)>(v);
case 26: return shifted<sanitize<V>( 26)>(v);
case 27: return shifted<sanitize<V>( 27)>(v);
case 28: return shifted<sanitize<V>( 28)>(v);
case 29: return shifted<sanitize<V>( 29)>(v);
case 30: return shifted<sanitize<V>( 30)>(v);
case 31: return shifted<sanitize<V>( 31)>(v);
case -16: return shifted<sanitize<V>(-16)>(v);
case -17: return shifted<sanitize<V>(-17)>(v);
case -18: return shifted<sanitize<V>(-18)>(v);
case -19: return shifted<sanitize<V>(-19)>(v);
case -20: return shifted<sanitize<V>(-20)>(v);
case -21: return shifted<sanitize<V>(-21)>(v);
case -22: return shifted<sanitize<V>(-22)>(v);
case -23: return shifted<sanitize<V>(-23)>(v);
case -24: return shifted<sanitize<V>(-24)>(v);
case -25: return shifted<sanitize<V>(-25)>(v);
case -26: return shifted<sanitize<V>(-26)>(v);
case -27: return shifted<sanitize<V>(-27)>(v);
case -28: return shifted<sanitize<V>(-28)>(v);
case -29: return shifted<sanitize<V>(-29)>(v);
case -30: return shifted<sanitize<V>(-30)>(v);
case -31: return shifted<sanitize<V>(-31)>(v);
}
}
}
}
return avx_cast<V>(_mm256_setzero_ps());
}
template <typename T, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount)
{
using namespace AVX;
switch (amount) {
case 0: return v;
case 1: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
case 2: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
case 3: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
case -1: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
case -2: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
case -3: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
}
if (sizeof(T) <= 2) {
switch (amount) {
case 4: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
case 5: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
case 6: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
case 7: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
case -4: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
case -5: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
case -6: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
case -7: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
}
}
return avx_cast<V>(_mm_setzero_ps());
}
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v,
int amount)
{
using namespace AVX;
const __m128i vLo = avx_cast<__m128i>(lo128(v));
const __m128i vHi = avx_cast<__m128i>(hi128(v));
switch (static_cast<unsigned int>(amount) % N) {
case 0:
return v;
case 1:
return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<sizeof(T)>(vLo, vHi)));
case 2:
return Mem::permute128<X1, X0>(v);
case 3:
return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<sizeof(T)>(vHi, vLo)));
}
return avx_cast<V>(_mm256_setzero_ps());
}
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v,
int amount)
{
using namespace AVX;
const __m128i vLo = avx_cast<__m128i>(lo128(v));
const __m128i vHi = avx_cast<__m128i>(hi128(v));
switch (static_cast<unsigned int>(amount) % N) {
case 0:
return v;
case 1:
return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
case 2:
return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
case 3:
return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
case 4:
return Mem::permute128<X1, X0>(v);
case 5:
return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
case 6:
return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
case 7:
return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
}
return avx_cast<V>(_mm256_setzero_ps());
}
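// rotated<T, N>(v, amount) rotates the N entries of a 32-byte vector by
// `amount` positions (taken modulo N).  Rotation across the 128-bit lanes is
// built from SSE::alignr_epi8 on the two halves, with a plain lane swap
// (Mem::permute128<X1, X0>) covering the half rotation; the AVX2-only N == 16
// variant below additionally uses permute4x64 for the quarter rotations.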
#ifdef Vc_IMPL_AVX2
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated(
V v, int amount)
{
using namespace AVX;
const __m128i vLo = avx_cast<__m128i>(lo128(v));
const __m128i vHi = avx_cast<__m128i>(hi128(v));
switch (static_cast<unsigned int>(amount) % N) {
case 0:
return v;
case 1:
return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
case 2:
return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
case 3:
return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
case 4:
return Mem::permute4x64<X1, X2, X3, X0>(v);
case 5:
return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi)));
case 6:
return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi)));
case 7:
return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo),
SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi)));
case 8:
return Mem::permute128<X1, X0>(v);
case 9:
return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
case 10:
return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
case 11:
return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
case 12:
return Mem::permute4x64<X3, X0, X1, X2>(v);
case 13:
return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo)));
case 14:
return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo)));
case 15:
return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi),
SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo)));
}
return avx_cast<V>(_mm256_setzero_ps());
}
#endif
Vc_INTRINSIC Vc_CONST int testc(__m128 a, __m128 b) { return _mm_testc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
Vc_INTRINSIC Vc_CONST int testc(__m256 a, __m256 b) { return _mm256_testc_ps(a, b); }
Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b) { return _mm256_testc_pd(a, b); }
Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b) { return _mm256_testc_si256(a, b); }
Vc_INTRINSIC Vc_CONST int testz(__m128 a, __m128 b) { return _mm_testz_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
Vc_INTRINSIC Vc_CONST int testz(__m256 a, __m256 b) { return _mm256_testz_ps(a, b); }
Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b) { return _mm256_testz_pd(a, b); }
Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b) { return _mm256_testz_si256(a, b); }
Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b) { return _mm_testnzc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
Vc_INTRINSIC Vc_CONST int testnzc(__m256 a, __m256 b) { return _mm256_testnzc_ps(a, b); }
Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b) { return _mm256_testnzc_pd(a, b); }
Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b) { return _mm256_testnzc_si256(a, b); }
Vc_INTRINSIC Vc_CONST int movemask(__m256i a) { return AVX::movemask_epi8(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m128i a) { return _mm_movemask_epi8(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m256d a) { return _mm256_movemask_pd(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m128d a) { return _mm_movemask_pd(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m256 a) { return _mm256_movemask_ps(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m128 a) { return _mm_movemask_ps(a); }
template <size_t N, typename Flags>
Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags)
{
static_assert(
N == 4 || N == 8 || N == 16,
"mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries");
switch (N) {
case 4:
*aliasing_cast<int32_t>(mem) = (_mm_movemask_epi8(AVX::lo128(k)) |
(_mm_movemask_epi8(AVX::hi128(k)) << 16)) &
0x01010101;
break;
case 8: {
const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15);
const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128());
#ifdef __x86_64__
*aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k3);
#else
*aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(k3);
*aliasing_cast<int32_t>(mem + 4) = _mm_extract_epi32(k3, 1);
#endif
} break;
case 16: {
const auto bools = Detail::and_(_mm_set1_epi8(1),
_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
if (Flags::IsAligned) {
_mm_store_si128(reinterpret_cast<__m128i *>(mem), bools);
} else {
_mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools);
}
} break;
default:
Vc_UNREACHABLE();
}
}
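// mask_store<N>() converts a 256-bit mask register into N bytes of 0/1 bool
// values in memory: the per-entry all-ones patterns are packed down to one
// byte each (movemask bits for N == 4, packs plus shift for N == 8, packs plus
// an AND with 1 for N == 16) before being stored.  The mask_load<R, N>()
// overloads below perform the inverse expansion from bool bytes back to a full
// mask register.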
template <typename R, size_t N, typename Flags>
Vc_INTRINSIC R mask_load(const bool *mem, Flags,
enable_if<std::is_same<R, __m128>::value> = nullarg)
{
static_assert(N == 4 || N == 8,
"mask_load<__m128>(const bool *) is only implemented for 4, 8 entries");
switch (N) {
case 4: {
__m128i k = _mm_cvtsi32_si128(*aliasing_cast<int32_t>(mem));
k = _mm_unpacklo_epi8(k, k);
k = _mm_unpacklo_epi16(k, k);
k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
return AVX::avx_cast<__m128>(k);
}
case 8: {
#ifdef __x86_64__
__m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
#else
__m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
#endif
return AVX::avx_cast<__m128>(
_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
}
default:
Vc_UNREACHABLE();
}
}
template <typename R, size_t N, typename Flags>
Vc_INTRINSIC R mask_load(const bool *mem, Flags,
enable_if<std::is_same<R, __m256>::value> = nullarg)
{
static_assert(
N == 4 || N == 8 || N == 16,
"mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries");
switch (N) {
case 4: {
__m128i k = AVX::avx_cast<__m128i>(_mm_and_ps(
_mm_set1_ps(*aliasing_cast<float>(mem)),
AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000))));
k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
return AVX::avx_cast<__m256>(
AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k)));
}
case 8: {
#ifdef __x86_64__
__m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
#else
__m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
#endif
k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
return AVX::avx_cast<__m256>(
AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k)));
}
case 16: {
const auto k128 = _mm_cmpgt_epi8(
Flags::IsAligned ? _mm_load_si128(reinterpret_cast<const __m128i *>(mem))
: _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem)),
_mm_setzero_si128());
return AVX::avx_cast<__m256>(
AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128)));
}
default:
Vc_UNREACHABLE();
return R();
}
}
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R;
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k)
{
return movemask(AVX::avx_cast<__m256d>(k));
}
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k)
{
return movemask(AVX::avx_cast<__m256>(k));
}
#ifdef Vc_IMPL_BMI2
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
{
return _pext_u32(movemask(k), 0x55555555u);
}
#endif
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k)
{
return movemask(k);
}
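// InterleaveImpl<V, 16, 32> implements (de)interleaved memory access for
// 32-byte vectors with 16 entries of 16 bits: interleave() scatters the
// entries of several vectors into an array-of-structures layout and
// deinterleave() gathers them back.  For two vectors the effect is
//   data[i[k] + 0] = v0[k];  data[i[k] + 1] = v1[k];  // for k = 0..15
// where i is the index object; the Common::SuccessiveEntries overloads handle
// the contiguous-index case with full-vector stores instead of per-entry
// writes.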
template<typename V> struct InterleaveImpl<V, 16, 32> {
template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0,
const typename V::AsArg v1)
{
const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
using namespace AVX;
*aliasing_cast<uint32_t>(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0));
*aliasing_cast<uint32_t>(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1);
*aliasing_cast<uint32_t>(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2);
*aliasing_cast<uint32_t>(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3);
*aliasing_cast<uint32_t>(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1));
*aliasing_cast<uint32_t>(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1);
*aliasing_cast<uint32_t>(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2);
*aliasing_cast<uint32_t>(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3);
*aliasing_cast<uint32_t>(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0));
*aliasing_cast<uint32_t>(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1);
*aliasing_cast<uint32_t>(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2);
*aliasing_cast<uint32_t>(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3);
*aliasing_cast<uint32_t>(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1));
*aliasing_cast<uint32_t>(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1);
*aliasing_cast<uint32_t>(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2);
*aliasing_cast<uint32_t>(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3);
}
static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
V(Mem::shuffle128<X0, Y0>(tmp0, tmp1)).store(&data[i[0]], Vc::Unaligned);
V(Mem::shuffle128<X1, Y1>(tmp0, tmp1)).store(&data[i[8]], Vc::Unaligned);
}
template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
{
interleave(data, i, v0, v1);
v2.scatter(data + 2, i);
}
template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
using namespace AVX;
auto &&store = [&](__m256i x, int offset) {
_mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x));
_mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x));
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x));
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x)));
};
store(tmp4, 0);
store(tmp5, 2);
store(tmp6, 4);
store(tmp7, 6);
}
static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
V(Mem::shuffle128<X0, Y0>(tmp4, tmp5)).store(&data[i[0]], ::Vc::Unaligned);
V(Mem::shuffle128<X0, Y0>(tmp6, tmp7)).store(&data[i[4]], ::Vc::Unaligned);
V(Mem::shuffle128<X1, Y1>(tmp4, tmp5)).store(&data[i[8]], ::Vc::Unaligned);
V(Mem::shuffle128<X1, Y1>(tmp6, tmp7)).store(&data[i[12]], ::Vc::Unaligned);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4)
{
interleave(data, i, v0, v1, v2, v3);
v4.scatter(data + 4, i);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6, const typename V::AsArg v7)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6, v7);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1)
{
const __m256i tmp4 =
_mm256_setr_epi32(
*aliasing_cast<int>(&data[i[0]]), *aliasing_cast<int>(&data[i[1]]),
*aliasing_cast<int>(&data[i[2]]), *aliasing_cast<int>(&data[i[3]]),
*aliasing_cast<int>(&data[i[8]]), *aliasing_cast<int>(&data[i[9]]),
*aliasing_cast<int>(&data[i[10]]), *aliasing_cast<int>(&data[i[11]]));
const __m256i tmp5 =
_mm256_setr_epi32(
*aliasing_cast<int>(&data[i[4]]), *aliasing_cast<int>(&data[i[5]]),
*aliasing_cast<int>(&data[i[6]]), *aliasing_cast<int>(&data[i[7]]),
*aliasing_cast<int>(&data[i[12]]), *aliasing_cast<int>(&data[i[13]]),
*aliasing_cast<int>(&data[i[14]]), *aliasing_cast<int>(&data[i[15]]));
const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5);
const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5);
const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3);
v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2)
{
using namespace AVX;
const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
*aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
*aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
*aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
*aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
*aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
*aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
*aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
*aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3)
{
using namespace AVX;
const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
*aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
*aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
*aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
*aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
*aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
*aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
*aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
*aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
v3.data() = AVX::unpackhi_epi16(tmp9, tmp11);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
{
using namespace AVX;
const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
{
using namespace AVX;
const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
{
using namespace AVX;
const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
{
using namespace AVX;
const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
v7.data() = AVX::unpackhi_epi16(tmp14, tmp15);
}
};
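// InterleaveImpl<V, 8, 32>: (de)interleaving for AVX vectors with eight 32-bit
// entries (float_v, int_v, uint_v). The generic interleave() overloads transpose
// the inputs with unpacklo/unpackhi and write one small group per index, so e.g.
// interleave(data, i, v0, v1) stores the pair {v0[k], v1[k]} at data[i[k]] for
// every lane k; the SuccessiveEntries<N> overloads cover the contiguous
// array-of-structs layout with full-width stores.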
template<typename V> struct InterleaveImpl<V, 8, 32> {
static_assert(sizeof(typename V::value_type) == 4, "");
template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
using namespace AVX;
const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
_mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0));
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0));
_mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1));
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1));
_mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0));
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0));
_mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1));
_mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1));
}
static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
using namespace AVX;
const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
_mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(tmp0));
_mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(tmp1));
_mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(tmp0));
_mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(tmp1));
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2)
{
using namespace AVX;
#ifdef Vc_USE_MASKMOV_SCATTER
const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
const m256 tmp2 = _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
const m256 tmp3 = _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2);
const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2);
const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3);
const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3);
const m128i mask = _mm_set_epi32(0, -1, -1, -1);
_mm_maskstore_ps(aliasing_cast<float>(&data[i[0]]), mask, lo128(tmp4));
_mm_maskstore_ps(aliasing_cast<float>(&data[i[1]]), mask, lo128(tmp5));
_mm_maskstore_ps(aliasing_cast<float>(&data[i[2]]), mask, lo128(tmp6));
_mm_maskstore_ps(aliasing_cast<float>(&data[i[3]]), mask, lo128(tmp7));
_mm_maskstore_ps(aliasing_cast<float>(&data[i[4]]), mask, hi128(tmp4));
_mm_maskstore_ps(aliasing_cast<float>(&data[i[5]]), mask, hi128(tmp5));
_mm_maskstore_ps(aliasing_cast<float>(&data[i[6]]), mask, hi128(tmp6));
_mm_maskstore_ps(aliasing_cast<float>(&data[i[7]]), mask, hi128(tmp7));
#else
interleave(data, i, v0, v1);
v2.scatter(data + 2, i);
#endif
}
static inline void interleave(typename V::EntryType *const data,
const Common::SuccessiveEntries<3> &i,
const typename V::AsArg v0_,
const typename V::AsArg v1_,
const typename V::AsArg v2_)
{
__m256 v0 = AVX::avx_cast<__m256>(v0_.data());
__m256 v1 = AVX::avx_cast<__m256>(v1_.data());
__m256 v2 = AVX::avx_cast<__m256>(v2_.data());
v0 = _mm256_shuffle_ps(v0, v0, 0x6c);
v1 = _mm256_shuffle_ps(v1, v1, 0xb1);
v2 = _mm256_shuffle_ps(v2, v2, 0xc6);
__m256 w0 = Mem::blend<X0, X1, Y2, X3, Y4, X5, X6, Y7>(
Mem::blend<X0, Y1, X2, X3, X4, X5, Y6, X7>(v0, v1), v2);
__m256 w1 = Mem::blend<X0, Y1, X2, X3, X4, Y5, X6, X7>(
Mem::blend<Y0, X1, X2, Y3, Y4, X5, X6, Y7>(v0, v1), v2);
__m256 w2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(
Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(v0, v1), v2);
_mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
_mm256_permute2f128_ps(w0, w1, 0x20));
_mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8), w2);
_mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
_mm256_permute2f128_ps(w1, w0, 0x31));
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
using namespace AVX;
const __m256 tmp0 =
_mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
const __m256 tmp1 =
_mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
const __m256 tmp2 =
_mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
const __m256 tmp3 =
_mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
_mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(_04));
_mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), lo128(_15));
_mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(_26));
_mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), lo128(_37));
_mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(_04));
_mm_storeu_ps(aliasing_cast<float>(&data[i[5]]), hi128(_15));
_mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(_26));
_mm_storeu_ps(aliasing_cast<float>(&data[i[7]]), hi128(_37));
}
static inline void interleave(typename V::EntryType *const data,
const Common::SuccessiveEntries<4> &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
using namespace AVX;
const __m256 tmp0 =
_mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
const __m256 tmp1 =
_mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
const __m256 tmp2 =
_mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
const __m256 tmp3 =
_mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
_mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
_mm256_permute2f128_ps(_04, _15, 0x20));
_mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8),
_mm256_permute2f128_ps(_26, _37, 0x20));
_mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
_mm256_permute2f128_ps(_04, _15, 0x31));
_mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 24),
_mm256_permute2f128_ps(_26, _37, 0x31));
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4)
{
interleave(data, i, v0, v1, v2, v3);
v4.scatter(data + 4, i);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6, const typename V::AsArg v7)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6, v7);
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1)
{
using namespace AVX;
const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[0]]));
const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]]));
const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]]));
const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]]));
const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&data[i[1]]));
const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&data[i[3]]));
const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&data[i[5]]));
const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&data[i[7]]));
const m256 tmp2 = concat(il01, il45);
const m256 tmp3 = concat(il23, il67);
const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
}
static inline void deinterleave(typename V::EntryType const *const data,
const Common::SuccessiveEntries<2> &i, V &v0, V &v1)
{
using namespace AVX;
const m256 il0123 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
const m256 il4567 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[4]]));
const m256 tmp2 = Mem::shuffle128<X0, Y0>(il0123, il4567);
const m256 tmp3 = Mem::shuffle128<X1, Y1>(il0123, il4567);
const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1, V &v2)
{
using namespace AVX;
const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
const m256 il04 = concat(il0, il4);
const m256 il15 = concat(il1, il5);
const m256 il26 = concat(il2, il6);
const m256 il37 = concat(il3, il7);
const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
}
static inline void deinterleave(typename V::EntryType const *const data,
const Common::SuccessiveEntries<3> &i, V &v0, V &v1,
V &v2)
{
__m256 in0 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 0));
__m256 in1 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 8));
__m256 in2 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 16));
const __m256 aaabffgg = _mm256_permute2f128_ps(in0, in2, 0x20);
const __m256 cdddeeef = in1;
const __m256 bbccghhh = _mm256_permute2f128_ps(in0, in2, 0x31);
const __m256 x0 = _mm256_blend_ps(
_mm256_blend_ps(aaabffgg, cdddeeef, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80),
bbccghhh, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0);
const __m256 x1 = _mm256_blend_ps(
_mm256_blend_ps(aaabffgg, cdddeeef, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0),
bbccghhh, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0);
const __m256 x2 = _mm256_blend_ps(
_mm256_blend_ps(aaabffgg, cdddeeef, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0),
bbccghhh, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80);
v0 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x0, x0, 0x6c));
v1 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x1, x1, 0xb1));
v2 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x2, x2, 0xc6));
}
template <typename I>
static inline void deinterleave(typename V::EntryType const *const data, const I &i,
V &v0, V &v1, V &v2, V &v3)
{
using namespace AVX;
const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
const m256 il04 = concat(il0, il4);
const m256 il15 = concat(il1, il5);
const m256 il26 = concat(il2, il6);
const m256 il37 = concat(il3, il7);
const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
}
static inline void deinterleave(typename V::EntryType const *const data,
const Common::SuccessiveEntries<4> &i, V &v0, V &v1,
V &v2, V &v3)
{
using namespace AVX;
const __m256 il01 = _mm256_loadu_ps(
aliasing_cast<float>(&data[i[0]]));
const __m256 il23 = _mm256_loadu_ps(
aliasing_cast<float>(&data[i[2]]));
const __m256 il45 = _mm256_loadu_ps(
aliasing_cast<float>(&data[i[4]]));
const __m256 il67 = _mm256_loadu_ps(
aliasing_cast<float>(&data[i[6]]));
const __m256 il04 = _mm256_permute2f128_ps(il01, il45, 0x20);
const __m256 il15 = _mm256_permute2f128_ps(il01, il45, 0x31);
const __m256 il26 = _mm256_permute2f128_ps(il23, il67, 0x20);
const __m256 il37 = _mm256_permute2f128_ps(il23, il67, 0x31);
const __m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
const __m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
const __m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
const __m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
{
v4.gather(data + 4, i);
deinterleave(data, i, v0, v1, v2, v3);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
{
deinterleave(data, i, v0, v1, v2, v3);
deinterleave(data + 4, i, v4, v5);
}
static inline void deinterleave(typename V::EntryType const *const data,
const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
{
using namespace AVX;
const m256 a = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
const m256 b = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 1 * V::Size]));
const m256 c = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 2 * V::Size]));
const m256 d = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 3 * V::Size]));
const m256 e = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 4 * V::Size]));
const m256 f = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 5 * V::Size]));
const __m256 tmp2 = Mem::shuffle128<X0, Y0>(a, d);
const __m256 tmp3 = Mem::shuffle128<X1, Y1>(b, e);
const __m256 tmp4 = Mem::shuffle128<X1, Y1>(a, d);
const __m256 tmp5 = Mem::shuffle128<X0, Y0>(c, f);
const __m256 tmp8 = Mem::shuffle128<X0, Y0>(b, e);
const __m256 tmp9 = Mem::shuffle128<X1, Y1>(c, f);
const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5);
const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3);
const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9);
const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5);
const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9);
v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp6, tmp7));
v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp6, tmp7));
v4.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp10, tmp11));
v5.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp10, tmp11));
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
{
deinterleave(data, i, v0, v1, v2, v3);
deinterleave(data + 4, i, v4, v5, v6);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
{
deinterleave(data, i, v0, v1, v2, v3);
deinterleave(data + 4, i, v4, v5, v6, v7);
}
};
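// InterleaveImpl<V, 4, 32>: (de)interleaving for AVX vectors with four 64-bit
// entries (double_v). Pairs are built with unpacklo/unpackhi_pd and written as
// 128-bit halves; the three-vector interleave uses _mm256_maskstore_pd when
// Vc_USE_MASKMOV_SCATTER is defined and otherwise interleaves two vectors and
// scatters the third.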
template<typename V> struct InterleaveImpl<V, 4, 32> {
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1)
{
using namespace AVX;
const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
_mm_storeu_pd(&data[i[0]], lo128(tmp0));
_mm_storeu_pd(&data[i[1]], lo128(tmp1));
_mm_storeu_pd(&data[i[2]], hi128(tmp0));
_mm_storeu_pd(&data[i[3]], hi128(tmp1));
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2)
{
using namespace AVX;
#ifdef Vc_USE_MASKMOV_SCATTER
const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data());
const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data());
#if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64))
const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1));
#else
const m256i mask = _mm256_set_epi64x(0, -1, -1, -1);
#endif
_mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128<X0, Y0>(tmp0, tmp2));
_mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128<X0, Y0>(tmp1, tmp3));
_mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128<X1, Y1>(tmp0, tmp2));
_mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128<X1, Y1>(tmp1, tmp3));
#else
interleave(data, i, v0, v1);
v2.scatter(data + 2, i);
#endif
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3)
{
using namespace AVX;
const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data());
const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data());
_mm_storeu_pd(&data[i[0] ], lo128(tmp0));
_mm_storeu_pd(&data[i[0]+2], lo128(tmp2));
_mm_storeu_pd(&data[i[1] ], lo128(tmp1));
_mm_storeu_pd(&data[i[1]+2], lo128(tmp3));
_mm_storeu_pd(&data[i[2] ], hi128(tmp0));
_mm_storeu_pd(&data[i[2]+2], hi128(tmp2));
_mm_storeu_pd(&data[i[3] ], hi128(tmp1));
_mm_storeu_pd(&data[i[3]+2], hi128(tmp3));
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4)
{
interleave(data, i, v0, v1, v2, v3);
v4.scatter(data + 4, i);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6);
}
template <typename I>
static inline void interleave(typename V::EntryType *const data, const I &i,
const typename V::AsArg v0, const typename V::AsArg v1,
const typename V::AsArg v2, const typename V::AsArg v3,
const typename V::AsArg v4, const typename V::AsArg v5,
const typename V::AsArg v6, const typename V::AsArg v7)
{
interleave(data, i, v0, v1, v2, v3);
interleave(data + 4, i, v4, v5, v6, v7);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1)
{
using namespace Vc::AVX;
const m256d ab02 = concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]]));
const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]]));
v0.data() = _mm256_unpacklo_pd(ab02, ab13);
v1.data() = _mm256_unpackhi_pd(ab02, ab13);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2)
{
v2.gather(data + 2, i);
deinterleave(data, i, v0, v1);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3)
{
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
{
v4.gather(data + 4, i);
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
{
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
deinterleave(data + 4, i, v4, v5);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
{
v6.gather(data + 6, i);
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
deinterleave(data + 4, i, v4, v5);
}
template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
{
deinterleave(data, i, v0, v1);
deinterleave(data + 2, i, v2, v3);
deinterleave(data + 4, i, v4, v5);
deinterleave(data + 6, i, v6, v7);
}
};
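// Illustrative use of the InterleaveImpl machinery above (a sketch, not part of
// this header): user code normally reaches it through Vc::InterleavedMemoryWrapper,
// roughly
//   struct Point { float x, y, z; };
//   Vc::InterleavedMemoryWrapper<Point, Vc::float_v> wrapper(points);
//   Vc::float_v x, y, z;
//   Vc::tie(x, y, z) = wrapper[indexes];  // deinterleaving gather
//   wrapper[indexes] = Vc::tie(x, y, z);  // interleaving scatter
// where points is a Point* and indexes is an index vector; see the Vc
// documentation of InterleavedMemoryWrapper for the exact interface.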
}
}
#endif
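// AVX implementation of Mask<T>: each of the Size lanes stores a MaskBool of
// sizeof(T) bytes (all bits set for true, all bits clear for false), so a mask
// occupies a SIMD register of the same width as the corresponding Vector<T> and
// converts cheaply to and from comparison results.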
namespace Vc_VERSIONED_NAMESPACE
{
template <typename T> class Mask<T, VectorAbi::Avx>
{
public:
using abi = VectorAbi::Avx;
typedef bool EntryType;
using value_type = EntryType;
using MaskBool = Common::MaskBool<sizeof(T)>;
using VectorEntryType = MaskBool;
using Vector = AVX2::Vector<T>;
using VectorTypeF = AVX::FloatVectorType<typename AVX::VectorTypeHelper<T>::Type>;
using VectorTypeD = AVX::DoubleVectorType<VectorTypeF>;
using VectorTypeI = AVX::IntegerVectorType<VectorTypeF>;
private:
typedef const VectorTypeF VArg;
typedef const VectorTypeD VdArg;
typedef const VectorTypeI ViArg;
public:
static constexpr size_t Size = sizeof(VectorTypeF) / sizeof(T);
static constexpr size_t MemoryAlignment = Size;
static constexpr std::size_t size() { return Size; }
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
private:
typedef Common::Storage<T, Size> Storage;
public:
using VectorType = typename Storage::VectorType;
using EntryReference = Vc::Detail::ElementReference<Mask>;
using reference = EntryReference;
#if defined Vc_MSVC && defined _WIN32
typedef const Mask &AsArg;
#else
typedef const Mask AsArg;
#endif
Vc_INTRINSIC Mask() {}
Vc_INTRINSIC Mask(VArg x) : d(AVX::avx_cast<VectorType>(x)) {}
Vc_INTRINSIC Mask(VdArg x) : d(AVX::avx_cast<VectorType>(x)) {}
Vc_INTRINSIC Mask(ViArg x) : d(AVX::avx_cast<VectorType>(x)) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : d(Detail::zero<VectorType>()) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : d(Detail::allone<VectorType>()) {}
Vc_INTRINSIC explicit Mask(bool b)
: d(b ? Detail::allone<VectorType>() : Detail::zero<VectorType>())
{
}
Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }
template <typename U>
Vc_INTRINSIC Mask(
U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
: d(AVX::avx_cast<VectorType>(
Detail::mask_cast<Traits::decay<U>::Size, Size, VectorTypeF>(
rhs.dataI())))
{
}
#if Vc_IS_VERSION_1
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"mask types") Vc_INTRINSIC
explicit Mask(U &&rhs,
Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
#endif
template<typename Flags = DefaultLoadTag> Vc_INTRINSIC explicit Mask(const bool *mem, Flags f = Flags()) { load(mem, f); }
template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void load(const bool *mem, Flags = Flags());
template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void store(bool *mem, Flags = Flags()) const;
Vc_INTRINSIC Mask &operator=(const Mask &) = default;
Vc_INTRINSIC_L Mask &operator=(const std::array<bool, Size> &values) Vc_INTRINSIC_R;
Vc_INTRINSIC_L operator std::array<bool, Size>() const Vc_INTRINSIC_R;
Vc_INTRINSIC Vc_PURE bool operator==(const Mask &rhs) const
{ return Detail::movemask(d.v()) == Detail::movemask(rhs.d.v()); }
Vc_INTRINSIC Vc_PURE bool operator!=(const Mask &rhs) const
{ return !operator==(rhs); }
Vc_INTRINSIC Mask operator!() const
{
#ifdef Vc_GCC
return ~dataI();
#else
return Detail::andnot_(dataF(), Detail::allone<VectorTypeF>());
#endif
}
Vc_INTRINSIC Mask &operator&=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::and_(data(), rhs.data())); return *this; }
Vc_INTRINSIC Mask &operator|=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::or_ (data(), rhs.data())); return *this; }
Vc_INTRINSIC Mask &operator^=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::xor_(data(), rhs.data())); return *this; }
Vc_INTRINSIC Vc_PURE Mask operator&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator|(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator^(const Mask &rhs) const { return Detail::xor_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator&&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator||(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
Vc_INTRINSIC_L bool isNotEmpty() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L bool isEmpty() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L bool isFull() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L bool isMix() const Vc_INTRINSIC_R;
Vc_INTRINSIC Vc_PURE int shiftMask() const { return Detail::movemask(dataI()); }
Vc_INTRINSIC Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }
Vc_INTRINSIC VectorType data () const { return d.v(); }
Vc_INTRINSIC VectorTypeF dataF() const { return AVX::avx_cast<VectorTypeF>(d.v()); }
Vc_INTRINSIC VectorTypeI dataI() const { return AVX::avx_cast<VectorTypeI>(d.v()); }
Vc_INTRINSIC VectorTypeD dataD() const { return AVX::avx_cast<VectorTypeD>(d.v()); }
private:
friend reference;
static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
{
return m.toInt() & (1 << i);
}
template <typename U>
static Vc_INTRINSIC void set(Mask &m, int i,
U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
{
m.d.set(i, MaskBool(std::forward<U>(v)));
}
public:
Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
{
return {*this, int(index)};
}
Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
{
return get(*this, index);
}
Vc_INTRINSIC Vc_PURE int count() const { return Detail::popcnt16(toInt()); }
Vc_INTRINSIC Vc_PURE int firstOne() const { return _bit_scan_forward(toInt()); }
template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;
private:
#ifdef Vc_COMPILE_BENCHMARKS
public:
#endif
Storage d;
};
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::MemoryAlignment;
}
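// Illustrative use of the mask type defined above (a sketch, not part of this
// header):
//   Vc::float_v x = ...;       // maps to Vector<float, VectorAbi::Avx> when built for AVX
//   Vc::float_m m = x < 0.f;   // one boolean per lane
//   x(m) = 0.f;                // write-masked assignment: zero the negative lanes
//   if (m.isEmpty()) { /* no lane was negative */ }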
namespace Vc_VERSIONED_NAMESPACE
{
template <typename T>
template <typename Flags>
Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::store(bool *mem, Flags f) const
{
Detail::mask_store<Size>(dataI(), mem, f);
}
template <typename T>
template <typename Flags>
Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::load(const bool *mem, Flags f)
{
d.v() = AVX::avx_cast<VectorType>(Detail::mask_load<VectorTypeF, Size>(mem, f));
}
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE bool AVX2::Mask<int16_t>::get(const AVX2::Mask<int16_t> &m,
int index) noexcept
{
return m.shiftMask() & (1 << 2 * index);
}
template <>
Vc_INTRINSIC Vc_PURE bool AVX2::Mask<uint16_t>::get(const AVX2::Mask<uint16_t> &m,
int index) noexcept
{
return m.shiftMask() & (1 << 2 * index);
}
#endif
template <> Vc_INTRINSIC Vc_PURE bool AVX2::double_m::operator==(const AVX2::double_m &rhs) const
{ return Detail::movemask(dataD()) == Detail::movemask(rhs.dataD()); }
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC Vc_PURE bool AVX2::short_m::operator==(const AVX2::short_m &rhs) const
{ return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
template <> Vc_INTRINSIC Vc_PURE bool AVX2::ushort_m::operator==(const AVX2::ushort_m &rhs) const
{ return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
#endif
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isFull() const {
if (sizeof(T) == 8) {
return 0 != Detail::testc(dataD(), Detail::allone<VectorTypeD>());
} else if (sizeof(T) == 4) {
return 0 != Detail::testc(dataF(), Detail::allone<VectorTypeF>());
} else {
return 0 != Detail::testc(dataI(), Detail::allone<VectorTypeI>());
}
}
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isNotEmpty() const {
if (sizeof(T) == 8) {
return 0 == Detail::testz(dataD(), dataD());
} else if (sizeof(T) == 4) {
return 0 == Detail::testz(dataF(), dataF());
} else {
return 0 == Detail::testz(dataI(), dataI());
}
}
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isEmpty() const {
if (sizeof(T) == 8) {
return 0 != Detail::testz(dataD(), dataD());
} else if (sizeof(T) == 4) {
return 0 != Detail::testz(dataF(), dataF());
} else {
return 0 != Detail::testz(dataI(), dataI());
}
}
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isMix() const {
if (sizeof(T) == 8) {
return 0 != Detail::testnzc(dataD(), Detail::allone<VectorTypeD>());
} else if (sizeof(T) == 4) {
return 0 != Detail::testnzc(dataF(), Detail::allone<VectorTypeF>());
} else {
return 0 != Detail::testnzc(dataI(), Detail::allone<VectorTypeI>());
}
}
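// generate_impl is selected via the tag Size + sizeof(Storage); the three
// overloads below cover the 256-bit storage cases with 4, 8 and 16 lanes
// (tags 36, 40 and 48) and fill each lane with all-ones or all-zeros depending
// on the generator's result for that index.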
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4 + 32>)
{
return _mm256_setr_epi64x(
gen(0) ? 0xffffffffffffffffull : 0, gen(1) ? 0xffffffffffffffffull : 0,
gen(2) ? 0xffffffffffffffffull : 0, gen(3) ? 0xffffffffffffffffull : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8 + 32>)
{
return _mm256_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0,
gen(4) ? 0xfffffffful : 0, gen(5) ? 0xfffffffful : 0,
gen(6) ? 0xfffffffful : 0, gen(7) ? 0xfffffffful : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 16 + 32>)
{
return _mm256_setr_epi16(gen(0) ? 0xfffful : 0, gen(1) ? 0xfffful : 0,
gen(2) ? 0xfffful : 0, gen(3) ? 0xfffful : 0,
gen(4) ? 0xfffful : 0, gen(5) ? 0xfffful : 0,
gen(6) ? 0xfffful : 0, gen(7) ? 0xfffful : 0,
gen(8) ? 0xfffful : 0, gen(9) ? 0xfffful : 0,
gen(10) ? 0xfffful : 0, gen(11) ? 0xfffful : 0,
gen(12) ? 0xfffful : 0, gen(13) ? 0xfffful : 0,
gen(14) ? 0xfffful : 0, gen(15) ? 0xfffful : 0);
}
template <typename T>
template <typename G>
Vc_INTRINSIC AVX2::Mask<T> Mask<T, VectorAbi::Avx>::generate(G &&gen)
{
return generate_impl<AVX2::Mask<T>>(std::forward<G>(gen),
std::integral_constant<int, Size + sizeof(Storage)>());
}
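// shifted(amount) moves the mask by whole elements: the switch converts the
// element count into a byte offset (amount * sizeof(VectorEntryType)) and
// forwards it to the compile-time Detail::shifted<Offset>() byte shift; offsets
// outside the register yield an empty mask.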
template <typename T> Vc_INTRINSIC Vc_PURE AVX2::Mask<T> Mask<T, VectorAbi::Avx>::shifted(int amount) const
{
switch (amount * int(sizeof(VectorEntryType))) {
case 0: return *this;
case 1: return Detail::shifted< 1>(dataI());
case 2: return Detail::shifted< 2>(dataI());
case 3: return Detail::shifted< 3>(dataI());
case 4: return Detail::shifted< 4>(dataI());
case 5: return Detail::shifted< 5>(dataI());
case 6: return Detail::shifted< 6>(dataI());
case 7: return Detail::shifted< 7>(dataI());
case 8: return Detail::shifted< 8>(dataI());
case 9: return Detail::shifted< 9>(dataI());
case 10: return Detail::shifted< 10>(dataI());
case 11: return Detail::shifted< 11>(dataI());
case 12: return Detail::shifted< 12>(dataI());
case 13: return Detail::shifted< 13>(dataI());
case 14: return Detail::shifted< 14>(dataI());
case 15: return Detail::shifted< 15>(dataI());
case 16: return Detail::shifted< 16>(dataI());
case 17: return Detail::shifted< 17>(dataI());
case 18: return Detail::shifted< 18>(dataI());
case 19: return Detail::shifted< 19>(dataI());
case 20: return Detail::shifted< 20>(dataI());
case 21: return Detail::shifted< 21>(dataI());
case 22: return Detail::shifted< 22>(dataI());
case 23: return Detail::shifted< 23>(dataI());
case 24: return Detail::shifted< 24>(dataI());
case 25: return Detail::shifted< 25>(dataI());
case 26: return Detail::shifted< 26>(dataI());
case 27: return Detail::shifted< 27>(dataI());
case 28: return Detail::shifted< 28>(dataI());
case 29: return Detail::shifted< 29>(dataI());
case 30: return Detail::shifted< 30>(dataI());
case 31: return Detail::shifted< 31>(dataI());
case -1: return Detail::shifted< -1>(dataI());
case -2: return Detail::shifted< -2>(dataI());
case -3: return Detail::shifted< -3>(dataI());
case -4: return Detail::shifted< -4>(dataI());
case -5: return Detail::shifted< -5>(dataI());
case -6: return Detail::shifted< -6>(dataI());
case -7: return Detail::shifted< -7>(dataI());
case -8: return Detail::shifted< -8>(dataI());
case -9: return Detail::shifted< -9>(dataI());
case -10: return Detail::shifted<-10>(dataI());
case -11: return Detail::shifted<-11>(dataI());
case -12: return Detail::shifted<-12>(dataI());
case -13: return Detail::shifted<-13>(dataI());
case -14: return Detail::shifted<-14>(dataI());
case -15: return Detail::shifted<-15>(dataI());
case -16: return Detail::shifted<-16>(dataI());
case -17: return Detail::shifted<-17>(dataI());
case -18: return Detail::shifted<-18>(dataI());
case -19: return Detail::shifted<-19>(dataI());
case -20: return Detail::shifted<-20>(dataI());
case -21: return Detail::shifted<-21>(dataI());
case -22: return Detail::shifted<-22>(dataI());
case -23: return Detail::shifted<-23>(dataI());
case -24: return Detail::shifted<-24>(dataI());
case -25: return Detail::shifted<-25>(dataI());
case -26: return Detail::shifted<-26>(dataI());
case -27: return Detail::shifted<-27>(dataI());
case -28: return Detail::shifted<-28>(dataI());
case -29: return Detail::shifted<-29>(dataI());
case -30: return Detail::shifted<-30>(dataI());
case -31: return Detail::shifted<-31>(dataI());
}
return Zero();
}
}
#endif
#include <algorithm>
#include <cmath>
#ifdef isfinite
#undef isfinite
#endif
#ifdef isnan
#undef isnan
#endif
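// The remainder of this section is the AVX implementation of Vector<T>: the
// class definition together with its load/store and gather interfaces. The
// #undefs above drop possible isfinite/isnan macros from the C math headers so
// they cannot interfere with the identifiers used below.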
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename T, typename Abi> struct VectorTraits
{
using mask_type = Vc::Mask<T, Abi>;
using vector_type = Vc::Vector<T, Abi>;
using writemasked_vector_type = Common::WriteMaskedVector<vector_type, mask_type>;
using intrinsic_type = typename AVX::VectorTypeHelper<T>::Type;
};
}
#define Vc_CURRENT_CLASS_NAME Vector
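// Vc_CURRENT_CLASS_NAME is consumed by the gather interface pasted into the
// class body further below; it names the constructors that interface declares.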
template <typename T> class Vector<T, VectorAbi::Avx>
{
public:
using abi = VectorAbi::Avx;
private:
using traits_type = Detail::VectorTraits<T, abi>;
static_assert(
std::is_arithmetic<T>::value,
"Vector<T> only accepts arithmetic builtin types as template parameter T.");
using WriteMaskedVector = typename traits_type::writemasked_vector_type;
public:
using VectorType = typename traits_type::intrinsic_type;
using vector_type = VectorType;
using mask_type = typename traits_type::mask_type;
using Mask = mask_type;
using MaskType = mask_type;
using MaskArg Vc_DEPRECATED_ALIAS("Use MaskArgument instead.") = typename Mask::AsArg;
using MaskArgument = typename Mask::AsArg;
using reference = Detail::ElementReference<Vector>;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
using EntryType = T;
using value_type = EntryType;
typedef EntryType VectorEntryType;
static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
static constexpr size_t MemoryAlignment = alignof(VectorType);
using IndexType = fixed_size_simd<int, Size>;
using index_type = IndexType;
typedef Vector<T, abi> AsArg;
typedef VectorType VectorTypeArg;
protected:
template <typename U> using V = Vector<U, abi>;
typedef AVX::VectorHelper<VectorType> HV;
typedef AVX::VectorHelper<T> HT;
template <typename V> static Vc_INTRINSIC VectorType _cast(V v)
{
return AVX::avx_cast<VectorType>(v);
}
typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
StorageType d;
using WidthT = Common::WidthT<VectorType>;
public:
Vc_INTRINSIC Vector() = default;
static constexpr std::size_t size() { return Size; }
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
{
return Vector(Vc::IndexesFromZero);
}
template <class G, int = 0,
class = typename std::enable_if<std::is_convertible<
decltype(std::declval<G>()(size_t())), value_type>::value>::type>
explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
{
}
static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {}
template <typename U>
Vc_INTRINSIC Vector(
V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: d(AVX::convert<U, T>(x.data()))
{
}
#if Vc_IS_VERSION_1
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") Vc_INTRINSIC explicit Vector(
V<U> x,
typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: d(Detail::zeroExtendIfNeeded(AVX::convert<U, T>(x.data())))
{
}
template <typename U,
typename = enable_if<Traits::is_simd_vector<U>::value &&
!std::is_same<Vector, Traits::decay<U>>::value>>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") Vc_INTRINSIC_L
explicit Vector(U &&x) Vc_INTRINSIC_R;
#endif
Vc_INTRINSIC explicit Vector(reference a) : Vector(static_cast<EntryType>(a)) {}
Vc_INTRINSIC Vector(EntryType a) : d(Detail::avx_broadcast(a)) {}
template <typename U>
Vc_INTRINSIC Vector(U a,
typename std::enable_if<std::is_same<U, int>::value &&
!std::is_same<U, EntryType>::value,
void *>::type = nullptr)
: Vector(static_cast<EntryType>(a))
{
}
explicit Vector(std::initializer_list<EntryType>)
{
static_assert(std::is_same<EntryType, void>::value,
"A SIMD vector object cannot be initialized from an initializer list "
"because the number of entries in the vector is target-dependent.");
}
explicit Vc_INTRINSIC Vector(const EntryType *mem)
{
load(mem);
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
{
load(mem, flags);
}
template <typename U, typename Flags = DefaultLoadTag,
typename = enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
{
load<U, Flags>(x, flags);
}
Vc_INTRINSIC void load(const EntryType *mem)
{
load(mem, DefaultLoadTag());
}
template <typename Flags>
Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
load(const EntryType *mem, Flags flags)
{
load<EntryType, Flags>(mem, flags);
}
private:
template <typename U, typename Flags>
struct load_concept : public std::enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
{};
public:
template <typename U, typename Flags = DefaultLoadTag>
Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
Vc_INTRINSIC void store(EntryType *mem) const
{
store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
{
store<EntryType, Flags>(mem, flags);
}
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
{
store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
{
store<EntryType, Flags>(mem, mask, flags);
}
Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setQnan(MaskArgument k) Vc_INTRINSIC_R;
#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif
private:
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
MaskArgument mask);
public:
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<MT, EntryType>::value, \
"The memory pointer needs to point to a type that can be converted to the " \
"EntryType of this SIMD vector type."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT, typename IT,
typename = enable_if<Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
private:
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes) const;
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
public:
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<EntryType, MT>::value, \
"The memory pointer needs to point to a type that the EntryType of this " \
"SIMD vector type can be converted to."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes));
}
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes), mask);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
{
scatter(args.address, args.indexes);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
{
scatter(args.address, args.indexes, mask);
}
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
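    // With AVX2 (and not MSVC) the overloads below implement gathers with the
    // hardware gather instructions via AVX::gather<Scale>. 8/16-bit source
    // elements are gathered as 32-bit loads and then sign- or zero-extended back;
    // the remaining type combinations go through the fixed_size_simd fallback.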
#if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
template <class U, class A, int Scale, int N = Vector<U, A>::size(),
class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<T, Vector<U, A>, Scale> &args)
{
d.v() = AVX::gather<sizeof(T) * Scale>(
args.address,
simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
.data());
}
template <class U, class A, int Scale, int N = Vector<U, A>::size(),
class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<T, Vector<U, A>, Scale> &args, MaskArgument k)
{
d.v() = AVX::gather<sizeof(T) * Scale>(
d.v(), k.data(), args.address,
simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
.data());
}
template <
class MT, class U, class A, int Scale,
class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
{
using AVX2::int_v;
const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
*this = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx0)),
int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx1)));
if (sizeof(MT) == 1) {
if (std::is_signed<MT>::value) {
using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
*this = (simd_cast<Signed>(*this) << 8) >> 8;
} else {
*this &= 0xff;
}
}
}
template <
class MT, class U, class A, int Scale,
class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A>, Scale> &args, MaskArgument k)
{
using AVX2::int_v;
const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
const auto k0 = simd_cast<AVX2::int_m, 0>(k).data();
const auto k1 = simd_cast<AVX2::int_m, 1>(k).data();
auto v = simd_cast<Vector>(
int_v(AVX::gather<sizeof(MT) * Scale>(
_mm256_setzero_si256(), k0, aliasing_cast<int>(args.address), idx0)),
int_v(AVX::gather<sizeof(MT) * Scale>(
_mm256_setzero_si256(), k1, aliasing_cast<int>(args.address), idx1)));
if (sizeof(MT) == 1) {
if (std::is_signed<MT>::value) {
using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
v = (simd_cast<Signed>(v) << 8) >> 8;
} else {
v &= 0xff;
}
}
assign(v, k);
}
template <class MT, class U, class A, int Scale>
Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
Traits::is_valid_vector_argument<MT>::value &&
!std::is_same<MT, T>::value &&
Vector<U, A>::size() >= size()),
void>
gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
{
*this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
}
template <class MT, class U, class A, int Scale>
Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
Traits::is_valid_vector_argument<MT>::value &&
!std::is_same<MT, T>::value &&
Vector<U, A>::size() >= size()),
void>
gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args,
MaskArgument k)
{
assign(simd_cast<Vector>(fixed_size_simd<MT, Size>(args, k)), k);
}
#endif
Vc_ALWAYS_INLINE Vector &operator++() { data() = Detail::add(data(), Detail::one(T()), T()); return *this; }
Vc_ALWAYS_INLINE Vector &operator--() { data() = Detail::sub(data(), Detail::one(T()), T()); return *this; }
Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = Detail::add(data(), Detail::one(T()), T()); return r; }
Vc_ALWAYS_INLINE Vector operator--(int) { const Vector r = *this; data() = Detail::sub(data(), Detail::one(T()), T()); return r; }
private:
friend reference;
Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
{
return o.d.m(i);
}
template <typename U>
Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
noexcept(std::declval<value_type &>() = v))
{
return o.d.set(i, v);
}
public:
Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
{
static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
return {*this, int(index)};
}
Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
{
return d.m(index);
}
Vc_INTRINSIC_L Vc_PURE_L Vector operator[](Permutation::ReversedTag) const Vc_INTRINSIC_R Vc_PURE_R;
Vc_INTRINSIC_L Vc_PURE_L Vector operator[](const IndexType &perm) const Vc_INTRINSIC_R Vc_PURE_R;
Vc_INTRINSIC Vc_PURE Mask operator!() const
{
return *this == Zero();
}
Vc_ALWAYS_INLINE Vector operator~() const
{
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
static_assert(std::is_integral<T>::value,
"bit-complement can only be used with Vectors of integral type");
#endif
return Detail::andnot_(data(), Detail::allone<VectorType>());
}
Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }
#define Vc_OP_VEC(op) \
Vc_INTRINSIC Vector &operator op##=(AsArg x); \
Vc_INTRINSIC Vc_PURE Vector operator op(AsArg x) const \
{ \
static_assert( \
std::is_integral<T>::value, \
"bitwise-operators can only be used with Vectors of integral type"); \
}
Vc_ALL_SHIFTS(Vc_OP_VEC);
#undef Vc_OP_VEC
Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R;
Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
isNegative() const
{
return Vc::isnegative(*this);
}
    Vc_ALWAYS_INLINE void assign(const Vector &v, const Mask &mask) {
data() = Detail::blend(data(), v.data(), mask.data());
}
template <typename V2>
Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
staticCast() const
{
return V2(*this);
}
template <typename V2>
Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2
reinterpretCast() const
{
return AVX::avx_cast<typename V2::VectorType>(data());
}
Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k)
{
return {*this, k};
}
Vc_ALWAYS_INLINE VectorType &data() { return d.v(); }
Vc_ALWAYS_INLINE const VectorType &data() const { return d.v(); }
template<int Index>
Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L std::pair<Vector, int> minIndex() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L std::pair<Vector, int> maxIndex() const Vc_INTRINSIC_R;
Vc_ALWAYS_INLINE EntryType min() const { return Detail::min(data(), T()); }
Vc_ALWAYS_INLINE EntryType max() const { return Detail::max(data(), T()); }
Vc_ALWAYS_INLINE EntryType product() const { return Detail::mul(data(), T()); }
Vc_ALWAYS_INLINE EntryType sum() const { return Detail::add(data(), T()); }
Vc_ALWAYS_INLINE_L Vector partialSum() const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType min(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType max(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType product(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType sum(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
template <typename F> void callWithValuesSorted(F &&f)
{
EntryType value = d.m(0);
f(value);
for (size_t i = 1; i < Size; ++i) {
if (d.m(i) != value) {
value = d.m(i);
f(value);
}
}
}
template <typename F> Vc_INTRINSIC void call(F &&f) const
{
Common::for_all_vector_entries<Size>([&](size_t i) { f(EntryType(d.m(i))); });
}
template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
{
for (size_t i : where(mask)) {
f(EntryType(d.m(i)));
}
}
template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
{
Vector r;
Common::for_all_vector_entries<Size>(
[&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); });
return r;
}
template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
{
Vector r(*this);
for (size_t i : where(mask)) {
r.d.set(i, f(EntryType(r.d.m(i))));
}
return r;
}
template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f(i)); });
}
Vc_INTRINSIC void fill(EntryType (&f)()) {
Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f()); });
}
template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;
Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
copySign(AsArg x) const
{
return Vc::copysign(*this, x);
}
Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
{
        return Vc::exponent(*this);
}
Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
};
#undef Vc_CURRENT_CLASS_NAME
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::MemoryAlignment;
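// conditional_assign implements the write-masked compound assignments used by
// Vc's where()/masked-assignment syntax; e.g. the PlusAssign instantiation is
// equivalent to lhs(mask) += rhs.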
#define Vc_CONDITIONAL_ASSIGN(name_,op_) \
template <Operator O, typename T, typename M, typename U> \
Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
AVX2::Vector<T> &lhs, M &&mask, U &&rhs) \
{ \
lhs(mask) op_ rhs; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN( Assign, =);
Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN
#define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
template <Operator O, typename T, typename M> \
Vc_INTRINSIC enable_if<O == Operator::name_, AVX2::Vector<T>> conditional_assign( \
AVX2::Vector<T> &lhs, M &&mask) \
{ \
return expr_; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN
}
#ifndef VC_AVX_LIMITS_H_
#define VC_AVX_LIMITS_H_
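// numeric_limits specializations for the integral AVX2 vector types. Only
// max(), min() and lowest() carry meaningful values here; the floating-point
// related members (epsilon, infinity, NaN, denorm_min) all return Zero().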
namespace std
{
#define Vc_NUM_LIM(T,_max,_min) \
template <> struct numeric_limits<Vc::AVX2::Vector<T>> : public numeric_limits<T> { \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> max() Vc_NOEXCEPT \
{ \
return _max; \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> min() Vc_NOEXCEPT \
{ \
return _min; \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> lowest() Vc_NOEXCEPT \
{ \
return min(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> epsilon() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> round_error() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> infinity() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> quiet_NaN() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> signaling_NaN() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> denorm_min() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
}
#ifdef Vc_IMPL_AVX2
Vc_NUM_LIM(unsigned short, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM( short, _mm256_srli_epi16(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi16());
Vc_NUM_LIM( unsigned int, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM( int, _mm256_srli_epi32(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi32());
#endif
#undef Vc_NUM_LIM
}
#endif
#ifndef VC_AVX_CONST_H_
#define VC_AVX_CONST_H_
#include <cstddef>
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template<typename T> struct IndexesFromZeroData;
template<> struct IndexesFromZeroData<int> {
static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast<const int *>(&_IndexesFromZero32[0]); }
};
template<> struct IndexesFromZeroData<unsigned int> {
static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; }
};
template<> struct IndexesFromZeroData<short> {
static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast<const short *>(&_IndexesFromZero16[0]); }
};
template<> struct IndexesFromZeroData<unsigned short> {
static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; }
};
template<> struct IndexesFromZeroData<signed char> {
static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast<const signed char *>(&_IndexesFromZero8[0]); }
};
template<> struct IndexesFromZeroData<char> {
static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast<const char *>(&_IndexesFromZero8[0]); }
};
template<> struct IndexesFromZeroData<unsigned char> {
static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; }
};
template<typename _T> struct Const
{
typedef Vector<_T> V;
typedef typename V::EntryType T;
typedef typename V::Mask M;
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return V(c_trig<T>::data[0]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return V(c_trig<T>::data[1]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return V(c_trig<T>::data[2]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return V(c_trig<T>::data[3]); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return V(c_trig<T>::data[4]); }
static Vc_ALWAYS_INLINE Vc_CONST V _16() { return V(c_trig<T>::data[5]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return V(c_trig<T>::data[(12 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return V(c_trig<T>::data[(17 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return V(c_trig<T>::data[22]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return V(c_trig<T>::data[23]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return V(c_trig<T>::data[24]); }
static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig<T>::data[8]); }
static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return V(c_trig<T>::data[9]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return V(c_trig<T>::data[10]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return V(c_trig<T>::data[11]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return V(c_trig<T>::data[(28 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig<T>::data[(33 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig<T>::data[(37 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig<T>::data[(43 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig<T>::data[25]); }
static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig<T>::data[26]); }
static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log<T>::d(1)).data()); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return V(c_log<T>::d(18)); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return V(c_log<T>::d(15)); }
static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return V(c_log<T>::d(2 + i)); }
static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return V(c_log<T>::d(8 + i)); }
static Vc_ALWAYS_INLINE Vc_CONST V min() { return V(c_log<T>::d(14)); }
static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return V(c_log<T>::d(17)); }
static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return V(c_log<T>::d(16)); }
static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return V(c_log<T>::d(13)); }
static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return V(c_log<T>::d(19)); }
static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return V(c_log<T>::d(20)); }
static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R;
static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R;
};
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask()
{
return _mm256_broadcast_ss(
reinterpret_cast<const float *>(&c_general::highMaskFloat));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask()
{
return _mm256_broadcast_sd(
reinterpret_cast<const double *>(&c_general::highMaskDouble));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask(int bits)
{
#ifdef Vc_IMPL_AVX2
#if defined Vc_ICC || defined Vc_MSVC
__m256i allone = _mm256_set1_epi64x(~0);
#else
auto allone = ~__m256i();
#endif
return _mm256_castsi256_ps(_mm256_slli_epi32(allone, bits));
#else
__m128 tmp = _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits));
return concat(tmp, tmp);
#endif
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask(int bits)
{
#ifdef Vc_IMPL_AVX2
#if defined Vc_ICC || defined Vc_MSVC
__m256i allone = _mm256_set1_epi64x(~0);
#else
auto allone = ~__m256i();
#endif
return _mm256_castsi256_pd(_mm256_slli_epi64(allone, bits));
#else
__m128d tmp = _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits));
return concat(tmp, tmp);
#endif
}
}
namespace AVX2
{
using AVX::IndexesFromZeroData;
using AVX::Const;
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); }
#ifdef Vc_IMPL_AVX2
Vc_INTRINSIC AVX2:: int_m operator==(AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: uint_m operator==(AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: int_m operator!=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: uint_m operator!=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: int_m operator>=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: uint_m operator>=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: int_m operator<=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: uint_m operator<=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: int_m operator> (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: uint_m operator> (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: int_m operator< (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: uint_m operator< (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }
#endif
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator^(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator&(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator|(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return or_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator+(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator-(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator*(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return mul(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator/(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return div(a.data(), b.data(), T());
}
Vc_INTRINSIC AVX2::Vector<ushort> operator/(AVX2::Vector<ushort> a,
AVX2::Vector<ushort> b)
{
using namespace AVX;
const __m256 lo = _mm256_div_ps(convert<ushort, float>(lo128(a.data())),
convert<ushort, float>(lo128(b.data())));
const __m256 hi = _mm256_div_ps(convert<ushort, float>(hi128(a.data())),
convert<ushort, float>(hi128(b.data())));
const float_v threshold = 32767.f;
using Detail::operator>;
const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty()))
? convert<float, ushort>(lo)
: convert<float, short>(lo);
const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty()))
? convert<float, ushort>(hi)
: convert<float, short>(hi);
return concat(loShort, hiShort);
}
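// AVX2 provides no integer division or modulo instruction, so operator% is
// derived from the vector division above via a % b == a - (a / b) * b.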
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, AVX2::Vector<T>> operator%(
AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return a - a / b * b;
}
}
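// generate() evaluates gen(i) once per lane, in lane order, and packs the
// results with the matching _mm256_setr_* intrinsic; e.g.
// AVX2::float_v::generate([](int i) { return float(i * i); }) yields {0, 1, 4, 9, ...}.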
template <> template <typename G> Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
#ifdef Vc_IMPL_AVX2
template <> template <typename G> Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
const auto tmp8 = gen(8);
const auto tmp9 = gen(9);
const auto tmp10 = gen(10);
const auto tmp11 = gen(11);
const auto tmp12 = gen(12);
const auto tmp13 = gen(13);
const auto tmp14 = gen(14);
const auto tmp15 = gen(15);
return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
template <> template <typename G> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
const auto tmp8 = gen(8);
const auto tmp9 = gen(9);
const auto tmp10 = gen(10);
const auto tmp11 = gen(11);
const auto tmp12 = gen(12);
const auto tmp13 = gen(13);
const auto tmp14 = gen(14);
const auto tmp15 = gen(15);
return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
#endif
template <typename T> Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(VectorSpecialInitializerZero) : d{} {}
template <> Vc_INTRINSIC Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {}
template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC Vector< int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {}
template <> Vc_INTRINSIC Vector< uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {}
template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {}
template <> Vc_INTRINSIC Vector<ushort, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {}
template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {}
template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {}
#endif
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx>::Vector(
VectorSpecialInitializerIndexesFromZero)
: Vector(AVX::IndexesFromZeroData<T>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<float, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
: Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
: Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Avx>::
#ifndef Vc_MSVC
template
#endif
load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Avx>::load(const SrcT *mem, Flags flags)
{
Common::handleLoadPrefetches(mem, flags);
d.v() = Detail::load<VectorType, DstT>(mem, flags);
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero()
{
data() = Detail::zero<VectorType>();
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero(const Mask &k)
{
data() = Detail::andnot_(k.data(), data());
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZeroInverted(const Mask &k)
{
data() = Detail::and_(k.data(), data());
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan()
{
data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan(MaskArgument k)
{
data() = _mm256_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan()
{
data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan(MaskArgument k)
{
data() = _mm256_or_ps(data(), k.dataF());
}
template <typename T>
template <typename U,
typename Flags,
typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Flags flags) const
{
Common::handleStorePrefetches(mem, flags);
HV::template store<Flags>(mem, data());
}
template <typename T>
template <typename U,
typename Flags,
typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Mask mask, Flags flags) const
{
Common::handleStorePrefetches(mem, flags);
HV::template store<Flags>(mem, data(), mask.data());
}
#ifdef Vc_IMPL_AVX2
template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(AsArg x)
{
static_assert(std::is_integral<T>::value,
"bitwise-operators can only be used with Vectors of integral type");
return *this = *this << x;
}
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(AsArg x)
{
static_assert(std::is_integral<T>::value,
"bitwise-operators can only be used with Vectors of integral type");
return *this = *this >> x;
}
#endif
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(int shift) {
d.v() = Detail::shiftRight(d.v(), shift, T());
return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator>>(int shift) const {
return Detail::shiftRight(d.v(), shift, T());
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(int shift) {
d.v() = Detail::shiftLeft(d.v(), shift, T());
return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator<<(int shift) const {
return Detail::shiftLeft(d.v(), shift, T());
}
Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x)
{
return AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data()))));
}
Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
{
return Mem::permute<X1, X1, X3, X3>(AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
}
#define Vc_GATHER_IMPL(V_) \
template <> \
template <class MT, class IT, int Scale> \
inline void AVX2::V_::gatherImplementation( \
const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(float_v)
{
d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
Vc_M(7));
}
#ifdef Vc_IMPL_AVX2
Vc_GATHER_IMPL(int_v)
{
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(uint_v)
{
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(short_v)
{
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
Vc_GATHER_IMPL(ushort_v)
{
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif
#undef Vc_M
#undef Vc_GATHER_IMPL
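// The masked gather and the scatters below dispatch to Common::executeGather /
// executeScatter with a strategy selected at compile time (SetIndexZero,
// BitScanLoop, PopcntSwitch or SimpleLoop) via the Vc_USE_* configuration macros.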
template <class T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
const auto *mem = args.address;
const auto indexes = Scale * args.indexes;
using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
Common::GatherScatterImplementation::PopcntSwitch
#else
Common::GatherScatterImplementation::SimpleLoop
#endif
> ;
Common::executeGather(Selector(), *this, mem, indexes, mask);
}
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes) const
{
Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
Common::GatherScatterImplementation::PopcntSwitch
#else
Common::GatherScatterImplementation::SimpleLoop
#endif
> ;
Common::executeScatter(Selector(), *this, mem, std::forward<IT>(indexes), mask);
}
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
return VectorType(-d.builtin());
}
#else
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
#endif
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::minIndex() const
{
AVX2::Vector<T> x = min();
return std::make_pair(x, (*this == x).firstOne());
}
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::maxIndex() const
{
AVX2::Vector<T> x = max();
return std::make_pair(x, (*this == x).firstOne());
}
template <> Vc_INTRINSIC std::pair<AVX2::float_v, int> AVX2::float_v::minIndex() const
{
__m256 x = d.v();
__m256 idx = Vector<float>::IndexesFromZero().data();
__m256 y = Mem::permute128<X1, X0>(x);
__m256 idy = Mem::permute128<X1, X0>(idx);
__m256 less = AVX::cmplt_ps(x, y);
x = _mm256_blendv_ps(y, x, less);
idx = _mm256_blendv_ps(idy, idx, less);
y = Reg::permute<X2, X3, X0, X1>(x);
idy = Reg::permute<X2, X3, X0, X1>(idx);
less = AVX::cmplt_ps(x, y);
x = _mm256_blendv_ps(y, x, less);
idx = _mm256_blendv_ps(idy, idx, less);
y = Reg::permute<X1, X0, X3, X2>(x);
idy = Reg::permute<X1, X0, X3, X2>(idx);
less = AVX::cmplt_ps(x, y);
idx = _mm256_blendv_ps(idy, idx, less);
const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx));
#ifdef Vc_GNU_ASM
__asm__ __volatile__("");
#endif
x = _mm256_blendv_ps(y, x, less);
return std::make_pair(x, index);
}
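// partialSum() computes an inclusive prefix sum in log2(Size) steps by adding
// copies of the vector shifted by 1, 2, 4, ... lanes; e.g. applied to
// double_v(1) the result is {1, 2, 3, 4}.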
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum() const
{
AVX2::Vector<T> tmp = *this;
if (Size > 1) tmp += tmp.shifted(-1);
if (Size > 2) tmp += tmp.shifted(-2);
if (Size > 4) tmp += tmp.shifted(-4);
if (Size > 8) tmp += tmp.shifted(-8);
if (Size > 16) tmp += tmp.shifted(-16);
return tmp;
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::min(MaskArgument m) const
{
AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::max();
tmp(m) = *this;
return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::max(MaskArgument m) const
{
AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::min();
tmp(m) = *this;
return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::product(MaskArgument m) const
{
AVX2::Vector<T> tmp(Vc::One);
tmp(m) = *this;
return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::sum(MaskArgument m) const
{
AVX2::Vector<T> tmp(Vc::Zero);
tmp(m) = *this;
return tmp.sum();
}
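// Detail::exponent extracts the unbiased binary exponent of each lane: the
// mantissa bits (23 for float, 52 for double) are shifted out, the bias
// (0x7f resp. 0x3ff) subtracted, and the result converted back to floating
// point; e.g. exponent(float_v(8.f)) == 3. The public wrappers assert x >= 0.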
namespace Detail
{
Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v)
{
using namespace AVX;
__m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23);
__m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23);
tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
}
Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v)
{
using namespace AVX;
__m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52);
__m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52);
tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
}
}
Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x)
{
using Detail::operator>=;
Vc_ASSERT((x >= x.Zero()).isFull());
return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x)
{
using Detail::operator>=;
Vc_ASSERT((x >= x.Zero()).isFull());
return Detail::exponent(x.data());
}
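// _doRandomStep advances the shared Common::RandomState with the linear
// congruential recurrence state = state * 0xdeece66d + 11 (per 32-bit word)
// and mixes two state vectors via xor/shift to produce the returned bits.
// float_v::Random() maps those bits into [0, 1) by or-ing them into the
// representation of 1.0f (giving a value in [1, 2)) and subtracting 1.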
static Vc_ALWAYS_INLINE __m256i _doRandomStep()
{
using Detail::operator*;
using Detail::operator+;
#ifdef Vc_IMPL_AVX2
using AVX2::uint_v;
uint_v state0(&Common::RandomState[0]);
uint_v state1(&Common::RandomState[uint_v::Size]);
(state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm256_srli_epi32(state1.data(), 16)))
.store(&Common::RandomState[0]);
return state0.data();
#else
using SSE::uint_v;
uint_v state0(&Common::RandomState[0]);
uint_v state1(&Common::RandomState[uint_v::Size]);
uint_v state2(&Common::RandomState[2 * uint_v::Size]);
uint_v state3(&Common::RandomState[3 * uint_v::Size]);
(state2 * uint_v(0xdeece66du) + uint_v(11))
.store(&Common::RandomState[2 * uint_v::Size]);
(state3 * uint_v(0xdeece66du) + uint_v(11))
.store(&Common::RandomState[3 * uint_v::Size]);
uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm_srli_epi32(state2.data(), 16)))
.store(&Common::RandomState[0]);
uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm_srli_epi32(state3.data(), 16)))
.store(&Common::RandomState[uint_v::Size]);
return AVX::concat(state0.data(), state1.data());
#endif
}
#ifdef Vc_IMPL_AVX2
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::Random()
{
return {_doRandomStep()};
}
#endif
template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random()
{
return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()),
HT::one());
}
template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random()
{
const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned,
Detail::LoadTag<__m256i, int>());
for (size_t k = 0; k < 8; k += 2) {
typedef unsigned long long uint64 Vc_MAY_ALIAS;
const uint64 stateX = *aliasing_cast<uint64>(&Common::RandomState[k]);
*aliasing_cast<uint64>(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11);
}
return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one());
}
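// shifted(amount) moves entry i+amount into lane i (so positive amounts shift
// towards lane 0), filling vacated lanes with zero; shifted(amount, shiftIn)
// fills them from shiftIn instead, and rotated(amount) wraps the shifted-out
// entries around.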
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount) const
{
return Detail::shifted<EntryType>(d.v(), amount);
}
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>)
{
return Mem::shuffle<X2, X3, Y0, Y1>(left, right);
}
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>)
{
return Mem::shuffle128<X1, Y0>(left, right);
}
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount, Vector shiftIn) const
{
#ifdef __GNUC__
if (__builtin_constant_p(amount)) {
const __m256i a = AVX::avx_cast<__m256i>(d.v());
const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v());
if (amount * 2 == int(Size)) {
return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT());
}
if (amount * 2 == -int(Size)) {
return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT());
}
switch (amount) {
case 1:
return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
_mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
sizeof(EntryType))
#else
AVX::concat(
_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)),
_mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType)))
#endif
);
case 2:
return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
_mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
2 * sizeof(EntryType))
#else
AVX::concat(
_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)),
_mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType)))
#endif
);
case 3:
if (6u < Size) {
return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
_mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
3 * sizeof(EntryType))
#else
AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a),
3 * sizeof(EntryType)),
_mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a),
3 * sizeof(EntryType)))
#endif
);
}
}
}
#endif
using Detail::operator|;
return shifted(amount) | (amount > 0 ?
shiftIn.shifted(amount - Size) :
shiftIn.shifted(Size + amount));
}
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::rotated(int amount) const
{
return Detail::rotated<EntryType, size()>(d.v(), amount);
}
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::sorted()
const
{
return Detail::sorted(*this);
}
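// interleaveLow(x) interleaves the lower halves of *this and x
// ({a0, x0, a1, x1, ...}); interleaveHigh(x) does the same for the upper
// halves. The shuffle128 corrects for the 128-bit lane split of the AVX
// unpack instructions.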
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const
{
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_pd(data(), x.data()),
_mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const
{
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_pd(data(), x.data()),
_mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const
{
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_ps(data(), x.data()),
_mm256_unpackhi_ps(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const
{
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_ps(data(), x.data()),
_mm256_unpackhi_ps(data(), x.data()));
}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
#endif
template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X1, X0, X3, X2>(d.v()));
}
template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[](
Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[](
Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
#endif
template <> Vc_INTRINSIC AVX2::float_v Vector<float, VectorAbi::Avx>::operator[](const IndexType & ) const
{
return *this;
#ifdef Vc_IMPL_AVX2
#else
#endif
}
template <typename T>
Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::reversed() const
{
return (*this)[Permutation::Reversed];
}
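// broadcast<Index>() replicates lane Index into every lane: permute128 first
// selects the 128-bit half that contains the lane, then the in-lane permute
// duplicates it.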
template <> template <int Index> Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const
{
constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
constexpr VecPos Outer = static_cast<VecPos>((Index & 0x4) / 4);
return Mem::permute<Inner, Inner, Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
template <> template <int Index> Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const
{
constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
constexpr VecPos Outer = static_cast<VecPos>((Index & 0x2) / 2);
return Mem::permute<Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
}
#ifndef VC_AVX_SIMD_CAST_H_
#define VC_AVX_SIMD_CAST_H_
#ifndef VC_AVX_VECTOR_H_
#error "Vc/avx/vector.h needs to be included before Vc/avx/simd_cast.h"
#endif
namespace Vc_VERSIONED_NAMESPACE
{
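// The Vc_SIMD_CAST_AVX_N / Vc_SIMD_CAST_N macros declare simd_cast overloads
// that convert N source vectors into one destination vector;
// Vc_SIMD_CAST_OFFSET adds the offset-selecting casts used for wide-to-narrow
// conversions.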
#define Vc_SIMD_CAST_AVX_1(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
AVX2::from_ x, enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_AVX_2(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
AVX2::from_ x0, AVX2::from_ x1, \
enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_AVX_3(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \
enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_AVX_4(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, AVX2::from_ x3, \
enable_if<std::is_same<To, AVX2::to_>::value> = nullarg)
#define Vc_SIMD_CAST_1(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_2(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_3(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, from_ x2, enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_4(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, from_ x2, from_ x3, \
enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_5(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_6(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, \
enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_7(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, \
enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_8(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, from_ x5, from_ x6, from_ x7, \
enable_if<std::is_same<To, to_>::value> = nullarg)
#define Vc_SIMD_CAST_OFFSET(from_,to_,offset_) \
static_assert(from_::size() >= to_::size() * (offset_ + 1), \
"this offset cannot exist for this type combination"); \
template <typename To, int offset> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x, \
enable_if<(offset == offset_ && std::is_same<To, to_>::value)> = nullarg)
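// Generic declarations: casts from one or more SSE vectors to an AVX-namespace vector
// whose element count matches the corresponding SSE vector; the conversion is carried
// out by the SSE simd_cast and its raw data reused (see the definitions further down).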
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x, enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)> =
nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
From x0, From x1,
enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
From x0, From x1, From x2,
enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
From x0, From x1, From x2, From x3,
enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To simd_cast(
From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7,
enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)> = nullarg);
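// AVX -> AVX vector casts (declarations; the definitions follow once the macros are
// redefined below).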
Vc_SIMD_CAST_AVX_1( float_v, double_v);
Vc_SIMD_CAST_AVX_1(double_v, float_v);
Vc_SIMD_CAST_AVX_2(double_v, float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1( int_v, double_v);
Vc_SIMD_CAST_AVX_1( uint_v, double_v);
Vc_SIMD_CAST_AVX_1( short_v, double_v);
Vc_SIMD_CAST_AVX_1(ushort_v, double_v);
Vc_SIMD_CAST_AVX_1( int_v, float_v);
Vc_SIMD_CAST_AVX_1( uint_v, float_v);
Vc_SIMD_CAST_AVX_1( short_v, float_v);
Vc_SIMD_CAST_AVX_1(ushort_v, float_v);
Vc_SIMD_CAST_AVX_1(double_v, int_v);
Vc_SIMD_CAST_AVX_1( float_v, int_v);
Vc_SIMD_CAST_AVX_1( uint_v, int_v);
Vc_SIMD_CAST_AVX_1( short_v, int_v);
Vc_SIMD_CAST_AVX_1(ushort_v, int_v);
Vc_SIMD_CAST_AVX_2(double_v, int_v);
Vc_SIMD_CAST_AVX_1(double_v, uint_v);
Vc_SIMD_CAST_AVX_1( float_v, uint_v);
Vc_SIMD_CAST_AVX_1( int_v, uint_v);
Vc_SIMD_CAST_AVX_1( short_v, uint_v);
Vc_SIMD_CAST_AVX_1(ushort_v, uint_v);
Vc_SIMD_CAST_AVX_2(double_v, uint_v);
Vc_SIMD_CAST_AVX_1(double_v, short_v);
Vc_SIMD_CAST_AVX_1( float_v, short_v);
Vc_SIMD_CAST_AVX_1( int_v, short_v);
Vc_SIMD_CAST_AVX_1( uint_v, short_v);
Vc_SIMD_CAST_AVX_1(ushort_v, short_v);
Vc_SIMD_CAST_AVX_2(double_v, short_v);
Vc_SIMD_CAST_AVX_2( float_v, short_v);
Vc_SIMD_CAST_AVX_2( int_v, short_v);
Vc_SIMD_CAST_AVX_2( uint_v, short_v);
Vc_SIMD_CAST_AVX_3(double_v, short_v);
Vc_SIMD_CAST_AVX_4(double_v, short_v);
Vc_SIMD_CAST_AVX_1(double_v, ushort_v);
Vc_SIMD_CAST_AVX_1( float_v, ushort_v);
Vc_SIMD_CAST_AVX_1( int_v, ushort_v);
Vc_SIMD_CAST_AVX_1( uint_v, ushort_v);
Vc_SIMD_CAST_AVX_1( short_v, ushort_v);
Vc_SIMD_CAST_AVX_2(double_v, ushort_v);
Vc_SIMD_CAST_AVX_2( float_v, ushort_v);
Vc_SIMD_CAST_AVX_2( int_v, ushort_v);
Vc_SIMD_CAST_AVX_2( uint_v, ushort_v);
Vc_SIMD_CAST_AVX_3(double_v, ushort_v);
Vc_SIMD_CAST_AVX_4(double_v, ushort_v);
#endif
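// SSE -> AVX vector casts.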
Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v);
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v);
Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v);
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v);
Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v);
Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v);
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v);
#endif
Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v);
Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v);
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v);
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v);
Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v);
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v);
Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v);
Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v);
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v);
#endif
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v);
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v);
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v);
Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v);
Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v);
Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v);
Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v);
#endif
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v);
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v);
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v);
Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v);
Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v);
Vc_SIMD_CAST_4(SSE:: int_v, AVX2::ushort_v);
Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v);
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v);
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v);
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v);
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v);
Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v);
#endif
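// AVX -> SSE vector casts.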
Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v);
Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v);
Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v);
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v);
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v);
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v);
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v);
#endif
Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v);
Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v);
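// Scalar -> AVX vector casts, taking between 1 and 16 scalar arguments depending on the
// width of the target vector.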
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::double_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::float_v>::value> = nullarg);
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::int_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::uint_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
Scalar::Vector<T> x15,
enable_if<std::is_same<Return, AVX2::short_v>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
Scalar::Vector<T> x15,
enable_if<std::is_same<Return, AVX2::ushort_v>::value> = nullarg);
#endif
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Vector<FromT> x,
enable_if<Scalar::is_vector<To>::value> = nullarg);
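// Mask casts follow the same layout as the vector casts above: AVX -> AVX masks first,
// then SSE -> AVX, Scalar -> AVX, AVX -> SSE, and finally AVX -> Scalar.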
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(const AVX2::Mask<T> &k, enable_if<AVX2::is_mask<Return>::value> = nullarg);
Vc_SIMD_CAST_AVX_2(double_m, float_m);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_2(double_m, int_m);
Vc_SIMD_CAST_AVX_2(double_m, uint_m);
Vc_SIMD_CAST_AVX_2(double_m, short_m);
Vc_SIMD_CAST_AVX_2(double_m, ushort_m);
Vc_SIMD_CAST_AVX_2( float_m, short_m);
Vc_SIMD_CAST_AVX_2( float_m, ushort_m);
Vc_SIMD_CAST_AVX_2( int_m, short_m);
Vc_SIMD_CAST_AVX_2( int_m, ushort_m);
Vc_SIMD_CAST_AVX_2( uint_m, short_m);
Vc_SIMD_CAST_AVX_2( uint_m, ushort_m);
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_4(double_m, short_m);
Vc_SIMD_CAST_AVX_4(double_m, ushort_m);
#endif
Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m);
Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m);
Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m);
Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m);
Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m);
#endif
Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m);
Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m);
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m);
Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m);
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m);
Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m);
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m);
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m);
Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m);
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m);
Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m);
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m);
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m);
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m);
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m);
Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m);
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m);
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m);
Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m);
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m);
Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m);
Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m);
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m);
Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m);
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m);
Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m);
Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m);
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m);
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m);
#endif
Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m);
Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m);
Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m);
Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m);
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m);
Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m);
Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m);
Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m);
Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m);
Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m);
Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m);
Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m);
Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m);
Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m);
Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m);
Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m);
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m);
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m);
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m);
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m);
Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m);
Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m);
Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m);
Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m);
#endif
Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m);
Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m);
Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m);
Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m);
Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m);
Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m);
Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m);
Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m);
Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m);
Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m);
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> k,
enable_if<AVX2::is_mask<Return>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1,
enable_if<AVX2::is_mask<Return>::value> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 4)> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast(
Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 8)> = nullarg);
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
Scalar::Mask<T> k8, Scalar::Mask<T> k9, Scalar::Mask<T> k10,
Scalar::Mask<T> k11, Scalar::Mask<T> k12, Scalar::Mask<T> k13,
Scalar::Mask<T> k14, Scalar::Mask<T> k15,
enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 16)> = nullarg);
Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m);
Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m);
Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m);
Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m);
Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m);
Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m);
Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m);
Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m);
Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m);
Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m);
Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m);
Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m);
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m);
Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m);
Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m);
Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m);
Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m);
Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m);
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m);
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m);
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m);
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m);
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m);
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m);
Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m);
Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m);
Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m);
Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m);
Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m);
Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m);
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m);
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m);
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m);
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m);
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m);
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m);
#endif
Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m);
Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m);
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::Mask<FromT> x,
enable_if<Scalar::is_mask<To>::value> = nullarg);
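// simd_cast overloads with an explicit offset template parameter. offset == 0 is the
// plain cast; a non-zero offset converts the offset-th Return-sized chunk of the source
// (for example the high 128-bit half of an AVX vector or mask).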
template <typename Return, int offset, typename From>
Vc_INTRINSIC Vc_CONST enable_if<
(offset == 0 &&
((AVX2::is_vector<From>::value && !Scalar::is_vector<Return>::value &&
Traits::is_simd_vector<Return>::value && !Traits::isSimdArray<Return>::value) ||
(AVX2::is_mask<From>::value && !Scalar::is_mask<Return>::value &&
Traits::is_simd_mask<Return>::value &&
!Traits::isSimdMaskArray<Return>::value))),
Return>
simd_cast(const From &x);
template <typename Return, int offset, typename From>
Vc_INTRINSIC Vc_CONST Return simd_cast(
const From &x,
enable_if<offset == 0 && ((SSE::is_vector<From>::value &&
AVX2::is_vector<Return>::value) ||
(SSE::is_mask<From>::value &&
AVX2::is_mask<Return>::value))> = nullarg);
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector<Return>::value && offset != 0),
Return>
simd_cast(AVX2::Vector<T> x);
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
sizeof(AVX2::Vector<T>) == 32),
Return>
simd_cast(AVX2::Vector<T> x);
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
sizeof(AVX2::Vector<T>) == 16),
Return>
simd_cast(AVX2::Vector<T> x);
Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1);
Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1);
Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1);
Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1);
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
sizeof(AVX2::Mask<T>) == 32),
Return>
simd_cast(AVX2::Mask<T> x);
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
sizeof(AVX2::Mask<T>) == 16),
Return>
simd_cast(AVX2::Mask<T> x);
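// Redefine the helper macros without the default argument so that repeating the same
// Vc_SIMD_CAST_* lines below produces the function definitions instead of redeclaring
// them (a default argument may only be specified once).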
#undef Vc_SIMD_CAST_AVX_1
#define Vc_SIMD_CAST_AVX_1(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x, \
enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_AVX_2
#define Vc_SIMD_CAST_AVX_2(from_,to_) \
static_assert(AVX2::from_::size() * 2 <= AVX2::to_::size(), \
"this type combination is wrong"); \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, \
enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_AVX_3
#define Vc_SIMD_CAST_AVX_3(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \
enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_AVX_4
#define Vc_SIMD_CAST_AVX_4(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(AVX2::from_ x0, AVX2::from_ x1, AVX2::from_ x2, \
AVX2::from_ x3, \
enable_if<std::is_same<To, AVX2::to_>::value>)
#undef Vc_SIMD_CAST_1
#define Vc_SIMD_CAST_1(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x, enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_2
#define Vc_SIMD_CAST_2(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, \
enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_3
#define Vc_SIMD_CAST_3(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, \
enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_4
#define Vc_SIMD_CAST_4(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, \
enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_5
#define Vc_SIMD_CAST_5(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_6
#define Vc_SIMD_CAST_6(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
from_ x5, \
enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_7
#define Vc_SIMD_CAST_7(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
from_ x5, from_ x6, \
enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_8
#define Vc_SIMD_CAST_8(from_,to_) \
template <typename To> \
Vc_INTRINSIC Vc_CONST To simd_cast(from_ x0, from_ x1, from_ x2, from_ x3, from_ x4, \
from_ x5, from_ x6, from_ x7, \
enable_if<std::is_same<To, to_>::value>)
#undef Vc_SIMD_CAST_OFFSET
#define Vc_SIMD_CAST_OFFSET(from_,to_,offset_) \
static_assert(from_::size() >= to_::size() * (offset_ + 1), \
"this offset cannot exist for this type combination"); \
template <typename To, int offset> \
Vc_INTRINSIC Vc_CONST To simd_cast( \
from_ x, enable_if<(offset == offset_ && std::is_same<To, to_>::value)>)
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x, enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
return simd_cast<SSE::Vector<typename To::EntryType>>(x).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1,
enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1, From x2,
enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1, x2).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1, From x2, From x3,
enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1, x2, x3).data();
}
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From x0, From x1, From x2, From x3, From x4, From x5, From x6, From x7,
enable_if<(AVX2::is_vector<To>::value && SSE::is_vector<From>::value &&
SSE::Vector<typename To::EntryType>::Size == To::Size)>)
{
return simd_cast<SSE::Vector<typename To::EntryType>>(x0, x1, x2, x3, x4, x5, x6, x7)
.data();
}
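// Definitions of the AVX -> AVX vector casts declared above.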
Vc_SIMD_CAST_AVX_1( float_v, double_v) { return _mm256_cvtps_pd(AVX::lo128(x.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1( int_v, double_v) { return AVX::convert< int, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1( uint_v, double_v) { return AVX::convert< uint, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1( short_v, double_v) { return AVX::convert< short, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v, double_v) { return AVX::convert<ushort, double>(AVX::lo128(x.data())); }
#endif
Vc_SIMD_CAST_AVX_1(double_v, float_v) { return AVX::zeroExtend(_mm256_cvtpd_ps(x.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1( int_v, float_v) { return AVX::convert< int, float>(x.data()); }
Vc_SIMD_CAST_AVX_1( uint_v, float_v) { return AVX::convert< uint, float>(x.data()); }
Vc_SIMD_CAST_AVX_1( short_v, float_v) { return AVX::convert< short, float>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v, float_v) { return AVX::convert<ushort, float>(AVX::lo128(x.data())); }
#endif
Vc_SIMD_CAST_AVX_2(double_v, float_v) { return AVX::concat(_mm256_cvtpd_ps(x0.data()), _mm256_cvtpd_ps(x1.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(double_v, int_v) { return AVX::zeroExtend(_mm256_cvttpd_epi32(x.data())); }
Vc_SIMD_CAST_AVX_1( float_v, int_v) { return _mm256_cvttps_epi32(x.data()); }
Vc_SIMD_CAST_AVX_1( uint_v, int_v) { return x.data(); }
Vc_SIMD_CAST_AVX_1( short_v, int_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v, int_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_2(double_v, int_v) { return AVX::concat(_mm256_cvttpd_epi32(x0.data()), _mm256_cvttpd_epi32(x1.data())); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(double_v, uint_v) { return AVX::zeroExtend(AVX::convert<double, uint>(x.data())); }
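// float -> uint: cvttps only covers the signed range, so inputs >= 2^31 are shifted down
// by 2^31 before the conversion and the offset is added back afterwards; the blend picks
// the corrected result for exactly those lanes.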
Vc_SIMD_CAST_AVX_1( float_v, uint_v) {
return _mm256_blendv_epi8(
_mm256_cvttps_epi32(x.data()),
_mm256_add_epi32(
_mm256_cvttps_epi32(_mm256_sub_ps(x.data(), AVX::set2power31_ps())),
AVX::set2power31_epu32()),
_mm256_castps_si256(AVX::cmpge_ps(x.data(), AVX::set2power31_ps())));
}
Vc_SIMD_CAST_AVX_1( int_v, uint_v) { return x.data(); }
Vc_SIMD_CAST_AVX_1( short_v, uint_v) { return _mm256_cvtepi16_epi32(AVX::lo128(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v, uint_v) { return _mm256_cvtepu16_epi32(AVX::lo128(x.data())); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_2(double_v, uint_v) { return AVX::concat(AVX::convert<double, uint>(x0.data()), AVX::convert<double, uint>(x1.data())); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(double_v, short_v) { return AVX::zeroExtend(_mm_packs_epi32(_mm256_cvttpd_epi32(x.data()), _mm_setzero_si128())); }
Vc_SIMD_CAST_AVX_1( float_v, short_v) {
const auto tmp = _mm256_cvttps_epi32(x.data());
return AVX::zeroExtend(_mm_packs_epi32(AVX::lo128(tmp), AVX::hi128(tmp)));
}
Vc_SIMD_CAST_AVX_1( int_v, short_v) { return AVX::zeroExtend(AVX::convert< int, short>(x.data())); }
Vc_SIMD_CAST_AVX_1( uint_v, short_v) { return AVX::zeroExtend(AVX::convert<uint, short>(x.data())); }
Vc_SIMD_CAST_AVX_1(ushort_v, short_v) { return x.data(); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_2(double_v, short_v) {
const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
return AVX::zeroExtend(_mm_packs_epi32(tmp0, tmp1));
}
Vc_SIMD_CAST_AVX_2( float_v, short_v) {
using AVX2::short_v;
using AVX2::int_v;
return simd_cast<short_v>(simd_cast<int_v>(x0), simd_cast<int_v>(x1));
}
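// int/uint -> short: a byte shuffle keeps the low 16 bits of every 32-bit element (the
// -0x80 control bytes zero the rest), the two half-results are merged with a 64-bit
// unpack, and permute4x64 restores the correct lane order.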
Vc_SIMD_CAST_AVX_2( int_v, short_v) {
const auto shuf = _mm256_setr_epi8(
0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
auto a = _mm256_shuffle_epi8(x0.data(), shuf);
auto b = _mm256_shuffle_epi8(x1.data(), shuf);
return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi64(a, b));
}
Vc_SIMD_CAST_AVX_2( uint_v, short_v) {
const auto shuf = _mm256_setr_epi8(
0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
auto a = _mm256_shuffle_epi8(x0.data(), shuf);
auto b = _mm256_shuffle_epi8(x1.data(), shuf);
return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi64(a, b));
}
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_3(double_v, short_v) {
const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, _mm_setzero_si128()));
}
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_4(double_v, short_v) {
const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
const auto tmp3 = _mm256_cvttpd_epi32(x3.data());
return AVX::concat(_mm_packs_epi32(tmp0, tmp1), _mm_packs_epi32(tmp2, tmp3));
}
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_1(double_v, ushort_v) {
const auto tmp = _mm256_cvttpd_epi32(x.data());
return AVX::zeroExtend(_mm_packus_epi32(tmp, _mm_setzero_si128()));
}
Vc_SIMD_CAST_AVX_1( float_v, ushort_v) {
const auto tmp = _mm256_cvttps_epi32(x.data());
return AVX::zeroExtend(_mm_packus_epi32(AVX::lo128(tmp), AVX::hi128(tmp)));
}
Vc_SIMD_CAST_AVX_1( int_v, ushort_v) { return AVX::zeroExtend(AVX::convert< int, ushort>(x.data())); }
Vc_SIMD_CAST_AVX_1( uint_v, ushort_v) { return AVX::zeroExtend(AVX::convert<uint, ushort>(x.data())); }
Vc_SIMD_CAST_AVX_1( short_v, ushort_v) { return x.data(); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_2(double_v, ushort_v) {
const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
return AVX::zeroExtend(_mm_packus_epi32(tmp0, tmp1));
}
Vc_SIMD_CAST_AVX_2( float_v, ushort_v) {
using AVX2::ushort_v;
using AVX2::int_v;
return simd_cast<ushort_v>(simd_cast<int_v>(x0), simd_cast<int_v>(x1));
}
Vc_SIMD_CAST_AVX_2( int_v, ushort_v) {
auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data());
auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data());
auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1);
return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi16(tmp2, tmp3));
}
Vc_SIMD_CAST_AVX_2( uint_v, ushort_v) {
auto tmp0 = _mm256_unpacklo_epi16(x0.data(), x1.data());
auto tmp1 = _mm256_unpackhi_epi16(x0.data(), x1.data());
auto tmp2 = _mm256_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm256_unpackhi_epi16(tmp0, tmp1);
return Mem::permute4x64<X0, X2, X1, X3>(_mm256_unpacklo_epi16(tmp2, tmp3));
}
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_3(double_v, ushort_v) {
const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
return AVX::concat(_mm_packus_epi32(tmp0, tmp1),
_mm_packus_epi32(tmp2, _mm_setzero_si128()));
}
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_4(double_v, ushort_v) {
const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
const auto tmp2 = _mm256_cvttpd_epi32(x2.data());
const auto tmp3 = _mm256_cvttpd_epi32(x3.data());
return AVX::concat(_mm_packus_epi32(tmp0, tmp1), _mm_packus_epi32(tmp2, tmp3));
}
#endif
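// Definitions of the SSE -> AVX vector casts.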
Vc_SIMD_CAST_1(SSE::double_v, AVX2::double_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::double_v) { return _mm256_cvtps_pd(x.data()); }
Vc_SIMD_CAST_1(SSE:: int_v, AVX2::double_v) { return _mm256_cvtepi32_pd(x.data()); }
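// uint -> double: shift the input into signed range (subtract 0x80000000), convert, and
// add 2^31 back as a double.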
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::double_v) { using namespace AvxIntrinsics; return _mm256_add_pd(_mm256_cvtepi32_pd(_mm_sub_epi32(x.data(), _mm_setmin_epi32())), set1_pd(1u << 31)); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::double_v) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v>(x)); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::double_v) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v>(x)); }
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast<SSE:: float_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: float_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: float_v) { return AVX::zeroExtend(_mm_cvtepi32_ps(x.data())); }
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast<SSE::float_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: float_v) { return AVX::convert< short, float>(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: float_v) { return AVX::convert<ushort, float>(x.data()); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast<SSE:: int_v>(x).data()); }
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast<SSE:: uint_v>(x).data()); }
Vc_SIMD_CAST_1(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x).data()); }
Vc_SIMD_CAST_1(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast<SSE::int_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast<SSE::uint_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE::short_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: int_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: int_v) { return AVX::convert< short, int>(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: int_v) { return AVX::convert<ushort, int>(x.data()); }
Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: uint_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: uint_v) { return AVX::convert< short, uint>(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: uint_v) { return AVX::convert<ushort, uint>(x.data()); }
Vc_SIMD_CAST_1(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE::short_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE::short_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2:: short_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x).data()); }
Vc_SIMD_CAST_1(SSE:: short_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE::ushort_v, AVX2::ushort_v) { return AVX::zeroExtend(x.data()); }
#endif
Vc_SIMD_CAST_2(SSE::double_v, AVX2::double_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: float_v) { return AVX::zeroExtend(simd_cast<SSE:: float_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: float_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: float_v) { return AVX::convert< int, float>(AVX::concat(x0.data(), x1.data())); }
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: float_v) { return AVX::convert<uint, float>(AVX::concat(x0.data(), x1.data())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: int_v) { return AVX::zeroExtend(simd_cast<SSE:: int_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: uint_v) { return AVX::zeroExtend(simd_cast<SSE:: uint_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: int_v) { return simd_cast<AVX2:: int_v>(simd_cast<AVX2::float_v>(x0, x1)); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: uint_v) { return simd_cast<AVX2::uint_v>(simd_cast<AVX2::float_v>(x0, x1)); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: float_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: int_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: uint_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: int_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: short_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2:: short_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: int_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: uint_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1).data()); }
Vc_SIMD_CAST_2(SSE:: short_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::ushort_v, AVX2::ushort_v) { return AVX::concat(x0.data(), x1.data()); }
#endif
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: float_v) { return simd_cast<AVX2:: float_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2)); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: int_v) { return simd_cast<AVX2:: int_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2)); }
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: uint_v) { return simd_cast<AVX2::uint_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2)); }
Vc_SIMD_CAST_3(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1, x2).data()); }
Vc_SIMD_CAST_3(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1, x2).data()); }
Vc_SIMD_CAST_3(SSE:: float_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2)); }
Vc_SIMD_CAST_3(SSE:: float_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2)); }
Vc_SIMD_CAST_3(SSE:: int_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2)); }
Vc_SIMD_CAST_3(SSE:: uint_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2)); }
Vc_SIMD_CAST_3(SSE:: int_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2)); }
Vc_SIMD_CAST_3(SSE:: uint_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2)); }
#endif
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: float_v) { return simd_cast<AVX2:: float_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3)); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: int_v) { return simd_cast<AVX2:: int_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: uint_v) { return simd_cast<AVX2::uint_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE::double_v, AVX2:: short_v) { return AVX::zeroExtend(simd_cast<SSE:: short_v>(x0, x1, x2, x3).data()); }
Vc_SIMD_CAST_4(SSE::double_v, AVX2::ushort_v) { return AVX::zeroExtend(simd_cast<SSE::ushort_v>(x0, x1, x2, x3).data()); }
Vc_SIMD_CAST_4(SSE:: float_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE:: float_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::float_v>(x0, x1), simd_cast<AVX2::float_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE:: int_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE:: uint_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE:: int_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2:: int_v>(x0, x1), simd_cast<AVX2:: int_v>(x2, x3)); }
Vc_SIMD_CAST_4(SSE:: uint_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::uint_v>(x0, x1), simd_cast<AVX2::uint_v>(x2, x3)); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_5(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4)); }
Vc_SIMD_CAST_5(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4)); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_6(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5)); }
Vc_SIMD_CAST_6(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5)); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_7(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6)); }
Vc_SIMD_CAST_7(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6)); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_8(SSE::double_v, AVX2:: short_v) { return simd_cast<AVX2:: short_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6, x7)); }
Vc_SIMD_CAST_8(SSE::double_v, AVX2::ushort_v) { return simd_cast<AVX2::ushort_v>(simd_cast<AVX2::double_v>(x0, x1), simd_cast<AVX2::double_v>(x2, x3), simd_cast<AVX2::double_v>(x4, x5), simd_cast<AVX2::double_v>(x6, x7)); }
#endif
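// One-operand AVX2 -> SSE value casts: for matching element types the low 128 bits
// are returned (AVX::lo128); element conversions use AVX::convert on the full
// register or SSE::convert on the low half, so lanes that do not fit the 128-bit
// result are dropped.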
Vc_SIMD_CAST_1(AVX2::double_v, SSE::double_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: float_v) { return AVX::lo128(x.data()); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: int_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: uint_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: short_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::ushort_v) { return AVX::lo128(x.data()); }
#endif
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: float_v) { return simd_cast<SSE:: float_v>(simd_cast<AVX2:: float_v>(x)); }
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: int_v) { return AVX::convert<double, int>(x.data()); }
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: uint_v) { return AVX::convert<double, unsigned int>(x.data()); }
Vc_SIMD_CAST_1(AVX2::double_v, SSE:: short_v) { return AVX::convert<double, short>(x.data()); }
Vc_SIMD_CAST_1(AVX2::double_v, SSE::ushort_v) { return AVX::convert<double, unsigned short>(x.data()); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::double_v) { return simd_cast<SSE::double_v>(simd_cast<SSE:: float_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: int_v) { return simd_cast<SSE:: int_v>(simd_cast<SSE:: float_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: uint_v) { return simd_cast<SSE:: uint_v>(simd_cast<SSE:: float_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE:: short_v) { return AVX::convert<float, short>(x.data()); }
Vc_SIMD_CAST_1(AVX2:: float_v, SSE::ushort_v) { return AVX::convert<float, unsigned short>(x.data()); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(AVX2:: int_v, SSE::double_v) { return SSE::convert<int, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: float_v) { return SSE::convert<int, float>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: uint_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: int_v, SSE:: short_v) { return AVX::convert<int, short>(x.data()); }
Vc_SIMD_CAST_1(AVX2:: int_v, SSE::ushort_v) { return AVX::convert<int, ushort>(x.data()); }
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::double_v) { return SSE::convert<uint, double>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: float_v) { return SSE::convert<uint, float>(AVX::lo128(x.data())); }
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: int_v) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE:: short_v) { return AVX::convert<uint, short>(x.data()); }
Vc_SIMD_CAST_1(AVX2:: uint_v, SSE::ushort_v) { return AVX::convert<uint, ushort>(x.data()); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::double_v) { return simd_cast<SSE::double_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: float_v) { return simd_cast<SSE:: float_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: int_v) { return simd_cast<SSE:: int_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE:: uint_v) { return simd_cast<SSE:: uint_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2:: short_v, SSE::ushort_v) { return simd_cast<SSE::ushort_v>(simd_cast<SSE:: short_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE::double_v) { return simd_cast<SSE::double_v>(simd_cast<SSE::ushort_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: float_v) { return simd_cast<SSE:: float_v>(simd_cast<SSE::ushort_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: int_v) { return simd_cast<SSE:: int_v>(simd_cast<SSE::ushort_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: uint_v) { return simd_cast<SSE:: uint_v>(simd_cast<SSE::ushort_v>(x)); }
Vc_SIMD_CAST_1(AVX2::ushort_v, SSE:: short_v) { return simd_cast<SSE:: short_v>(simd_cast<SSE::ushort_v>(x)); }
#endif
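// The two overloads below pack two AVX2::double_v (8 doubles) into one SSE::(u)short_v:
// _mm256_cvttpd_epi32 truncates each 4-double register to four 32-bit integers, and
// _mm_packs_epi32 / _mm_packus_epi32 then narrow to 16 bit with signed / unsigned
// saturation.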
Vc_SIMD_CAST_2(AVX2::double_v, SSE:: short_v) {
const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
return _mm_packs_epi32(tmp0, tmp1);
}
Vc_SIMD_CAST_2(AVX2::double_v, SSE::ushort_v) {
const auto tmp0 = _mm256_cvttpd_epi32(x0.data());
const auto tmp1 = _mm256_cvttpd_epi32(x1.data());
return _mm_packus_epi32(tmp0, tmp1);
}
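// Scalar -> AVX2 value casts: each overload places its operands into the lowest
// lanes of a 256-bit register and zero-fills the rest, either via _mm_setr_* plus
// AVX::zeroExtend or directly via _mm256_setr_*. The family extends up to 16
// operands for the 16-lane (u)short_v targets.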
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::double_v>::value>)
{
return AVX::zeroExtend(_mm_setr_pd(x.data(), 0.));
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
return AVX::zeroExtend(_mm_setr_ps(x.data(), 0.f, 0.f, 0.f));
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
return _mm256_setr_epi32(x.data(), 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
return _mm256_setr_epi32(uint(x.data()), 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::double_v>::value>)
{
return AVX::zeroExtend(_mm_setr_pd(x0.data(), x1.data()));
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), 0.f, 0.f));
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
return _mm256_setr_epi32(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::double_v>::value>)
{
return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), 0));
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()), 0, 0, 0,
0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::double_v>::value>)
{
return _mm256_setr_pd(x0.data(), x1.data(), x2.data(), x3.data());
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
return AVX::zeroExtend(_mm_setr_ps(x0.data(), x1.data(), x2.data(), x3.data()));
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
uint(x3.data()), 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
uint(x3.data()), uint(x4.data()), 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), 0, 0);
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
uint(x3.data()), uint(x4.data()), uint(x5.data()), 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), 0);
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
uint(x3.data()), uint(x4.data()), uint(x5.data()),
uint(x6.data()), 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::float_v>::value>)
{
return _mm256_setr_ps(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data());
}
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::int_v>::value>)
{
return _mm256_setr_epi32(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data());
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::uint_v>::value>)
{
return _mm256_setr_epi32(uint(x0.data()), uint(x1.data()), uint(x2.data()),
uint(x3.data()), uint(x4.data()), uint(x5.data()),
uint(x6.data()), uint(x7.data()));
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0,
0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), 0, 0, 0, 0, 0, 0,
0);
}
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0,
0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(), 0, 0,
0, 0, 0, 0);
}
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), 0, 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), 0, 0, 0, 0, 0);
}
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), 0, 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), 0, 0, 0, 0);
}
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), x12.data(), 0, 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), x12.data(), 0, 0, 0);
}
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), x12.data(), x13.data(), 0, 0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), x12.data(), x13.data(), 0, 0);
}
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
0);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
0);
}
#endif
#ifdef Vc_IMPL_AVX2
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
Scalar::Vector<T> x15, enable_if<std::is_same<Return, AVX2::short_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
x15.data());
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Vector<T> x0, Scalar::Vector<T> x1, Scalar::Vector<T> x2,
Scalar::Vector<T> x3, Scalar::Vector<T> x4, Scalar::Vector<T> x5,
Scalar::Vector<T> x6, Scalar::Vector<T> x7, Scalar::Vector<T> x8,
Scalar::Vector<T> x9, Scalar::Vector<T> x10, Scalar::Vector<T> x11,
Scalar::Vector<T> x12, Scalar::Vector<T> x13, Scalar::Vector<T> x14,
Scalar::Vector<T> x15, enable_if<std::is_same<Return, AVX2::ushort_v>::value>)
{
return _mm256_setr_epi16(x0.data(), x1.data(), x2.data(), x3.data(), x4.data(),
x5.data(), x6.data(), x7.data(), x8.data(), x9.data(),
x10.data(), x11.data(), x12.data(), x13.data(), x14.data(),
x15.data());
}
#endif
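// AVX2 vector -> scalar vector: only element 0 survives the cast. The generic
// AVX2 mask -> AVX2 mask overload below reinterprets the mask bits for the target
// element width via Detail::mask_cast.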
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To
simd_cast(AVX2::Vector<FromT> x, enable_if<Scalar::is_vector<To>::value>)
{
return static_cast<To>(x[0]);
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(const AVX2::Mask<T> &k, enable_if<AVX2::is_mask<Return>::value>)
{
return {Detail::mask_cast<Mask<T, VectorAbi::Avx>::Size, Return::Size,
typename Return::VectorTypeF>(k.dataI())};
}
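// Mask narrowing within AVX: all-ones / all-zero mask lanes survive saturating
// packs, so _mm_packs_epi32 / _mm256_packs_epi16 halve the lane width; in the
// 256-bit AVX2 variants Mem::permute4x64<X0, X2, X1, X3> restores the lane order
// that the per-128-bit-lane pack instructions scramble.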
Vc_SIMD_CAST_AVX_2(double_m, float_m) { return AVX::concat(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_2(double_m, int_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi32(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_AVX_2(double_m, uint_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi32(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_AVX_2(double_m, short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); }
Vc_SIMD_CAST_AVX_2(double_m, ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI())))); }
Vc_SIMD_CAST_AVX_2( float_m, short_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_AVX_2( float_m, ushort_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_AVX_2( int_m, short_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_AVX_2( int_m, ushort_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_AVX_2( uint_m, short_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_AVX_2( uint_m, ushort_m) { return Mem::permute4x64<X0, X2, X1, X3>(_mm256_packs_epi16(x0.dataI(), x1.dataI())); }
#endif
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_AVX_4(double_m, short_m)
{
using namespace AVX;
const auto tmp = _mm256_packs_epi32(_mm256_packs_epi32(x0.dataI(), x1.dataI()),
_mm256_packs_epi32(x2.dataI(), x3.dataI()));
return concat(_mm_unpacklo_epi32(lo128(tmp), hi128(tmp)),
_mm_unpackhi_epi32(lo128(tmp), hi128(tmp)));
}
Vc_SIMD_CAST_AVX_4(double_m, ushort_m) { return simd_cast<AVX2::short_m>(x0, x1, x2, x3).data(); }
#endif
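// One-operand SSE mask -> AVX2 mask casts: masks of equal lane width are simply
// zero-extended (the new upper lanes read as false); casts to wider lanes duplicate
// each source lane with unpack instructions so that, e.g., one float_m lane becomes
// one double_m lane.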
Vc_SIMD_CAST_1(SSE::double_m, AVX2::double_m) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(simd_cast<SSE:: float_m>(x).data()); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(simd_cast<SSE:: int_m>(x).data()); }
Vc_SIMD_CAST_1(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(simd_cast<SSE:: uint_m>(x).data()); }
Vc_SIMD_CAST_1(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
Vc_SIMD_CAST_1(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
#endif
Vc_SIMD_CAST_1(SSE:: float_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); }
Vc_SIMD_CAST_1(SSE:: int_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); }
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::double_m) { return AVX::concat(_mm_unpacklo_ps(x.dataF(), x.dataF()), _mm_unpackhi_ps(x.dataF(), x.dataF())); }
Vc_SIMD_CAST_1(SSE:: short_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::double_m) { auto tmp = _mm_unpacklo_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); }
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); }
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: float_m) { return AVX::zeroExtend(x.dataF()); }
Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); }
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: float_m) { return AVX::concat(_mm_unpacklo_epi16(x.dataI(), x.dataI()), _mm_unpackhi_epi16(x.dataI(), x.dataI())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: int_m) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: uint_m) { return AVX::zeroExtend(x.data()); }
Vc_SIMD_CAST_1(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
Vc_SIMD_CAST_1(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: short_m) { return AVX::zeroExtend(simd_cast<SSE:: short_m>(x).data()); }
Vc_SIMD_CAST_1(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
Vc_SIMD_CAST_1(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
Vc_SIMD_CAST_1(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
Vc_SIMD_CAST_1(SSE:: short_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2::ushort_m) { return AVX::zeroExtend(simd_cast<SSE::ushort_m>(x).data()); }
Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: int_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
Vc_SIMD_CAST_1(SSE:: short_m, AVX2:: uint_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: int_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
Vc_SIMD_CAST_1(SSE::ushort_m, AVX2:: uint_m) { const auto v = Mem::permute4x64<X0, X2, X1, X3>(AVX::avx_cast<__m256i>(x.data())); return _mm256_unpacklo_epi16(v, v); }
#endif
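// Two- and four-operand SSE mask -> AVX2 mask casts: equal lane widths are
// concatenated; narrower targets pack the operands with saturating packs and
// zero-extend when fewer than 256 bits of result are produced.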
Vc_SIMD_CAST_2(SSE::double_m, AVX2::double_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::double_m, AVX2:: float_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: float_m) { return AVX::concat(x0.data(), x1.data()); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_2(SSE::double_m, AVX2:: int_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_2(SSE::double_m, AVX2:: uint_m) { return AVX::zeroExtend(_mm_packs_epi32(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_2(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); }
Vc_SIMD_CAST_2(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_setzero_si128())); }
Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: float_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_2(SSE:: float_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: int_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_2(SSE:: int_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: int_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: uint_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_2(SSE:: uint_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(x0.dataI(), x1.dataI())); }
Vc_SIMD_CAST_2(SSE:: short_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE:: short_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::ushort_m, AVX2:: short_m) { return AVX::concat(x0.data(), x1.data()); }
Vc_SIMD_CAST_2(SSE::ushort_m, AVX2::ushort_m) { return AVX::concat(x0.data(), x1.data()); }
#endif
Vc_SIMD_CAST_4(SSE::double_m, AVX2:: float_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_4(SSE::double_m, AVX2:: int_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); }
Vc_SIMD_CAST_4(SSE::double_m, AVX2:: uint_m) { return AVX::concat(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI())); }
Vc_SIMD_CAST_4(SSE::double_m, AVX2:: short_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); }
Vc_SIMD_CAST_4(SSE::double_m, AVX2::ushort_m) { return AVX::zeroExtend(_mm_packs_epi16(_mm_packs_epi32(x0.dataI(), x1.dataI()), _mm_packs_epi32(x2.dataI(), x3.dataI()))); }
Vc_SIMD_CAST_4(SSE:: float_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
Vc_SIMD_CAST_4(SSE:: float_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
Vc_SIMD_CAST_4(SSE:: int_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
Vc_SIMD_CAST_4(SSE:: int_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
Vc_SIMD_CAST_4(SSE:: uint_m, AVX2:: short_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
Vc_SIMD_CAST_4(SSE:: uint_m, AVX2::ushort_m) { return AVX::concat(_mm_packs_epi16(x0.dataI(), x1.dataI()), _mm_packs_epi16(x2.dataI(), x3.dataI())); }
#endif
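// Scalar mask -> AVX2 mask: start from an all-false mask of the target type and set
// the leading elements from the scalar operands one by one.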
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> k, enable_if<AVX2::is_mask<Return>::value>)
{
Return r{false};
r[0] = k.data();
return r;
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1,
enable_if<AVX2::is_mask<Return>::value>)
{
Return r{false};
r[0] = k0.data();
r[1] = k1.data();
return r;
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 4)>)
{
Return r{false};
r[0] = k0.data();
r[1] = k1.data();
r[2] = k2.data();
r[3] = k3.data();
return r;
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 8)>)
{
Return r{false};
r[0] = k0.data();
r[1] = k1.data();
r[2] = k2.data();
r[3] = k3.data();
r[4] = k4.data();
r[5] = k5.data();
r[6] = k6.data();
r[7] = k7.data();
return r;
}
template <typename Return, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(Scalar::Mask<T> k0, Scalar::Mask<T> k1, Scalar::Mask<T> k2, Scalar::Mask<T> k3,
Scalar::Mask<T> k4, Scalar::Mask<T> k5, Scalar::Mask<T> k6, Scalar::Mask<T> k7,
Scalar::Mask<T> k8, Scalar::Mask<T> k9, Scalar::Mask<T> k10,
Scalar::Mask<T> k11, Scalar::Mask<T> k12, Scalar::Mask<T> k13,
Scalar::Mask<T> k14, Scalar::Mask<T> k15,
enable_if<(AVX2::is_mask<Return>::value && Return::Size >= 16)>)
{
Return r{false};
r[0] = k0.data();
r[1] = k1.data();
r[2] = k2.data();
r[3] = k3.data();
r[4] = k4.data();
r[5] = k5.data();
r[6] = k6.data();
r[7] = k7.data();
r[8] = k8.data();
r[9] = k9.data();
r[10] = k10.data();
r[11] = k11.data();
r[12] = k12.data();
r[13] = k13.data();
r[14] = k14.data();
r[15] = k15.data();
return r;
}
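// One-operand AVX2 mask -> SSE mask casts: matching widths keep the low 128 bits;
// narrower targets pack the low and high halves down with saturating packs; the
// 16-lane (u)short_m sources go through the corresponding SSE-level cast of their
// low half.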
Vc_SIMD_CAST_1(AVX2::double_m, SSE::double_m) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2::double_m, SSE:: float_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2::double_m, SSE:: int_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2::double_m, SSE:: uint_m) { return _mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); }
Vc_SIMD_CAST_1(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())), _mm_setzero_si128()); }
Vc_SIMD_CAST_1(AVX2:: float_m, SSE::double_m) { return _mm_unpacklo_ps(AVX::lo128(x.data()), AVX::lo128(x.data())); }
Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: float_m) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: int_m) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: uint_m) { return AVX::lo128(x.data()); }
Vc_SIMD_CAST_1(AVX2:: float_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2:: float_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
#ifdef Vc_IMPL_AVX2
Vc_SIMD_CAST_1(AVX2:: int_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: float_m) { return AVX::lo128(x.dataI()); }
Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: int_m) { return AVX::lo128(x.dataI()); }
Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); }
Vc_SIMD_CAST_1(AVX2:: int_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2:: int_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::double_m) { return _mm_unpacklo_epi32(AVX::lo128(x.dataI()), AVX::lo128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: float_m) { return AVX::lo128(x.dataI()); }
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: int_m) { return AVX::lo128(x.dataI()); }
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: uint_m) { return AVX::lo128(x.dataI()); }
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE:: short_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2:: uint_m, SSE::ushort_m) { return _mm_packs_epi16(AVX::lo128(x.dataI()), AVX::hi128(x.dataI())); }
Vc_SIMD_CAST_1(AVX2:: short_m, SSE::double_m) { return simd_cast<SSE::double_m>(SSE::short_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: float_m) { return simd_cast<SSE:: float_m>(SSE::short_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: int_m) { return simd_cast<SSE:: int_m>(SSE::short_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: uint_m) { return simd_cast<SSE:: uint_m>(SSE::short_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2:: short_m, SSE:: short_m) { return simd_cast<SSE:: short_m>(SSE::short_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2:: short_m, SSE::ushort_m) { return simd_cast<SSE::ushort_m>(SSE::short_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::double_m) { return simd_cast<SSE::double_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: float_m) { return simd_cast<SSE:: float_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: int_m) { return simd_cast<SSE:: int_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: uint_m) { return simd_cast<SSE:: uint_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE:: short_m) { return simd_cast<SSE:: short_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
Vc_SIMD_CAST_1(AVX2::ushort_m, SSE::ushort_m) { return simd_cast<SSE::ushort_m>(SSE::ushort_m(AVX::lo128(x.data()))); }
#endif
Vc_SIMD_CAST_2(AVX2::double_m, SSE:: short_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); }
Vc_SIMD_CAST_2(AVX2::double_m, SSE::ushort_m) { return _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(x0.dataI()), AVX::hi128(x0.dataI())), _mm_packs_epi32(AVX::lo128(x1.dataI()), AVX::hi128(x1.dataI()))); }
template <typename To, typename FromT>
Vc_INTRINSIC Vc_CONST To
simd_cast(AVX2::Mask<FromT> x, enable_if<Scalar::is_mask<To>::value>)
{
return static_cast<To>(x[0]);
}
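// Offset-based casts. With offset == 0 the two forwarding overloads below simply
// dispatch to the plain simd_cast; a non-zero offset selects which chunk of the
// source register maps to the (smaller) destination.
// A hedged usage sketch (hypothetical values):
//   AVX2::float_v v = AVX2::float_v::IndexesFromZero();  // 0 1 2 3 4 5 6 7
//   SSE::float_v upper = simd_cast<SSE::float_v, 1>(v);  // 4 5 6 7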
template <typename Return, int offset, typename From>
Vc_INTRINSIC Vc_CONST enable_if<
(offset == 0 &&
((AVX2::is_vector<From>::value && !Scalar::is_vector<Return>::value &&
Traits::is_simd_vector<Return>::value && !Traits::isSimdArray<Return>::value) ||
(AVX2::is_mask<From>::value && !Scalar::is_mask<Return>::value &&
Traits::is_simd_mask<Return>::value &&
!Traits::isSimdMaskArray<Return>::value))),
Return>
simd_cast(const From &x)
{
return simd_cast<Return>(x);
}
template <typename Return, int offset, typename From>
Vc_INTRINSIC Vc_CONST Return
simd_cast(const From &x,
enable_if<offset == 0 && ((SSE::is_vector<From>::value &&
AVX2::is_vector<Return>::value) ||
(SSE::is_mask<From>::value &&
AVX2::is_mask<Return>::value))>)
{
return simd_cast<Return>(x);
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(AVX2::is_vector<Return>::value && offset != 0),
Return>
simd_cast(AVX2::Vector<T> x)
{
using V = AVX2::Vector<T>;
constexpr int shift = sizeof(T) * offset * Return::Size;
static_assert(shift > 0 && shift < sizeof(x), "");
if (shift < 16) {
return simd_cast<Return>(V{AVX::avx_cast<typename V::VectorType>(
_mm_srli_si128(AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))});
} else if (shift == 16) {
return simd_cast<Return>(V{Mem::permute128<X1, Const0>(x.data())});
} else {
#ifdef Vc_MSVC
#pragma warning(push)
#pragma warning(disable : 4556)
#endif
return simd_cast<Return>(V{AVX::avx_cast<typename V::VectorType>(
_mm_srli_si128(AVX::avx_cast<__m128i>(AVX::hi128(x.data())), shift - 16))});
#ifdef Vc_MSVC
#pragma warning(pop)
#endif
}
}
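// Extracting an SSE-sized piece of a 256-bit AVX2 vector: the byte offset of the
// requested chunk is computed at compile time; a shift of 16 bytes is the plain high
// half, anything else is produced with _mm_alignr_epi8 across the two 128-bit halves.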
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
sizeof(AVX2::Vector<T>) == 32),
Return>
simd_cast(AVX2::Vector<T> x)
{
using V = AVX2::Vector<T>;
constexpr int shift = sizeof(V) / V::Size * offset * Return::Size;
static_assert(shift > 0, "");
static_assert(shift < sizeof(V), "");
using SseVector = SSE::Vector<typename V::EntryType>;
if (shift == 16) {
return simd_cast<Return>(SseVector{AVX::hi128(x.data())});
}
using Intrin = typename SseVector::VectorType;
return simd_cast<Return>(SseVector{AVX::avx_cast<Intrin>(
_mm_alignr_epi8(AVX::avx_cast<__m128i>(AVX::hi128(x.data())),
AVX::avx_cast<__m128i>(AVX::lo128(x.data())), shift))});
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_vector<Return>::value &&
sizeof(AVX2::Vector<T>) == 16),
Return>
simd_cast(AVX2::Vector<T> x)
{
using V = AVX2::Vector<T>;
constexpr int shift = sizeof(V) / V::Size * offset * Return::Size;
static_assert(shift > 0, "");
static_assert(shift < sizeof(V), "");
using SseVector = SSE::Vector<typename V::EntryType>;
return simd_cast<Return>(SseVector{_mm_srli_si128(x.data(), shift)});
}
Vc_SIMD_CAST_OFFSET(SSE:: short_v, AVX2::double_v, 1) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v, 1>(x)); }
Vc_SIMD_CAST_OFFSET(SSE::ushort_v, AVX2::double_v, 1) { return simd_cast<AVX2::double_v>(simd_cast<SSE::int_v, 1>(x)); }
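// Offset-based extraction for masks mirrors the vector case: the AVX2 -> AVX2
// overloads pick the requested half or quarter of the source mask and widen its
// lanes back with unpack instructions; the AVX2 -> SSE overloads further below
// either take the high 128 bits or shift with _mm_alignr_epi8.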
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(const AVX2::Mask<T> &k,
enable_if<(AVX2::is_mask<Return>::value && offset == 1 &&
AVX2::Mask<T>::Size == Return::Size * 2)> = nullarg)
{
const auto tmp = AVX::hi128(k.dataI());
return AVX::concat(_mm_unpacklo_epi8(tmp, tmp), _mm_unpackhi_epi8(tmp, tmp));
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(const AVX2::Mask<T> &k,
enable_if<(AVX2::is_mask<Return>::value && offset == 1 &&
AVX2::Mask<T>::Size == Return::Size * 4)> = nullarg)
{
auto tmp = AVX::lo128(k.dataI());
tmp = _mm_unpackhi_epi8(tmp, tmp);
return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp));
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(const AVX2::Mask<T> &k,
enable_if<(AVX2::is_mask<Return>::value && offset == 2 &&
AVX2::Mask<T>::Size == Return::Size * 4)> = nullarg)
{
auto tmp = AVX::hi128(k.dataI());
tmp = _mm_unpacklo_epi8(tmp, tmp);
return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp));
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST Return
simd_cast(const AVX2::Mask<T> &k,
enable_if<(AVX2::is_mask<Return>::value && offset == 3 &&
AVX2::Mask<T>::Size == Return::Size * 4)> = nullarg)
{
auto tmp = AVX::hi128(k.dataI());
tmp = _mm_unpackhi_epi8(tmp, tmp);
return AVX::concat(_mm_unpacklo_epi16(tmp, tmp), _mm_unpackhi_epi16(tmp, tmp));
}
Vc_SIMD_CAST_OFFSET(SSE:: short_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
Vc_SIMD_CAST_OFFSET(SSE::ushort_m, AVX2::double_m, 1) { auto tmp = _mm_unpackhi_epi16(x.dataI(), x.dataI()); return AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)); }
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
sizeof(AVX2::Mask<T>) == 32),
Return>
simd_cast(AVX2::Mask<T> x)
{
using M = AVX2::Mask<T>;
constexpr int shift = sizeof(M) / M::Size * offset * Return::Size;
static_assert(shift > 0, "");
static_assert(shift < sizeof(M), "");
using SseVector = SSE::Mask<Traits::entry_type_of<typename M::Vector>>;
if (shift == 16) {
return simd_cast<Return>(SseVector{AVX::hi128(x.data())});
}
using Intrin = typename SseVector::VectorType;
return simd_cast<Return>(SseVector{AVX::avx_cast<Intrin>(
_mm_alignr_epi8(AVX::hi128(x.dataI()), AVX::lo128(x.dataI()), shift))});
}
template <typename Return, int offset, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(offset != 0 && SSE::is_mask<Return>::value &&
sizeof(AVX2::Mask<T>) == 16),
Return>
simd_cast(AVX2::Mask<T> x)
{
return simd_cast<Return, offset>(simd_cast<SSE::Mask<T>>(x));
}
#undef Vc_SIMD_CAST_AVX_1
#undef Vc_SIMD_CAST_AVX_2
#undef Vc_SIMD_CAST_AVX_3
#undef Vc_SIMD_CAST_AVX_4
#undef Vc_SIMD_CAST_1
#undef Vc_SIMD_CAST_2
#undef Vc_SIMD_CAST_3
#undef Vc_SIMD_CAST_4
#undef Vc_SIMD_CAST_5
#undef Vc_SIMD_CAST_6
#undef Vc_SIMD_CAST_7
#undef Vc_SIMD_CAST_8
#undef Vc_SIMD_CAST_OFFSET
}
#endif
#endif
#endif
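// Convenience aliases for the default (compile-time selected) ABI: double_v,
// float_v, int_v, ... and the matching mask types double_m, float_m, ...
// Their width depends on the instruction set Vc was compiled for; the
// static_asserts against the Vc_*_V_SIZE macros further down verify that.
//
// Hypothetical usage sketch, assuming an SSE build where float_v::Size == 4:
//   Vc::float_v x = Vc::float_v::IndexesFromZero(); // [0 1 2 3]
//   Vc::float_m positive = x > 1.f;                 // [false false true true]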
namespace Vc_VERSIONED_NAMESPACE
{
using double_v = Vector<double>;
using float_v = Vector<float>;
using int_v = Vector<int>;
using uint_v = Vector<uint>;
using short_v = Vector<short>;
using ushort_v = Vector<ushort>;
using llong_v = Vector<llong>;
using ullong_v = Vector<ullong>;
using long_v = Vector<long>;
using ulong_v = Vector<ulong>;
using schar_v = Vector<schar>;
using uchar_v = Vector<uchar>;
using double_m = Mask<double>;
using float_m = Mask< float>;
using llong_m = Mask< llong>;
using ullong_m = Mask<ullong>;
using long_m = Mask< long>;
using ulong_m = Mask< ulong>;
using int_m = Mask< int>;
using uint_m = Mask< uint>;
using short_m = Mask< short>;
using ushort_m = Mask<ushort>;
using schar_m = Mask< schar>;
using uchar_m = Mask< uchar>;
typedef Vector<std:: int_least64_t> int_least64_v;
typedef Vector<std::uint_least64_t> uint_least64_v;
typedef Vector<std:: int_least32_t> int_least32_v;
typedef Vector<std::uint_least32_t> uint_least32_v;
typedef Vector<std:: int_least16_t> int_least16_v;
typedef Vector<std::uint_least16_t> uint_least16_v;
typedef Vector<std:: int_least8_t> int_least8_v;
typedef Vector<std:: uint_least8_t> uint_least8_v;
typedef Mask<std:: int_least64_t> int_least64_m;
typedef Mask<std::uint_least64_t> uint_least64_m;
typedef Mask<std:: int_least32_t> int_least32_m;
typedef Mask<std::uint_least32_t> uint_least32_m;
typedef Mask<std:: int_least16_t> int_least16_m;
typedef Mask<std::uint_least16_t> uint_least16_m;
typedef Mask<std:: int_least8_t> int_least8_m;
typedef Mask<std:: uint_least8_t> uint_least8_m;
typedef Vector<std:: int_fast64_t> int_fast64_v;
typedef Vector<std::uint_fast64_t> uint_fast64_v;
typedef Vector<std:: int_fast32_t> int_fast32_v;
typedef Vector<std::uint_fast32_t> uint_fast32_v;
typedef Vector<std:: int_fast16_t> int_fast16_v;
typedef Vector<std::uint_fast16_t> uint_fast16_v;
typedef Vector<std:: int_fast8_t> int_fast8_v;
typedef Vector<std:: uint_fast8_t> uint_fast8_v;
typedef Mask<std:: int_fast64_t> int_fast64_m;
typedef Mask<std::uint_fast64_t> uint_fast64_m;
typedef Mask<std:: int_fast32_t> int_fast32_m;
typedef Mask<std::uint_fast32_t> uint_fast32_m;
typedef Mask<std:: int_fast16_t> int_fast16_m;
typedef Mask<std::uint_fast16_t> uint_fast16_m;
typedef Mask<std:: int_fast8_t> int_fast8_m;
typedef Mask<std:: uint_fast8_t> uint_fast8_m;
#if defined INT64_MAX && defined UINT64_MAX
typedef Vector<std:: int64_t> int64_v;
typedef Vector<std::uint64_t> uint64_v;
typedef Mask<std:: int64_t> int64_m;
typedef Mask<std::uint64_t> uint64_m;
#endif
#if defined INT32_MAX && defined UINT32_MAX
typedef Vector<std:: int32_t> int32_v;
typedef Vector<std::uint32_t> uint32_v;
typedef Mask<std:: int32_t> int32_m;
typedef Mask<std::uint32_t> uint32_m;
#endif
#if defined INT16_MAX && defined UINT16_MAX
typedef Vector<std:: int16_t> int16_v;
typedef Vector<std::uint16_t> uint16_v;
typedef Mask<std:: int16_t> int16_m;
typedef Mask<std::uint16_t> uint16_m;
#endif
#if defined INT8_MAX && defined UINT8_MAX
typedef Vector<std:: int8_t> int8_v;
typedef Vector<std::uint8_t> uint8_v;
typedef Mask<std:: int8_t> int8_m;
typedef Mask<std::uint8_t> uint8_m;
#endif
namespace {
static_assert(double_v::Size == Vc_DOUBLE_V_SIZE, "Vc_DOUBLE_V_SIZE macro defined to an incorrect value");
static_assert(float_v::Size == Vc_FLOAT_V_SIZE , "Vc_FLOAT_V_SIZE macro defined to an incorrect value ");
static_assert(int_v::Size == Vc_INT_V_SIZE , "Vc_INT_V_SIZE macro defined to an incorrect value ");
static_assert(uint_v::Size == Vc_UINT_V_SIZE , "Vc_UINT_V_SIZE macro defined to an incorrect value ");
static_assert(short_v::Size == Vc_SHORT_V_SIZE , "Vc_SHORT_V_SIZE macro defined to an incorrect value ");
static_assert(ushort_v::Size == Vc_USHORT_V_SIZE, "Vc_USHORT_V_SIZE macro defined to an incorrect value");
}
}
#ifndef COMMON_OPERATORS_H_
#define COMMON_OPERATORS_H_
#ifndef VC_COMMON_SIMDARRAY_H_
#define VC_COMMON_SIMDARRAY_H_
#include <array>
#include <limits>
#ifndef VC_COMMON_SIMDARRAYHELPER_H_
#define VC_COMMON_SIMDARRAYHELPER_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
static constexpr struct private_init_t {} private_init = {};
}
namespace Common
{
namespace Operations
{
struct tag {};
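// The Vc_DEFINE_OPERATION* macros below generate small function objects (all
// derived from `tag`) that forward to the member function or free function of
// the same name. SimdArray and SimdMaskArray use them (via fromOperation and
// unpackArgumentsAuto further down) to apply one operation per storage piece.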
#define Vc_DEFINE_OPERATION(name_) \
struct name_ : public tag { \
template <typename V, typename... Args> \
Vc_INTRINSIC void operator()(V &v, Args &&... args) \
{ \
v.name_(std::forward<Args>(args)...); \
} \
}
Vc_DEFINE_OPERATION(gather);
Vc_DEFINE_OPERATION(scatter);
Vc_DEFINE_OPERATION(load);
Vc_DEFINE_OPERATION(store);
Vc_DEFINE_OPERATION(setZero);
Vc_DEFINE_OPERATION(setZeroInverted);
Vc_DEFINE_OPERATION(assign);
#undef Vc_DEFINE_OPERATION
#define Vc_DEFINE_OPERATION(name_,code_) \
struct name_ : public tag { \
template <typename V> Vc_INTRINSIC void operator()(V &v) { code_; } \
}
Vc_DEFINE_OPERATION(increment, ++(v));
Vc_DEFINE_OPERATION(decrement, --(v));
Vc_DEFINE_OPERATION(random, v = V::Random());
#undef Vc_DEFINE_OPERATION
#define Vc_DEFINE_OPERATION_FORWARD(name_) \
struct Forward_##name_ : public tag \
{ \
template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
Vc_INTRINSIC void operator()(decltype(name_(std::declval<Args>()...)) &v, \
Args &&... args) \
{ \
v = name_(std::forward<Args>(args)...); \
} \
template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
Vc_INTRINSIC void operator()(std::nullptr_t, Args && ... args) \
{ \
name_(std::forward<Args>(args)...); \
} \
}
Vc_DEFINE_OPERATION_FORWARD(abs);
Vc_DEFINE_OPERATION_FORWARD(asin);
Vc_DEFINE_OPERATION_FORWARD(atan);
Vc_DEFINE_OPERATION_FORWARD(atan2);
Vc_DEFINE_OPERATION_FORWARD(cos);
Vc_DEFINE_OPERATION_FORWARD(ceil);
Vc_DEFINE_OPERATION_FORWARD(copysign);
Vc_DEFINE_OPERATION_FORWARD(exp);
Vc_DEFINE_OPERATION_FORWARD(exponent);
Vc_DEFINE_OPERATION_FORWARD(fma);
Vc_DEFINE_OPERATION_FORWARD(floor);
Vc_DEFINE_OPERATION_FORWARD(frexp);
Vc_DEFINE_OPERATION_FORWARD(isfinite);
Vc_DEFINE_OPERATION_FORWARD(isinf);
Vc_DEFINE_OPERATION_FORWARD(isnan);
Vc_DEFINE_OPERATION_FORWARD(isnegative);
Vc_DEFINE_OPERATION_FORWARD(ldexp);
Vc_DEFINE_OPERATION_FORWARD(log);
Vc_DEFINE_OPERATION_FORWARD(log10);
Vc_DEFINE_OPERATION_FORWARD(log2);
Vc_DEFINE_OPERATION_FORWARD(reciprocal);
Vc_DEFINE_OPERATION_FORWARD(round);
Vc_DEFINE_OPERATION_FORWARD(rsqrt);
Vc_DEFINE_OPERATION_FORWARD(sin);
Vc_DEFINE_OPERATION_FORWARD(sincos);
Vc_DEFINE_OPERATION_FORWARD(sqrt);
Vc_DEFINE_OPERATION_FORWARD(trunc);
Vc_DEFINE_OPERATION_FORWARD(min);
Vc_DEFINE_OPERATION_FORWARD(max);
#undef Vc_DEFINE_OPERATION_FORWARD
template<typename T> using is_operation = std::is_base_of<tag, T>;
}
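// Segment<T, Pieces, Index> refers to the Index-th of Pieces equally sized
// slices of a vector (or of a pointer to one). to_fixed_size() converts the
// referenced slice into the corresponding fixed_size_simd so that generic code
// can consume it without knowing the original vector width.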
template <typename T_, std::size_t Pieces_, std::size_t Index_> struct Segment
{
static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
using type = T_;
using type_decayed = typename std::decay<type>::type;
static constexpr std::size_t Pieces = Pieces_;
static constexpr std::size_t Index = Index_;
using fixed_size_type =
fixed_size_simd<conditional_t<Traits::is_simd_vector<type_decayed>::value,
typename type_decayed::EntryType, float>,
type_decayed::Size / Pieces>;
type data;
static constexpr std::size_t EntryOffset = Index * type_decayed::Size / Pieces;
decltype(std::declval<const type &>()[0]) operator[](size_t i) const { return data[i + EntryOffset]; }
fixed_size_type to_fixed_size() const
{
return simd_cast<fixed_size_type, Index>(data);
}
};
template <typename T_, std::size_t Pieces_, std::size_t Index_>
struct Segment<T_ *, Pieces_, Index_> {
static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
using type = T_ *;
using type_decayed = typename std::decay<T_>::type;
static constexpr size_t Pieces = Pieces_;
static constexpr size_t Index = Index_;
using fixed_size_type = fixed_size_simd<
typename std::conditional<Traits::is_simd_vector<type_decayed>::value,
typename type_decayed::VectorEntryType, float>::type,
type_decayed::Size / Pieces> *;
type data;
static constexpr std::size_t EntryOffset = Index * type_decayed::size() / Pieces;
fixed_size_type to_fixed_size() const
{
return reinterpret_cast<
#ifdef Vc_GCC
typename std::remove_pointer<fixed_size_type>::type
#else
MayAlias<typename std::remove_pointer<fixed_size_type>::type>
#endif
*>(data) +
Index;
}
};
template <typename T, std::size_t Offset> struct AddOffset
{
constexpr AddOffset() = default;
};
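// Split<secondOffset> decomposes an argument into a low and a high half for the
// recursive SimdArray/SimdMaskArray implementation: SimdArrays split into their
// internal_data0/internal_data1 members, native vectors and masks are wrapped
// in (or cast to) half-sized Segments, pointers are advanced by secondOffset,
// and arguments without a lo/hi decomposition are passed through unchanged.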
template <std::size_t secondOffset> class Split
{
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto loImpl(const SimdArray<U, N, V, M> &x)
-> decltype(internal_data0(x))
{
return internal_data0(x);
}
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto hiImpl(const SimdArray<U, N, V, M> &x)
-> decltype(internal_data1(x))
{
return internal_data1(x);
}
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto loImpl(SimdArray<U, N, V, M> *x)
-> decltype(&internal_data0(*x))
{
return &internal_data0(*x);
}
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto hiImpl(SimdArray<U, N, V, M> *x)
-> decltype(&internal_data1(*x))
{
return &internal_data1(*x);
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V, 2, 0> loImpl(const SimdArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(const SimdArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V *, 2, 0> loImpl(SimdArray<U, N, V, N> *x)
{
return {&internal_data(*x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V *, 2, 1> hiImpl(SimdArray<U, N, V, N> *x)
{
return {&internal_data(*x)};
}
template <typename U, std::size_t N, typename V, std::size_t M>
static Vc_INTRINSIC auto loImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data0(x))
{
return internal_data0(x);
}
template <typename U, std::size_t N, typename V, std::size_t M>
static Vc_INTRINSIC auto hiImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data1(x))
{
return internal_data1(x);
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 0> loImpl(
const SimdMaskArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 1> hiImpl(
const SimdMaskArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
#ifdef Vc_IMPL_AVX
template <class T>
static Vc_INTRINSIC SSE::Vector<T> loImpl(Vector<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Vector<T>, 0>(x);
}
template <class T>
static Vc_INTRINSIC SSE::Vector<T> hiImpl(Vector<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Vector<T>, 1>(x);
}
template <class T>
static Vc_INTRINSIC SSE::Mask<T> loImpl(Mask<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Mask<T>, 0>(x);
}
template <class T>
static Vc_INTRINSIC SSE::Mask<T> hiImpl(Mask<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Mask<T>, 1>(x);
}
#endif
template <typename T>
static constexpr bool is_vector_or_mask() {
return (Traits::is_simd_vector<T>::value && !Traits::isSimdArray<T>::value) ||
(Traits::is_simd_mask<T>::value && !Traits::isSimdMaskArray<T>::value);
}
template <typename V>
static Vc_INTRINSIC Segment<V, 2, 0> loImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
{
return {std::forward<V>(x)};
}
template <typename V>
static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
{
return {std::forward<V>(x)};
}
template <class T, class A>
static Vc_INTRINSIC const T *loImpl(const std::vector<T, A> &x)
{
return x.data();
}
template <class T, class A>
static Vc_INTRINSIC const T *hiImpl(const std::vector<T, A> &x)
{
return x.data() + secondOffset;
}
template <typename V, std::size_t Pieces, std::size_t Index>
static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index> loImpl(
const Segment<V, Pieces, Index> &x)
{
return {x.data};
}
template <typename V, std::size_t Pieces, std::size_t Index>
static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index + 1> hiImpl(
const Segment<V, Pieces, Index> &x)
{
return {x.data};
}
template <typename T, typename = decltype(loImpl(std::declval<T>()))>
static std::true_type have_lo_impl(int);
template <typename T> static std::false_type have_lo_impl(float);
template <typename T> static constexpr bool have_lo_impl()
{
return decltype(have_lo_impl<T>(1))::value;
}
template <typename T, typename = decltype(hiImpl(std::declval<T>()))>
static std::true_type have_hi_impl(int);
template <typename T> static std::false_type have_hi_impl(float);
template <typename T> static constexpr bool have_hi_impl()
{
return decltype(have_hi_impl<T>(1))::value;
}
public:
template <typename U>
static Vc_INTRINSIC const U *lo(Operations::gather, const U *ptr)
{
return ptr;
}
template <typename U>
static Vc_INTRINSIC const U *hi(Operations::gather, const U *ptr)
{
return ptr + secondOffset;
}
template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>()))
lo(Operations::gather, U &&x)
{
return loImpl(std::forward<U>(x));
}
template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>()))
hi(Operations::gather, U &&x)
{
return hiImpl(std::forward<U>(x));
}
template <typename U>
static Vc_INTRINSIC const U *lo(Operations::scatter, const U *ptr)
{
return ptr;
}
template <typename U>
static Vc_INTRINSIC const U *hi(Operations::scatter, const U *ptr)
{
return ptr + secondOffset;
}
template <typename U>
static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>())) lo(U &&x)
{
return loImpl(std::forward<U>(x));
}
template <typename U>
static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>())) hi(U &&x)
{
return hiImpl(std::forward<U>(x));
}
template <typename U>
static Vc_ALWAYS_INLINE enable_if<!have_lo_impl<U>(), U> lo(U &&x)
{
return std::forward<U>(x);
}
template <typename U>
static Vc_ALWAYS_INLINE enable_if<!have_hi_impl<U>(), U> hi(U &&x)
{
return std::forward<U>(x);
}
};
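// actual_value() strips the SimdArray/SimdMaskArray wrapper (or materializes a
// Segment as a fixed_size value) so that the operation object can be invoked
// on the underlying implementation vector directly.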
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC const V &actual_value(Op, const SimdArray<U, M, V, M> &x)
{
return internal_data(x);
}
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC V *actual_value(Op, SimdArray<U, M, V, M> *x)
{
return &internal_data(*x);
}
template <typename Op, typename T, size_t Pieces, size_t Index>
static Vc_INTRINSIC typename Segment<T, Pieces, Index>::fixed_size_type actual_value(
Op, Segment<T, Pieces, Index> &&seg)
{
return seg.to_fixed_size();
}
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC const typename V::Mask &actual_value(Op, const SimdMaskArray<U, M, V, M> &x)
{
return internal_data(x);
}
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC typename V::Mask *actual_value(Op, SimdMaskArray<U, M, V, M> *x)
{
return &internal_data(*x);
}
template <typename Op, typename Arg>
Vc_INTRINSIC decltype(actual_value(std::declval<Op &>(), std::declval<Arg>()))
conditionalUnpack(std::true_type, Op op, Arg &&arg)
{
return actual_value(op, std::forward<Arg>(arg));
}
template <typename Op, typename Arg>
Vc_INTRINSIC Arg conditionalUnpack(std::false_type, Op, Arg &&arg)
{
return std::forward<Arg>(arg);
}
template <size_t A, size_t B>
struct selectorType : public std::integral_constant<bool, !((A & (size_t(1) << B)) != 0)> {
};
template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
Vc_INTRINSIC decltype(std::declval<Op &>()(std::declval<R &>(),
conditionalUnpack(selectorType<I, Indexes>(),
std::declval<Op &>(),
std::declval<Args>())...))
unpackArgumentsAutoImpl(int, index_sequence<Indexes...>, Op op, R &&r, Args &&... args)
{
op(std::forward<R>(r),
conditionalUnpack(selectorType<I, Indexes>(), op, std::forward<Args>(args))...);
}
template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
Vc_INTRINSIC enable_if<(I <= (size_t(1) << sizeof...(Args))), void> unpackArgumentsAutoImpl(
float, index_sequence<Indexes...> is, Op op, R &&r, Args &&... args)
{
static_assert(
I < (1 << sizeof...(Args)) - (std::is_same<R, std::nullptr_t>::value ? 1 : 0),
"Vc or compiler bug. Please report. Failed to find a combination of "
"actual_value(arg) transformations that allows calling Op.");
unpackArgumentsAutoImpl<I + 1, Op, R, Args...>(int(), is, op, std::forward<R>(r),
std::forward<Args>(args)...);
}
#ifdef Vc_ICC
template <size_t, typename... Ts> struct IccWorkaround {
using type = void;
};
template <typename... Ts> struct IccWorkaround<2, Ts...> {
using type = typename std::remove_pointer<typename std::decay<
typename std::tuple_element<1, std::tuple<Ts...>>::type>::type>::type;
};
#endif
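// unpackArgumentsAuto tries, for every argument, both the wrapped and the
// unwrapped (actual_value) form. The bits of the counter I select one
// combination per recursion step; the int/float overload pair picks the first
// combination for which Op is callable, and the static_assert fires once all
// combinations have been exhausted without success.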
template <typename Op, typename R, typename... Args>
Vc_INTRINSIC void unpackArgumentsAuto(Op op, R &&r, Args &&... args)
{
#ifdef Vc_ICC
const int recursionStart =
Traits::isSimdArray<
typename IccWorkaround<sizeof...(Args), Args...>::type>::value &&
(std::is_same<Op, Common::Operations::Forward_frexp>::value ||
std::is_same<Op, Common::Operations::Forward_ldexp>::value)
? 2
: 0;
#else
const int recursionStart = 0;
#endif
unpackArgumentsAutoImpl<recursionStart>(
int(), make_index_sequence<sizeof...(Args)>(), op, std::forward<R>(r),
std::forward<Args>(args)...);
}
}
}
#endif
#ifndef VC_COMMON_SIMDMASKARRAY_H_
#define VC_COMMON_SIMDMASKARRAY_H_
#include <type_traits>
#include <array>
namespace Vc_VERSIONED_NAMESPACE
{
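// Specialization of SimdMaskArray for the "atomic" case: N equals the width of
// the chosen native mask type, so the whole mask is stored in a single
// mask_type member and every operation forwards to it directly.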
template <typename T, std::size_t N, typename VectorType_>
class SimdMaskArray<T, N, VectorType_, N>
{
public:
using VectorType = VectorType_;
using vector_type = VectorType;
using mask_type = typename vector_type::Mask;
using storage_type = mask_type;
friend storage_type &internal_data(SimdMaskArray &m) { return m.data; }
friend const storage_type &internal_data(const SimdMaskArray &m) { return m.data; }
static constexpr std::size_t size() { return N; }
static constexpr std::size_t Size = size();
static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment;
static_assert(Size == vector_type::Size, "size mismatch");
using vectorentry_type = typename mask_type::VectorEntryType;
using value_type = typename mask_type::EntryType;
using Mask = mask_type;
using VectorEntryType = vectorentry_type;
using EntryType = value_type;
using EntryReference = Vc::Detail::ElementReference<storage_type, SimdMaskArray>;
using reference = EntryReference;
using Vector = fixed_size_simd<T, N>;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
SimdMaskArray() = default;
SimdMaskArray(const SimdMaskArray &) = default;
SimdMaskArray(SimdMaskArray &&) = default;
SimdMaskArray &operator=(const SimdMaskArray &) = default;
SimdMaskArray &operator=(SimdMaskArray &&) = default;
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one) : data(one) {}
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero) : data(zero) {}
Vc_INTRINSIC explicit SimdMaskArray(bool b) : data(b) {}
Vc_INTRINSIC static SimdMaskArray Zero() { return {private_init, storage_type::Zero()}; }
Vc_INTRINSIC static SimdMaskArray One() { return {private_init, storage_type::One()}; }
template <class U, class V, class = enable_if<N == V::Size>>
Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
template <class U, class V, class = enable_if<(N > V::Size && N <= 2 * V::Size)>,
class = U>
Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
template <class U, class V, class = enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>,
class = U, class = U>
Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
template <typename M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC_L SimdMaskArray(
Common::Segment<M, Pieces, Index> &&x,
enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg) Vc_INTRINSIC_R;
template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
!Traits::isSimdMaskArray<M>::value &&
Traits::simd_vector_size<M>::value == Size)>>
Vc_INTRINSIC_L SimdMaskArray(M k) Vc_INTRINSIC_R;
template <class U, class A,
class = enable_if<Vc::Mask<U, A>::Size == N &&
!detail::is_fixed_size_abi<A>::value>>
operator Vc::Mask<U, A>() const
{
return simd_cast<Vc::Mask<U, A>>(data);
}
operator fixed_size_simd_mask<T, N> &()
{
return static_cast<fixed_size_simd_mask<T, N> &>(*this);
}
operator const fixed_size_simd_mask<T, N> &() const
{
return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
}
template <typename Flags = DefaultLoadTag>
Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
: data(mem, f)
{
}
Vc_INTRINSIC void load(const bool *mem) { data.load(mem); }
template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
{
data.load(mem, f);
}
Vc_INTRINSIC void store(bool *mem) const { data.store(mem); }
template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
{
data.store(mem, f);
}
Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &rhs) const
{
return data == rhs.data;
}
Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &rhs) const
{
return data != rhs.data;
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
{
return {private_init, !data};
}
Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
{
data &= rhs.data;
return *this;
}
Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
{
data |= rhs.data;
return *this;
}
Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
{
data ^= rhs.data;
return *this;
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
const SimdMaskArray &rhs) const
{
return {private_init, data & rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
const SimdMaskArray &rhs) const
{
return {private_init, data | rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
const SimdMaskArray &rhs) const
{
return {private_init, data ^ rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
const SimdMaskArray &rhs) const
{
return {private_init, data && rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
const SimdMaskArray &rhs) const
{
return {private_init, data || rhs.data};
}
Vc_INTRINSIC Vc_PURE bool isFull() const { return data.isFull(); }
Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data.isNotEmpty(); }
Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data.isEmpty(); }
Vc_INTRINSIC Vc_PURE bool isMix() const { return data.isMix(); }
Vc_INTRINSIC Vc_PURE int shiftMask() const { return data.shiftMask(); }
Vc_INTRINSIC Vc_PURE int toInt() const { return data.toInt(); }
private:
friend reference;
static Vc_INTRINSIC value_type get(const storage_type &k, int i) noexcept
{
return k[i];
}
template <typename U>
static Vc_INTRINSIC void set(storage_type &k, int i, U &&v) noexcept(
noexcept(std::declval<storage_type &>()[0] = std::declval<U>()))
{
k[i] = std::forward<U>(v);
}
public:
Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
{
return {data, int(index)};
}
Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
{
return data[index];
}
Vc_INTRINSIC Vc_PURE int count() const { return data.count(); }
Vc_INTRINSIC Vc_PURE int firstOne() const { return data.firstOne(); }
template <typename G>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
{
return {private_init, mask_type::generate(gen)};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
{
return {private_init, data.shifted(amount)};
}
template <typename Op, typename... Args>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
{
fixed_size_simd_mask<T, N> r;
Common::unpackArgumentsAuto(op, r.data, std::forward<Args>(args)...);
return r;
}
Vc_INTRINSIC SimdMaskArray(private_init_t, mask_type &&x) : data(std::move(x)) {}
private:
alignas(static_cast<std::size_t>(
Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(VectorType_) /
VectorType_::size()>::value)) storage_type data;
};
template <typename T, std::size_t N, typename VectorType> constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::Size;
template <typename T, std::size_t N, typename VectorType>
constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::MemoryAlignment;
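// Generic SimdMaskArray: N is larger than the widest native mask, so the mask
// is split recursively into two fixed_size halves (data0 with
// Common::left_size<N>() entries, data1 with the remainder) and every
// operation is applied to both halves.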
template <typename T, size_t N, typename V, size_t Wt>
class SimdMaskArray
{
static constexpr std::size_t N0 = Common::left_size<N>();
using Split = Common::Split<N0>;
public:
using storage_type0 = fixed_size_simd_mask<T, N0>;
using storage_type1 = fixed_size_simd_mask<T, N - N0>;
static_assert(storage_type0::size() == N0, "");
using vector_type = fixed_size_simd<T, N>;
friend storage_type0 &internal_data0(SimdMaskArray &m) { return m.data0; }
friend storage_type1 &internal_data1(SimdMaskArray &m) { return m.data1; }
friend const storage_type0 &internal_data0(const SimdMaskArray &m) { return m.data0; }
friend const storage_type1 &internal_data1(const SimdMaskArray &m) { return m.data1; }
using mask_type = SimdMaskArray;
static constexpr std::size_t size() { return N; }
static constexpr std::size_t Size = size();
static constexpr std::size_t MemoryAlignment =
storage_type0::MemoryAlignment > storage_type1::MemoryAlignment
? storage_type0::MemoryAlignment
: storage_type1::MemoryAlignment;
static_assert(Size == vector_type::Size, "size mismatch");
using vectorentry_type = typename storage_type0::VectorEntryType;
using value_type = typename storage_type0::EntryType;
using MaskType = mask_type;
using VectorEntryType = vectorentry_type;
using EntryType = value_type;
using EntryReference = Vc::Detail::ElementReference<SimdMaskArray>;
using reference = EntryReference;
using Vector = fixed_size_simd<T, N>;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
SimdMaskArray() = default;
SimdMaskArray(const SimdMaskArray &) = default;
SimdMaskArray(SimdMaskArray &&) = default;
SimdMaskArray &operator=(const SimdMaskArray &) = default;
SimdMaskArray &operator=(SimdMaskArray &&) = default;
template <typename U, typename W>
Vc_INTRINSIC SimdMaskArray(const SimdMaskArray<U, N, W> &rhs)
: data0(Split::lo(rhs)), data1(Split::hi(rhs))
{
}
template <typename M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC SimdMaskArray(
Common::Segment<M, Pieces, Index> &&rhs,
enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg)
: data0(Split::lo(rhs)), data1(Split::hi(rhs))
{
}
template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
!Traits::isSimdMaskArray<M>::value &&
Traits::simd_vector_size<M>::value == Size)>>
Vc_INTRINSIC SimdMaskArray(M k) : data0(Split::lo(k)), data1(Split::hi(k))
{
}
template <class U, class A,
class = enable_if<Vc::Mask<U, A>::Size == N &&
!detail::is_fixed_size_abi<A>::value>>
operator Vc::Mask<U, A>() const
{
return simd_cast<Vc::Mask<U, A>>(data0, data1);
}
Vc_INTRINSIC operator fixed_size_simd_mask<T, N> &()
{
return static_cast<fixed_size_simd_mask<T, N> &>(*this);
}
Vc_INTRINSIC operator const fixed_size_simd_mask<T, N> &() const
{
return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
}
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one)
: data0(one), data1(one)
{
}
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero)
: data0(zero), data1(zero)
{
}
Vc_INTRINSIC explicit SimdMaskArray(bool b) : data0(b), data1(b) {}
Vc_INTRINSIC static fixed_size_simd_mask<T, N> Zero()
{
return {storage_type0::Zero(), storage_type1::Zero()};
}
Vc_INTRINSIC static fixed_size_simd_mask<T, N> One()
{
return {storage_type0::One(), storage_type1::One()};
}
template <typename Flags = DefaultLoadTag>
Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
: data0(mem, f), data1(mem + storage_type0::size(), f)
{
}
Vc_INTRINSIC void load(const bool *mem)
{
data0.load(mem);
data1.load(mem + storage_type0::size());
}
template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
{
data0.load(mem, f);
data1.load(mem + storage_type0::size(), f);
}
Vc_INTRINSIC void store(bool *mem) const
{
data0.store(mem);
data1.store(mem + storage_type0::size());
}
template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
{
data0.store(mem, f);
data1.store(mem + storage_type0::size(), f);
}
Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &mask) const
{
return data0 == mask.data0 && data1 == mask.data1;
}
Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &mask) const
{
return data0 != mask.data0 || data1 != mask.data1;
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
{
return {!data0, !data1};
}
Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
{
data0 &= rhs.data0;
data1 &= rhs.data1;
return *this;
}
Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
{
data0 |= rhs.data0;
data1 |= rhs.data1;
return *this;
}
Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
{
data0 ^= rhs.data0;
data1 ^= rhs.data1;
return *this;
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
const SimdMaskArray &rhs) const
{
return {data0 & rhs.data0, data1 & rhs.data1};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
const SimdMaskArray &rhs) const
{
return {data0 | rhs.data0, data1 | rhs.data1};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
const SimdMaskArray &rhs) const
{
return {data0 ^ rhs.data0, data1 ^ rhs.data1};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
const SimdMaskArray &rhs) const
{
return {data0 && rhs.data0, data1 && rhs.data1};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
const SimdMaskArray &rhs) const
{
return {data0 || rhs.data0, data1 || rhs.data1};
}
Vc_INTRINSIC Vc_PURE bool isFull() const { return data0.isFull() && data1.isFull(); }
Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data0.isNotEmpty() || data1.isNotEmpty(); }
Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data0.isEmpty() && data1.isEmpty(); }
Vc_INTRINSIC Vc_PURE bool isMix() const { return !isFull() && !isEmpty(); }
Vc_INTRINSIC Vc_PURE int toInt() const
{
return data0.toInt() | (data1.toInt() << data0.size());
}
private:
friend reference;
static Vc_INTRINSIC value_type get(const SimdMaskArray &o, int i) noexcept
{
if (i < int(o.data0.size())) {
return o.data0[i];
} else {
return o.data1[i - o.data0.size()];
}
}
template <typename U>
static Vc_INTRINSIC void set(SimdMaskArray &o, int i, U &&v) noexcept(
noexcept(std::declval<storage_type0 &>()[0] = std::declval<U>()) &&
noexcept(std::declval<storage_type1 &>()[0] = std::declval<U>()))
{
if (i < int(o.data0.size())) {
o.data0[i] = std::forward<U>(v);
} else {
o.data1[i - o.data0.size()] = std::forward<U>(v);
}
}
public:
Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
{
return {*this, int(index)};
}
Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
{
return get(*this, index);
}
Vc_INTRINSIC Vc_PURE int count() const { return data0.count() + data1.count(); }
Vc_INTRINSIC Vc_PURE int firstOne() const {
if (data0.isEmpty()) {
return data1.firstOne() + storage_type0::size();
}
return data0.firstOne();
}
template <typename G>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
{
return {storage_type0::generate(gen),
storage_type1::generate([&](std::size_t i) { return gen(i + N0); })};
}
inline Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
{
if (Vc_IS_UNLIKELY(amount == 0)) {
return *this;
}
return generate([&](unsigned i) {
const unsigned j = i + amount;
return j < size() ? get(*this, j) : false;
});
}
template <typename Op, typename... Args>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
{
fixed_size_simd_mask<T, N> r = {
storage_type0::fromOperation(op, Split::lo(args)...),
storage_type1::fromOperation(op, Split::hi(std::forward<Args>(args))...)};
return r;
}
Vc_INTRINSIC SimdMaskArray(storage_type0 &&x, storage_type1 &&y)
: data0(std::move(x)), data1(std::move(y))
{
}
private:
alignas(static_cast<std::size_t>(
Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(V) /
V::size()>::value)) storage_type0 data0;
storage_type1 data1;
};
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdMaskArray<T, N, V, M>::Size;
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdMaskArray<T, N, V, M>::MemoryAlignment;
}
#ifndef VC_COMMON_SIMD_CAST_CALLER_TCC_
#define VC_COMMON_SIMD_CAST_CALLER_TCC_
namespace Vc_VERSIONED_NAMESPACE {
template <class T, std::size_t N, class VectorType>
template <class U, class V, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
const SimdMaskArray<U, N, V> &x)
: data(simd_cast<mask_type>(internal_data(x)))
{
}
template <class T, std::size_t N, class VectorType>
template <class U, class V, class, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
const SimdMaskArray<U, N, V> &x)
: data(simd_cast<mask_type>(internal_data(internal_data0(x)),
internal_data(internal_data1(x))))
{
}
template <class T, std::size_t N, class VectorType>
template <class U, class V, class, class, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
const SimdMaskArray<U, N, V> &x)
: data(simd_cast<mask_type>(internal_data(internal_data0(internal_data0(x))),
internal_data(internal_data1(internal_data0(x))),
internal_data(internal_data0(internal_data1(x))),
internal_data(internal_data1(internal_data1(x)))))
{
}
template <class T, std::size_t N, class VectorType>
template <class M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
Common::Segment<M, Pieces, Index> &&x,
enable_if<Traits::simd_vector_size<M>::value == Size * Pieces>)
: data(simd_cast<mask_type, Index>(x.data))
{
}
template <class T, std::size_t N, class VectorType>
template <class M, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(M k)
: data(simd_cast<mask_type>(k))
{
}
}
#endif
#endif
#ifndef VC_COMMON_INTERLEAVE_H_
#define VC_COMMON_INTERLEAVE_H_
namespace Vc_VERSIONED_NAMESPACE
{
template <typename V, typename = enable_if<Traits::is_simd_vector<V>::value>>
std::pair<V, V> interleave(const V &a, const V &b)
{
return {a.interleaveLow(b), a.interleaveHigh(b)};
}
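// Usage sketch (element order assuming a 4-wide vector such as SSE float_v):
//   std::pair<V, V> p = interleave(a, b);
//   // p.first  == [a0 b0 a1 b1], p.second == [a2 b2 a3 b3]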
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
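// select_best_vector_type<T, N> walks the candidate list from the widest
// enabled implementation (AVX2/AVX, then SSE, then Scalar) and picks the first
// vector type whose Size does not exceed N; it provides the default third
// template parameter of SimdArray and SimdMaskArray.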
template <std::size_t N, class... Candidates> struct select_best_vector_type_impl;
template <std::size_t N, class T> struct select_best_vector_type_impl<N, T> {
using type = T;
};
template <std::size_t N, class T, class... Candidates>
struct select_best_vector_type_impl<N, T, Candidates...> {
using type = typename std::conditional<
(N < T::Size), typename select_best_vector_type_impl<N, Candidates...>::type,
T>::type;
};
template <class T, std::size_t N>
struct select_best_vector_type : select_best_vector_type_impl<N,
#ifdef Vc_IMPL_AVX2
Vc::AVX2::Vector<T>,
#elif defined Vc_IMPL_AVX
Vc::AVX::Vector<T>,
#endif
#ifdef Vc_IMPL_SSE
Vc::SSE::Vector<T>,
#endif
Vc::Scalar::Vector<T>> {
};
}
namespace internal
{
template <typename T> T Vc_INTRINSIC Vc_PURE product_helper_(const T &l, const T &r) { return l * r; }
template <typename T> T Vc_INTRINSIC Vc_PURE sum_helper_(const T &l, const T &r) { return l + r; }
}
template <typename T, std::size_t N, typename V, std::size_t M>
inline fixed_size_simd<T, N> min(const SimdArray<T, N, V, M> &x,
const SimdArray<T, N, V, M> &y);
template <typename T, std::size_t N, typename V, std::size_t M>
inline fixed_size_simd<T, N> max(const SimdArray<T, N, V, M> &x,
const SimdArray<T, N, V, M> &y);
#define Vc_CURRENT_CLASS_NAME SimdArray
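// "Atomic" SimdArray specialization: N matches the size of the selected native
// vector type exactly, so the array is a thin wrapper around a single
// storage_type member and simply forwards every operation to it.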
template <typename T, std::size_t N, typename VectorType_>
class SimdArray<T, N, VectorType_, N>
{
static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value ||
std::is_same<T, int32_t>::value ||
std::is_same<T, uint32_t>::value ||
std::is_same<T, int16_t>::value ||
std::is_same<T, uint16_t>::value,
"SimdArray<T, N> may only be used with T = { double, float, int32_t, uint32_t, "
"int16_t, uint16_t }");
static_assert(
std::is_same<VectorType_,
typename Common::select_best_vector_type<T, N>::type>::value &&
VectorType_::size() == N,
"ERROR: leave the third and fourth template parameters with their defaults. They "
"are implementation details.");
public:
static constexpr bool is_atomic = true;
using VectorType = VectorType_;
using vector_type = VectorType;
using storage_type = vector_type;
using vectorentry_type = typename vector_type::VectorEntryType;
using value_type = T;
using mask_type = fixed_size_simd_mask<T, N>;
using index_type = fixed_size_simd<int, N>;
static constexpr std::size_t size() { return N; }
using Mask = mask_type;
using MaskType = Mask;
using MaskArgument = const MaskType &;
using VectorEntryType = vectorentry_type;
using EntryType = value_type;
using IndexType = index_type;
using AsArg = const SimdArray &;
using reference = Detail::ElementReference<SimdArray>;
static constexpr std::size_t Size = size();
static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment;
Vc_INTRINSIC SimdArray() = default;
Vc_INTRINSIC SimdArray(const SimdArray &) = default;
Vc_INTRINSIC SimdArray(SimdArray &&) = default;
Vc_INTRINSIC SimdArray &operator=(const SimdArray &) = default;
Vc_INTRINSIC SimdArray(const value_type &a) : data(a) {}
Vc_INTRINSIC SimdArray(value_type &a) : data(a) {}
Vc_INTRINSIC SimdArray(value_type &&a) : data(a) {}
template <
typename U,
typename = enable_if<std::is_same<U, int>::value && !std::is_same<int, value_type>::value>>
Vc_INTRINSIC SimdArray(U a)
: SimdArray(static_cast<value_type>(a))
{
}
template <class U, class V, class = enable_if<N == V::Size>>
Vc_INTRINSIC SimdArray(const SimdArray<U, N, V> &x)
: data(simd_cast<vector_type>(internal_data(x)))
{
}
template <class U, class V, class = enable_if<(N > V::Size && N <= 2 * V::Size)>,
class = U>
Vc_INTRINSIC SimdArray(const SimdArray<U, N, V> &x)
: data(simd_cast<vector_type>(internal_data(internal_data0(x)),
internal_data(internal_data1(x))))
{
}
template <class U, class V, class = enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>,
class = U, class = U>
Vc_INTRINSIC SimdArray(const SimdArray<U, N, V> &x)
: data(simd_cast<vector_type>(internal_data(internal_data0(internal_data0(x))),
internal_data(internal_data1(internal_data0(x))),
internal_data(internal_data0(internal_data1(x))),
internal_data(internal_data1(internal_data1(x)))))
{
}
template <typename V, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC SimdArray(Common::Segment<V, Pieces, Index> &&x)
: data(simd_cast<vector_type, Index>(x.data))
{
}
Vc_INTRINSIC SimdArray(const std::initializer_list<value_type> &init)
: data(init.begin(), Vc::Unaligned)
{
Vc_ASSERT(init.size() == size());
}
template <
typename V,
typename = enable_if<Traits::is_simd_vector<V>::value && !Traits::isSimdArray<V>::value>>
Vc_INTRINSIC SimdArray(const V &x)
: data(simd_cast<vector_type>(x))
{
}
template <typename U, typename A,
typename =
enable_if<std::is_convertible<T, U>::value && Vector<U, A>::Size == N &&
!std::is_same<A, simd_abi::fixed_size<N>>::value>>
Vc_INTRINSIC operator Vector<U, A>() const
{
return simd_cast<Vector<U, A>>(data);
}
operator fixed_size_simd<T, N> &()
{
return static_cast<fixed_size_simd<T, N> &>(*this);
}
operator const fixed_size_simd<T, N> &() const
{
return static_cast<const fixed_size_simd<T, N> &>(*this);
}
#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif
private:
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
MaskArgument mask);
public:
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<MT, EntryType>::value, \
"The memory pointer needs to point to a type that can be converted to the " \
"EntryType of this SIMD vector type."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT, typename IT,
typename = enable_if<Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
private:
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes) const;
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
public:
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<EntryType, MT>::value, \
"The memory pointer needs to point to a type that the EntryType of this " \
"SIMD vector type can be converted to."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes));
}
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes), mask);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
{
scatter(args.address, args.indexes);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
{
scatter(args.address, args.indexes, mask);
}
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerZero) : data() {}
explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerOne o) : data(o) {}
explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerIndexesFromZero i) : data(i)
{
}
template <std::size_t Offset>
explicit Vc_INTRINSIC SimdArray(
Common::AddOffset<VectorSpecialInitializerIndexesFromZero, Offset>)
: data(Vc::IndexesFromZero)
{
data += value_type(Offset);
}
Vc_INTRINSIC void setZero() { data.setZero(); }
Vc_INTRINSIC void setZero(mask_type k) { data.setZero(internal_data(k)); }
Vc_INTRINSIC void setZeroInverted() { data.setZeroInverted(); }
Vc_INTRINSIC void setZeroInverted(mask_type k) { data.setZeroInverted(internal_data(k)); }
Vc_INTRINSIC void setQnan() { data.setQnan(); }
Vc_INTRINSIC void setQnan(mask_type m) { data.setQnan(internal_data(m)); }
template <typename Op, typename... Args>
static Vc_INTRINSIC fixed_size_simd<T, N> fromOperation(Op op, Args &&... args)
{
fixed_size_simd<T, N> r;
Common::unpackArgumentsAuto(op, r.data, std::forward<Args>(args)...);
return r;
}
template <typename Op, typename... Args>
static Vc_INTRINSIC void callOperation(Op op, Args &&... args)
{
Common::unpackArgumentsAuto(op, nullptr, std::forward<Args>(args)...);
}
static Vc_INTRINSIC fixed_size_simd<T, N> Zero()
{
return SimdArray(Vc::Zero);
}
static Vc_INTRINSIC fixed_size_simd<T, N> One()
{
return SimdArray(Vc::One);
}
static Vc_INTRINSIC fixed_size_simd<T, N> IndexesFromZero()
{
return SimdArray(Vc::IndexesFromZero);
}
static Vc_INTRINSIC fixed_size_simd<T, N> Random()
{
return fromOperation(Common::Operations::random());
}
template <class U, class Flags = DefaultLoadTag,
class = enable_if<std::is_arithmetic<U>::value &&
Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = {}) : data(mem, f)
{
}
template <typename... Args> Vc_INTRINSIC void load(Args &&... args)
{
data.load(std::forward<Args>(args)...);
}
template <typename... Args> Vc_INTRINSIC void store(Args &&... args) const
{
data.store(std::forward<Args>(args)...);
}
Vc_INTRINSIC mask_type operator!() const
{
return {private_init, !data};
}
Vc_INTRINSIC fixed_size_simd<T, N> operator-() const
{
return {private_init, -data};
}
Vc_INTRINSIC fixed_size_simd<T, N> operator+() const { return *this; }
Vc_INTRINSIC fixed_size_simd<T, N> operator~() const
{
return {private_init, ~data};
}
template <typename U,
typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator<<(U x) const
{
return {private_init, data << x};
}
template <typename U,
typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
Vc_INTRINSIC fixed_size_simd<T, N> &operator<<=(U x)
{
data <<= x;
return *this;
}
template <typename U,
typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator>>(U x) const
{
return {private_init, data >> x};
}
template <typename U,
typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
Vc_INTRINSIC fixed_size_simd<T, N> &operator>>=(U x)
{
data >>= x;
return *this;
}
#define Vc_BINARY_OPERATOR_(op) \
Vc_INTRINSIC fixed_size_simd<T, N> &operator op##=(const SimdArray &rhs) \
{ \
data op## = rhs.data; \
return *this; \
}
Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_);
Vc_ALL_BINARY(Vc_BINARY_OPERATOR_);
Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_);
#undef Vc_BINARY_OPERATOR_
Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const
{
return {private_init, isnegative(data)};
}
private:
friend reference;
Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept
{
return o.data[i];
}
template <typename U>
Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept(
noexcept(std::declval<value_type &>() = v))
{
o.data[i] = v;
}
public:
Vc_INTRINSIC reference operator[](size_t i) noexcept
{
static_assert(noexcept(reference{std::declval<SimdArray &>(), int()}), "");
return {*this, int(i)};
}
Vc_INTRINSIC value_type operator[](size_t i) const noexcept
{
return get(*this, int(i));
}
Vc_INTRINSIC Common::WriteMaskedVector<SimdArray, mask_type> operator()(const mask_type &k)
{
return {*this, k};
}
Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k)
{
data.assign(v.data, internal_data(k));
}
#define Vc_REDUCTION_FUNCTION_(name_) \
Vc_INTRINSIC Vc_PURE value_type name_() const { return data.name_(); } \
Vc_INTRINSIC Vc_PURE value_type name_(mask_type mask) const \
{ \
return data.name_(internal_data(mask)); \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_REDUCTION_FUNCTION_(min);
Vc_REDUCTION_FUNCTION_(max);
Vc_REDUCTION_FUNCTION_(product);
Vc_REDUCTION_FUNCTION_(sum);
#undef Vc_REDUCTION_FUNCTION_
Vc_INTRINSIC Vc_PURE fixed_size_simd<T, N> partialSum() const
{
return {private_init, data.partialSum()};
}
template <typename F> Vc_INTRINSIC fixed_size_simd<T, N> apply(F &&f) const
{
return {private_init, data.apply(std::forward<F>(f))};
}
template <typename F> Vc_INTRINSIC fixed_size_simd<T, N> apply(F &&f, const mask_type &k) const
{
return {private_init, data.apply(std::forward<F>(f), k)};
}
Vc_INTRINSIC fixed_size_simd<T, N> shifted(int amount) const
{
return {private_init, data.shifted(amount)};
}
template <std::size_t NN>
Vc_INTRINSIC fixed_size_simd<T, N> shifted(int amount, const SimdArray<value_type, NN> &shiftIn)
const
{
return {private_init, data.shifted(amount, simd_cast<VectorType>(shiftIn))};
}
Vc_INTRINSIC fixed_size_simd<T, N> rotated(int amount) const
{
return {private_init, data.rotated(amount)};
}
Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC fixed_size_simd<T, N> exponent() const
{
return {private_init, exponent(data)};
}
Vc_INTRINSIC fixed_size_simd<T, N> interleaveLow(SimdArray x) const
{
return {private_init, data.interleaveLow(x.data)};
}
Vc_INTRINSIC fixed_size_simd<T, N> interleaveHigh(SimdArray x) const
{
return {private_init, data.interleaveHigh(x.data)};
}
Vc_INTRINSIC fixed_size_simd<T, N> reversed() const
{
return {private_init, data.reversed()};
}
Vc_INTRINSIC fixed_size_simd<T, N> sorted() const
{
return {private_init, data.sorted()};
}
template <class G, class = decltype(std::declval<G>()(std::size_t())),
class = enable_if<!Traits::is_simd_vector<G>::value>>
Vc_INTRINSIC SimdArray(const G &gen) : data(gen)
{
}
template <typename G> static Vc_INTRINSIC fixed_size_simd<T, N> generate(const G &gen)
{
return {private_init, VectorType::generate(gen)};
}
Vc_DEPRECATED("use copysign(x, y) instead")
Vc_INTRINSIC fixed_size_simd<T, N> copySign(const SimdArray &x) const
{
return {private_init, Vc::copysign(data, x.data)};
}
friend VectorType &internal_data<>(SimdArray &x);
friend const VectorType &internal_data<>(const SimdArray &x);
Vc_INTRINSIC SimdArray(private_init_t, VectorType &&x) : data(std::move(x)) {}
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type));
private:
alignas(static_cast<std::size_t>(
Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(VectorType_) /
VectorType_::size()>::value)) storage_type data;
};
template <typename T, std::size_t N, typename VectorType> constexpr std::size_t SimdArray<T, N, VectorType, N>::Size;
template <typename T, std::size_t N, typename VectorType>
constexpr std::size_t SimdArray<T, N, VectorType, N>::MemoryAlignment;
template <typename T, std::size_t N, typename VectorType>
#ifndef Vc_MSVC
Vc_INTRINSIC
#endif
VectorType &internal_data(SimdArray<T, N, VectorType, N> &x)
{
return x.data;
}
template <typename T, std::size_t N, typename VectorType>
#ifndef Vc_MSVC
Vc_INTRINSIC
#endif
const VectorType &internal_data(const SimdArray<T, N, VectorType, N> &x)
{
return x.data;
}
template <class T> Vc_INTRINSIC T unwrap(const T &x) { return x; }
template <class T, size_t N, class V>
Vc_INTRINSIC V unwrap(const SimdArray<T, N, V, N> &x)
{
return internal_data(x);
}
template <class T, size_t Pieces, size_t Index>
Vc_INTRINSIC auto unwrap(const Common::Segment<T, Pieces, Index> &x)
-> decltype(x.to_fixed_size())
{
return unwrap(x.to_fixed_size());
}
template <typename T, std::size_t N, typename VectorType>
template <class MT, class IT, int Scale>
Vc_INTRINSIC void SimdArray<T, N, VectorType, N>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args)
{
data.gather(Common::make_gather<Scale>(args.address, unwrap(args.indexes)));
}
template <typename T, std::size_t N, typename VectorType>
template <class MT, class IT, int Scale>
Vc_INTRINSIC void SimdArray<T, N, VectorType, N>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
data.gather(Common::make_gather<Scale>(args.address, unwrap(args.indexes)),
mask);
}
template <typename T, std::size_t N, typename VectorType>
template <typename MT, typename IT>
inline void SimdArray<T, N, VectorType, N>::scatterImplementation(MT *mem,
IT &&indexes) const
{
data.scatter(mem, unwrap(std::forward<IT>(indexes)));
}
template <typename T, std::size_t N, typename VectorType>
template <typename MT, typename IT>
inline void SimdArray<T, N, VectorType, N>::scatterImplementation(MT *mem,
IT &&indexes,
MaskArgument mask) const
{
data.scatter(mem, unwrap(std::forward<IT>(indexes)), mask);
}
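// Generic SimdArray: N does not match a single native vector, so the data is
// stored as two smaller SimdArrays (data0/data1, split at N0) and each
// operation is applied to both halves, mirroring the SimdMaskArray above.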
template <typename T, size_t N, typename V, size_t Wt> class SimdArray
{
static_assert(std::is_same<T, double>::value ||
std::is_same<T, float>::value ||
std::is_same<T, int32_t>::value ||
std::is_same<T, uint32_t>::value ||
std::is_same<T, int16_t>::value ||
std::is_same<T, uint16_t>::value, "SimdArray<T, N> may only be used with T = { double, float, int32_t, uint32_t, int16_t, uint16_t }");
static_assert(
std::is_same<V, typename Common::select_best_vector_type<T, N>::type>::value &&
V::size() == Wt,
"ERROR: leave the third and fourth template parameters with their defaults. They "
"are implementation details.");
static_assert(
std::is_same<typename V::EntryType, typename V::VectorEntryType>::value ||
(N % V::size() == 0),
"SimdArray<(un)signed short, N> on MIC only works correctly for N = k * "
"MIC::(u)short_v::size(), i.e. k * 16.");
using my_traits = SimdArrayTraits<T, N>;
static constexpr std::size_t N0 = my_traits::N0;
static constexpr std::size_t N1 = my_traits::N1;
using Split = Common::Split<N0>;
template <typename U, std::size_t K> using CArray = U[K];
public:
static constexpr bool is_atomic = false;
using storage_type0 = typename my_traits::storage_type0;
using storage_type1 = typename my_traits::storage_type1;
static_assert(storage_type0::size() == N0, "");
using vector_type = V;
using vectorentry_type = typename storage_type0::vectorentry_type;
typedef vectorentry_type alias_type Vc_MAY_ALIAS;
using value_type = T;
using mask_type = fixed_size_simd_mask<T, N>;
using index_type = fixed_size_simd<int, N>;
static constexpr std::size_t size() { return N; }
using Mask = mask_type;
using MaskType = Mask;
using MaskArgument = const MaskType &;
using VectorEntryType = vectorentry_type;
using EntryType = value_type;
using IndexType = index_type;
using AsArg = const SimdArray &;
using reference = Detail::ElementReference<SimdArray>;
static constexpr std::size_t MemoryAlignment =
storage_type0::MemoryAlignment > storage_type1::MemoryAlignment
? storage_type0::MemoryAlignment
: storage_type1::MemoryAlignment;
static Vc_INTRINSIC fixed_size_simd<T, N> Zero()
{
return SimdArray(Vc::Zero);
}
static Vc_INTRINSIC fixed_size_simd<T, N> One()
{
return SimdArray(Vc::One);
}
static Vc_INTRINSIC fixed_size_simd<T, N> IndexesFromZero()
{
return SimdArray(Vc::IndexesFromZero);
}
static Vc_INTRINSIC fixed_size_simd<T, N> Random()
{
return fromOperation(Common::Operations::random());
}
template <class G, class = decltype(std::declval<G>()(std::size_t())),
class = enable_if<!Traits::is_simd_vector<G>::value>>
Vc_INTRINSIC SimdArray(const G &gen)
: data0(gen), data1([&](std::size_t i) { return gen(i + storage_type0::size()); })
{
}
template <typename G> static Vc_INTRINSIC fixed_size_simd<T, N> generate(const G &gen)
{
auto tmp = storage_type0::generate(gen);
return {std::move(tmp),
storage_type1::generate([&](std::size_t i) { return gen(i + N0); })};
}
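// Illustrative sketch (not part of the upstream header): the generator constructor and
// generate() fill each lane from a callable that receives the lane index, e.g.
//   auto recip = Vc::SimdArray<float, 8>::generate(
//       [](std::size_t i) { return 1.f / float(i + 1); });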
SimdArray() = default;
Vc_INTRINSIC SimdArray(value_type a) : data0(a), data1(a) {}
template <
typename U,
typename = enable_if<std::is_same<U, int>::value && !std::is_same<int, value_type>::value>>
SimdArray(U a)
: SimdArray(static_cast<value_type>(a))
{
}
SimdArray(const SimdArray &) = default;
SimdArray(SimdArray &&) = default;
SimdArray &operator=(const SimdArray &) = default;
template <typename U, typename Flags = DefaultLoadTag,
typename = enable_if<std::is_arithmetic<U>::value &&
Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC SimdArray(const U *mem, Flags f = {})
: data0(mem, f), data1(mem + storage_type0::size(), f)
{
}
#ifndef Vc_MSVC
template <typename U, std::size_t Extent, typename Flags = DefaultLoadTag,
typename = enable_if<std::is_arithmetic<U>::value &&
Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC SimdArray(CArray<U, Extent> &mem, Flags f = {})
: data0(&mem[0], f), data1(&mem[storage_type0::size()], f)
{
}
template <typename U, std::size_t Extent, typename Flags = DefaultLoadTag,
typename = enable_if<std::is_arithmetic<U>::value &&
Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC SimdArray(const CArray<U, Extent> &mem, Flags f = {})
: data0(&mem[0], f), data1(&mem[storage_type0::size()], f)
{
}
#endif
Vc_INTRINSIC SimdArray(const std::initializer_list<value_type> &init)
: data0(init.begin(), Vc::Unaligned)
, data1(init.begin() + storage_type0::size(), Vc::Unaligned)
{
Vc_ASSERT(init.size() == size());
}
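// Illustrative sketch (not part of the upstream header): construction from memory takes the
// usual load/store flags, and an initializer_list must provide exactly size() values:
//   alignas(Vc::SimdArray<float, 8>::MemoryAlignment) float buf[8] = {};
//   Vc::SimdArray<float, 8> a(buf, Vc::Aligned);
//   Vc::SimdArray<float, 8> b = {0, 1, 2, 3, 4, 5, 6, 7};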
#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif
private:
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
MaskArgument mask);
public:
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<MT, EntryType>::value, \
"The memory pointer needs to point to a type that can be converted to the " \
"EntryType of this SIMD vector type."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT, typename IT,
typename = enable_if<Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_
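// Illustrative sketch (not part of the upstream header; "table" and "indexes" are
// hypothetical names): gather() reads scattered elements through an index vector,
// optionally masked:
//   Vc::SimdArray<float, 8> v;
//   v.gather(table, indexes);        // v[i] = table[indexes[i]]
//   v.gather(table, indexes, mask);  // only lanes where the mask is true are loaded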
private:
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes) const;
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
public:
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<EntryType, MT>::value, \
"The memory pointer needs to point to a type that the EntryType of this " \
"SIMD vector type can be converted to."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes));
}
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes), mask);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
{
scatter(args.address, args.indexes);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
{
scatter(args.address, args.indexes, mask);
}
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
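// Illustrative sketch (not part of the upstream header): scatter() is the inverse of gather():
//   v.scatter(table, indexes);        // table[indexes[i]] = v[i]
//   v.scatter(table, indexes, mask);  // only lanes where the mask is true are written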
explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerZero) : data0(), data1() {}
explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerOne o) : data0(o), data1(o) {}
explicit Vc_INTRINSIC SimdArray(VectorSpecialInitializerIndexesFromZero i)
: data0(i)
, data1(Common::AddOffset<VectorSpecialInitializerIndexesFromZero,
storage_type0::size()>())
{
}
template <size_t Offset>
explicit Vc_INTRINSIC SimdArray(
Common::AddOffset<VectorSpecialInitializerIndexesFromZero, Offset> i)
: data0(i)
, data1(Common::AddOffset<VectorSpecialInitializerIndexesFromZero,
storage_type0::size() + Offset>())
{
}
template <class W, class = enable_if<
(Traits::is_simd_vector<W>::value &&
Traits::simd_vector_size<W>::value == N &&
!(std::is_convertible<Traits::entry_type_of<W>, T>::value &&
Traits::isSimdArray<W>::value))>>
Vc_INTRINSIC explicit SimdArray(W &&x) : data0(Split::lo(x)), data1(Split::hi(x))
{
}
template <class W, class = enable_if<
(Traits::isSimdArray<W>::value &&
Traits::simd_vector_size<W>::value == N &&
std::is_convertible<Traits::entry_type_of<W>, T>::value)>,
class = W>
Vc_INTRINSIC SimdArray(W &&x) : data0(Split::lo(x)), data1(Split::hi(x))
{
}
template <class W, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC SimdArray(Common::Segment<W, Pieces, Index> &&x)
: data0(Common::Segment<W, 2 * Pieces, 2 * Index>{x.data})
, data1(Common::Segment<W, 2 * Pieces, 2 * Index + 1>{x.data})
{
}
template <typename U, typename A,
typename =
enable_if<std::is_convertible<T, U>::value && Vector<U, A>::Size == N &&
!std::is_same<A, simd_abi::fixed_size<N>>::value>>
operator Vector<U, A>() const
{
auto r = simd_cast<Vector<U, A>>(data0, data1);
return r;
}
Vc_INTRINSIC operator fixed_size_simd<T, N> &()
{
return static_cast<fixed_size_simd<T, N> &>(*this);
}
Vc_INTRINSIC operator const fixed_size_simd<T, N> &() const
{
return static_cast<const fixed_size_simd<T, N> &>(*this);
}
Vc_INTRINSIC void setZero()
{
data0.setZero();
data1.setZero();
}
Vc_INTRINSIC void setZero(const mask_type &k)
{
data0.setZero(Split::lo(k));
data1.setZero(Split::hi(k));
}
Vc_INTRINSIC void setZeroInverted()
{
data0.setZeroInverted();
data1.setZeroInverted();
}
Vc_INTRINSIC void setZeroInverted(const mask_type &k)
{
data0.setZeroInverted(Split::lo(k));
data1.setZeroInverted(Split::hi(k));
}
Vc_INTRINSIC void setQnan() {
data0.setQnan();
data1.setQnan();
}
Vc_INTRINSIC void setQnan(const mask_type &m) {
data0.setQnan(Split::lo(m));
data1.setQnan(Split::hi(m));
}
template <typename Op, typename... Args>
static Vc_INTRINSIC fixed_size_simd<T, N> fromOperation(Op op, Args &&... args)
{
fixed_size_simd<T, N> r = {
storage_type0::fromOperation(op, Split::lo(args)...),
storage_type1::fromOperation(op, Split::hi(std::forward<Args>(args))...)};
return r;
}
template <typename Op, typename... Args>
static Vc_INTRINSIC void callOperation(Op op, Args &&... args)
{
storage_type0::callOperation(op, Split::lo(args)...);
storage_type1::callOperation(op, Split::hi(std::forward<Args>(args))...);
}
template <typename U, typename... Args> Vc_INTRINSIC void load(const U *mem, Args &&... args)
{
data0.load(mem, Split::lo(args)...);
data1.load(mem + storage_type0::size(), Split::hi(std::forward<Args>(args))...);
}
template <typename U, typename... Args> Vc_INTRINSIC void store(U *mem, Args &&... args) const
{
data0.store(mem, Split::lo(args)...);
data1.store(mem + storage_type0::size(), Split::hi(std::forward<Args>(args))...);
}
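// load()/store() forward to the two halves; data1 is loaded/stored at an offset of
// storage_type0::size() elements, and the remaining arguments (e.g. Vc::Aligned,
// Vc::Unaligned) are split with Common::Split.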
Vc_INTRINSIC mask_type operator!() const
{
return {!data0, !data1};
}
Vc_INTRINSIC fixed_size_simd<T, N> operator-() const
{
return {-data0, -data1};
}
Vc_INTRINSIC fixed_size_simd<T, N> operator+() const { return *this; }
Vc_INTRINSIC fixed_size_simd<T, N> operator~() const
{
return {~data0, ~data1};
}
template <typename U,
typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator<<(U x) const
{
return {data0 << x, data1 << x};
}
template <typename U,
typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
Vc_INTRINSIC fixed_size_simd<T, N> &operator<<=(U x)
{
data0 <<= x;
data1 <<= x;
return *this;
}
template <typename U,
typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
Vc_INTRINSIC Vc_CONST fixed_size_simd<T, N> operator>>(U x) const
{
return {data0 >> x, data1 >> x};
}
template <typename U,
typename = enable_if<std::is_integral<T>::value && std::is_integral<U>::value>>
Vc_INTRINSIC fixed_size_simd<T, N> &operator>>=(U x)
{
data0 >>= x;
data1 >>= x;
return *this;
}
#define Vc_BINARY_OPERATOR_(op) \
Vc_INTRINSIC fixed_size_simd<T, N> &operator op##=(const SimdArray &rhs) \
{ \
data0 op## = rhs.data0; \
data1 op## = rhs.data1; \
return *this; \
}
Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATOR_);
Vc_ALL_BINARY(Vc_BINARY_OPERATOR_);
Vc_ALL_SHIFTS(Vc_BINARY_OPERATOR_);
#undef Vc_BINARY_OPERATOR_
private:
friend reference;
Vc_INTRINSIC static value_type get(const SimdArray &o, int i) noexcept
{
return reinterpret_cast<const alias_type *>(&o)[i];
}
template <typename U>
Vc_INTRINSIC static void set(SimdArray &o, int i, U &&v) noexcept(
noexcept(std::declval<value_type &>() = v))
{
reinterpret_cast<alias_type *>(&o)[i] = v;
}
public:
Vc_INTRINSIC reference operator[](size_t i) noexcept
{
static_assert(noexcept(reference{std::declval<SimdArray &>(), int()}), "");
return {*this, int(i)};
}
Vc_INTRINSIC value_type operator[](size_t index) const noexcept
{
return get(*this, int(index));
}
Vc_INTRINSIC Common::WriteMaskedVector<SimdArray, mask_type> operator()(
const mask_type &mask)
{
return {*this, mask};
}
Vc_INTRINSIC void assign(const SimdArray &v, const mask_type &k)
{
data0.assign(v.data0, internal_data0(k));
data1.assign(v.data1, internal_data1(k));
}
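// Illustrative sketch (not part of the upstream header): operator()(mask) returns a
// write-masked view and assign() copies under a mask, so only the selected lanes change:
//   Vc::SimdArray<float, 8> v(-1.f);
//   v(v < 0.f) = 0.f;  // lanes where the mask is false keep their old value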
#define Vc_REDUCTION_FUNCTION_(name_,binary_fun_,scalar_fun_) \
private: \
template <typename ForSfinae = void> \
Vc_INTRINSIC enable_if<std::is_same<ForSfinae, void>::value && \
storage_type0::Size == storage_type1::Size, \
value_type> name_##_impl() const \
{ \
return binary_fun_(data0, data1).name_(); \
} \
\
template <typename ForSfinae = void> \
Vc_INTRINSIC enable_if<std::is_same<ForSfinae, void>::value && \
storage_type0::Size != storage_type1::Size, \
value_type> name_##_impl() const \
{ \
return scalar_fun_(data0.name_(), data1.name_()); \
} \
\
public: \
\
Vc_INTRINSIC value_type name_() const { return name_##_impl(); } \
\
Vc_INTRINSIC value_type name_(const mask_type &mask) const \
{ \
if (Vc_IS_UNLIKELY(Split::lo(mask).isEmpty())) { \
return data1.name_(Split::hi(mask)); \
} else if (Vc_IS_UNLIKELY(Split::hi(mask).isEmpty())) { \
return data0.name_(Split::lo(mask)); \
} else { \
return scalar_fun_(data0.name_(Split::lo(mask)), \
data1.name_(Split::hi(mask))); \
} \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_REDUCTION_FUNCTION_(min, Vc::min, std::min);
Vc_REDUCTION_FUNCTION_(max, Vc::max, std::max);
Vc_REDUCTION_FUNCTION_(product, internal::product_helper_, internal::product_helper_);
Vc_REDUCTION_FUNCTION_(sum, internal::sum_helper_, internal::sum_helper_);
#undef Vc_REDUCTION_FUNCTION_
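// The reductions above (min, max, product, sum) combine the two halves with the binary
// operation when both halves have equal width and otherwise reduce each half to a scalar
// first; the masked overloads skip a half whose part of the mask is empty. partialSum()
// below is an inclusive prefix sum: the last element of data0's prefix sum is carried
// into data1 before its own prefix sum is taken.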
Vc_INTRINSIC Vc_PURE fixed_size_simd<T, N> partialSum() const
{
auto ps0 = data0.partialSum();
auto tmp = data1;
tmp[0] += ps0[data0.size() - 1];
return {std::move(ps0), tmp.partialSum()};
}
template <typename F> inline fixed_size_simd<T, N> apply(F &&f) const
{
return {data0.apply(f), data1.apply(f)};
}
template <typename F>
inline fixed_size_simd<T, N> apply(F &&f, const mask_type &k) const
{
return {data0.apply(f, Split::lo(k)), data1.apply(f, Split::hi(k))};
}
inline fixed_size_simd<T, N> shifted(int amount) const
{
constexpr int SSize = Size;
constexpr int SSize0 = storage_type0::Size;
constexpr int SSize1 = storage_type1::Size;
if (amount == 0) {
return *this;
}
if (amount < 0) {
if (amount > -SSize0) {
return {data0.shifted(amount), data1.shifted(amount, data0)};
}
if (amount == -SSize0) {
return {storage_type0(0), simd_cast<storage_type1>(data0)};
}
if (amount < -SSize0) {
return {storage_type0(0), simd_cast<storage_type1>(data0.shifted(
amount + SSize0))};
}
return Zero();
} else {
if (amount >= SSize) {
return Zero();
} else if (amount >= SSize0) {
return {
simd_cast<storage_type0>(data1).shifted(amount - SSize0),
storage_type1(0)};
} else if (amount >= SSize1) {
return {data0.shifted(amount, data1), storage_type1(0)};
} else {
return {data0.shifted(amount, data1), data1.shifted(amount)};
}
}
}
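// Illustrative sketch (not part of the upstream header): shifted(amount) moves lanes towards
// index 0 for positive amounts and shifts in zeros (the overloads below shift in values from
// a second SimdArray instead), e.g. with v = {0, 1, 2, 3, 4, 5, 6, 7}:
//   v.shifted(2)  == {2, 3, 4, 5, 6, 7, 0, 0}
//   v.shifted(-2) == {0, 0, 0, 1, 2, 3, 4, 5}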
template <std::size_t NN>
inline enable_if<
!(std::is_same<storage_type0, storage_type1>::value &&
N == NN),
fixed_size_simd<T, N>>
shifted(int amount, const SimdArray<value_type, NN> &shiftIn) const
{
constexpr int SSize = Size;
if (amount < 0) {
return fixed_size_simd<T, N>([&](int i) -> value_type {
i += amount;
if (i >= 0) {
return operator[](i);
} else if (i >= -SSize) {
return shiftIn[i + SSize];
}
return 0;
});
}
return fixed_size_simd<T, N>([&](int i) -> value_type {
i += amount;
if (i < SSize) {
return operator[](i);
} else if (i < 2 * SSize) {
return shiftIn[i - SSize];
}
return 0;
});
}
private:
template <std::size_t NN> struct bisectable_shift
: public std::integral_constant<bool,
std::is_same<storage_type0, storage_type1>::value &&
N == NN>
{
};
public:
template <std::size_t NN>
inline fixed_size_simd<T, N> shifted(
enable_if<bisectable_shift<NN>::value, int> amount,
const SimdArray<value_type, NN> &shiftIn) const
{
constexpr int SSize = Size;
if (amount < 0) {
if (amount > -static_cast<int>(storage_type0::Size)) {
return {data0.shifted(amount, internal_data1(shiftIn)),
data1.shifted(amount, data0)};
}
if (amount == -static_cast<int>(storage_type0::Size)) {
return {storage_type0(internal_data1(shiftIn)), storage_type1(data0)};
}
if (amount > -SSize) {
return {
internal_data1(shiftIn)
.shifted(amount + static_cast<int>(storage_type0::Size), internal_data0(shiftIn)),
data0.shifted(amount + static_cast<int>(storage_type0::Size), internal_data1(shiftIn))};
}
if (amount == -SSize) {
return shiftIn;
}
if (amount > -2 * SSize) {
return shiftIn.shifted(amount + SSize);
}
}
if (amount == 0) {
return *this;
}
if (amount < static_cast<int>(storage_type0::Size)) {
return {data0.shifted(amount, data1),
data1.shifted(amount, internal_data0(shiftIn))};
}
if (amount == static_cast<int>(storage_type0::Size)) {
return {storage_type0(data1), storage_type1(internal_data0(shiftIn))};
}
if (amount < SSize) {
return {data1.shifted(amount - static_cast<int>(storage_type0::Size), internal_data0(shiftIn)),
internal_data0(shiftIn)
.shifted(amount - static_cast<int>(storage_type0::Size), internal_data1(shiftIn))};
}
if (amount == SSize) {
return shiftIn;
}
if (amount < 2 * SSize) {
return shiftIn.shifted(amount - SSize);
}
return Zero();
}
Vc_INTRINSIC fixed_size_simd<T, N> rotated(int amount) const
{
amount %= int(size());
if (amount == 0) {
return *this;
} else if (amount < 0) {
amount += size();
}
#ifdef Vc_MSVC
alignas(MemoryAlignment) T tmp[N + data0.size()];
data0.store(&tmp[0], Vc::Aligned);
data1.store(&tmp[data0.size()], Vc::Aligned);
data0.store(&tmp[N], Vc::Unaligned);
fixed_size_simd<T, N> r;
r.data0.load(&tmp[amount], Vc::Unaligned);
r.data1.load(&tmp[(amount + data0.size()) % size()], Vc::Unaligned);
return r;
#else
auto &&d0cvtd = simd_cast<storage_type1>(data0);
auto &&d1cvtd = simd_cast<storage_type0>(data1);
constexpr int size0 = storage_type0::size();
constexpr int size1 = storage_type1::size();
if (amount == size0 && std::is_same<storage_type0, storage_type1>::value) {
return {std::move(d1cvtd), std::move(d0cvtd)};
} else if (amount < size1) {
return {data0.shifted(amount, d1cvtd), data1.shifted(amount, d0cvtd)};
} else if (amount == size1) {
return {data0.shifted(amount, d1cvtd), std::move(d0cvtd)};
} else if (int(size()) - amount < size1) {
return {data0.shifted(amount - int(size()), d1cvtd.shifted(size1 - size0)),
data1.shifted(amount - int(size()), data0.shifted(size0 - size1))};
} else if (int(size()) - amount == size1) {
return {data0.shifted(-size1, d1cvtd.shifted(size1 - size0)),
simd_cast<storage_type1>(data0.shifted(size0 - size1))};
} else if (amount <= size0) {
return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0),
simd_cast<storage_type1>(data0.shifted(amount - size1))};
} else {
return {data0.shifted(size1, d1cvtd).shifted(amount - size1, data0),
simd_cast<storage_type1>(data0.shifted(amount - size1, d1cvtd))};
}
return *this;
#endif
}
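// rotated(amount) is the cyclic counterpart of shifted(): lanes shifted out at one end
// re-enter at the other, i.e. v.rotated(k)[i] yields v[(i + k) % size()].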
Vc_INTRINSIC fixed_size_simd<T, N> interleaveLow(const SimdArray &x) const
{
return {data0.interleaveLow(x.data0),
simd_cast<storage_type1>(data0.interleaveHigh(x.data0))};
}
Vc_INTRINSIC fixed_size_simd<T, N> interleaveHigh(const SimdArray &x) const
{
return interleaveHighImpl(
x,
std::integral_constant<bool, storage_type0::Size == storage_type1::Size>());
}
private:
Vc_INTRINSIC fixed_size_simd<T, N> interleaveHighImpl(const SimdArray &x, std::true_type) const
{
return {data1.interleaveLow(x.data1), data1.interleaveHigh(x.data1)};
}
inline fixed_size_simd<T, N> interleaveHighImpl(const SimdArray &x, std::false_type) const
{
return {data0.interleaveHigh(x.data0)
.shifted(storage_type1::Size,
simd_cast<storage_type0>(data1.interleaveLow(x.data1))),
data1.interleaveHigh(x.data1)};
}
public:
inline fixed_size_simd<T, N> reversed() const
{
if (std::is_same<storage_type0, storage_type1>::value) {
return {simd_cast<storage_type0>(data1).reversed(),
simd_cast<storage_type1>(data0).reversed()};
} else {
#ifdef Vc_MSVC
alignas(MemoryAlignment) T tmp[N];
data1.reversed().store(&tmp[0], Vc::Aligned);
data0.reversed().store(&tmp[data1.size()], Vc::Unaligned);
return fixed_size_simd<T, N>{&tmp[0], Vc::Aligned};
#else
return {data0.shifted(storage_type1::Size, data1).reversed(),
simd_cast<storage_type1>(data0.reversed().shifted(
storage_type0::Size - storage_type1::Size))};
#endif
}
}
inline fixed_size_simd<T, N> sorted() const
{
return sortedImpl(
std::integral_constant<bool, storage_type0::Size == storage_type1::Size>());
}
Vc_INTRINSIC fixed_size_simd<T, N> sortedImpl(std::true_type) const
{
#ifdef Vc_DEBUG_SORTED
std::cerr << "-- " << data0 << data1 << '\n';
#endif
const auto a = data0.sorted();
const auto b = data1.sorted().reversed();
const auto lo = Vc::min(a, b);
const auto hi = Vc::max(a, b);
return {lo.sorted(), hi.sorted()};
}
Vc_INTRINSIC fixed_size_simd<T, N> sortedImpl(std::false_type) const
{
using SortableArray =
fixed_size_simd<value_type, Common::NextPowerOfTwo<size()>::value>;
auto sortable = simd_cast<SortableArray>(*this);
for (std::size_t i = Size; i < SortableArray::Size; ++i) {
using limits = std::numeric_limits<value_type>;
if (limits::has_infinity) {
sortable[i] = limits::infinity();
} else {
sortable[i] = std::numeric_limits<value_type>::max();
}
}
return simd_cast<fixed_size_simd<T, N>>(sortable.sorted());
}
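// sorted() merges the two halves when they have equal width: both halves are sorted, the
// second is reversed, and the element-wise min/max of the two are sorted again (a bitonic
// merge step). For unequal halves it pads the value to the next power of two with +infinity
// (or the largest finite value for types without infinity), sorts that, and casts the first
// N elements back.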
static constexpr std::size_t Size = size();
Vc_DEPRECATED("use exponent(x) instead")
Vc_INTRINSIC fixed_size_simd<T, N> exponent() const
{
return {exponent(data0), exponent(data1)};
}
Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC MaskType isNegative() const
{
return {isnegative(data0), isnegative(data1)};
}
Vc_DEPRECATED("use copysign(x, y) instead")
Vc_INTRINSIC fixed_size_simd<T, N> copySign(const SimdArray &x) const
{
return {Vc::copysign(data0, x.data0),
Vc::copysign(data1, x.data1)};
}
friend storage_type0 &internal_data0<>(SimdArray &x);
friend storage_type1 &internal_data1<>(SimdArray &x);
friend const storage_type0 &internal_data0<>(const SimdArray &x);
friend const storage_type1 &internal_data1<>(const SimdArray &x);
Vc_INTRINSIC SimdArray(storage_type0 &&x, storage_type1 &&y)
: data0(std::move(x)), data1(std::move(y))
{
}
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(storage_type0));
private:
alignas(static_cast<std::size_t>(
Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(V) /
V::size()>::value)) storage_type0 data0;
storage_type1 data1;
};
#undef Vc_CURRENT_CLASS_NAME
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdArray<T, N, V, M>::Size;
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdArray<T, N, V, M>::MemoryAlignment;
template <typename T, std::size_t N, typename VectorType, std::size_t M>
template <class MT, class IT, int Scale>
inline void SimdArray<T, N, VectorType, M>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args)
{
data0.gather(Common::make_gather<Scale>(
args.address, Split::lo(Common::Operations::gather(), args.indexes)));
data1.gather(Common::make_gather<Scale>(
args.address, Split::hi(Common::Operations::gather(), args.indexes)));
}
template <typename T, std::size_t N, typename VectorType, std::size_t M>
template <class MT, class IT, int Scale>
inline void SimdArray<T, N, VectorType, M>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
data0.gather(Common::make_gather<Scale>(
args.address, Split::lo(Common::Operations::gather(), args.indexes)),
Split::lo(mask));
data1.gather(Common::make_gather<Scale>(
args.address, Split::hi(Common::Operations::gather(), args.indexes)),
Split::hi(mask));
}
template <typename T, std::size_t N, typename VectorType, std::size_t M>
template <typename MT, typename IT>
inline void SimdArray<T, N, VectorType, M>::scatterImplementation(MT *mem,
IT &&indexes) const
{
data0.scatter(mem, Split::lo(Common::Operations::gather(),
indexes));
data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward<IT>(indexes)));
}
template <typename T, std::size_t N, typename VectorType, std::size_t M>
template <typename MT, typename IT>
inline void SimdArray<T, N, VectorType, M>::scatterImplementation(MT *mem,
IT &&indexes, MaskArgument mask) const
{
data0.scatter(mem, Split::lo(Common::Operations::gather(), indexes),
Split::lo(mask));
data1.scatter(mem, Split::hi(Common::Operations::gather(), std::forward<IT>(indexes)),
Split::hi(mask));
}
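// internal_data0()/internal_data1() expose the two storage halves of a SimdArray; the
// non-member operators, casts and transpose helpers below are implemented in terms of them.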
template <typename T, std::size_t N, typename V, std::size_t M>
#ifndef Vc_MSVC
Vc_INTRINSIC
#endif
typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
SimdArray<T, N, V, M> &x)
{
return x.data0;
}
template <typename T, std::size_t N, typename V, std::size_t M>
#ifndef Vc_MSVC
Vc_INTRINSIC
#endif
typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
SimdArray<T, N, V, M> &x)
{
return x.data1;
}
template <typename T, std::size_t N, typename V, std::size_t M>
#ifndef Vc_MSVC
Vc_INTRINSIC
#endif
const typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
const SimdArray<T, N, V, M> &x)
{
return x.data0;
}
template <typename T, std::size_t N, typename V, std::size_t M>
#ifndef Vc_MSVC
Vc_INTRINSIC
#endif
const typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
const SimdArray<T, N, V, M> &x)
{
return x.data1;
}
#if defined Vc_MSVC && defined Vc_IMPL_SSE && !defined Vc_IMPL_AVX
template <>
Vc_INTRINSIC SimdArray<double, 8>::SimdArray(fixed_size_simd<double, 4> &&x,
fixed_size_simd<double, 4> &&y)
: data0(x), data1(0)
{
data1 = y;
}
#endif
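// Detail::operator overloads for fixed_size_simd: the atomic case (a single native vector)
// forwards to the wrapped vector via internal_data(), while the non-atomic case applies the
// operator to both halves via internal_data0()/internal_data1(). The first block generates
// the arithmetic, bitwise and shift operators, the second the comparisons returning masks.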
namespace Detail
{
#define Vc_FIXED_OP(op) \
template <class T, int N, \
class = typename std::enable_if<fixed_size_simd<T, N>::is_atomic>::type> \
fixed_size_simd<T, N> operator op(const fixed_size_simd<T, N> &a, \
const fixed_size_simd<T, N> &b) \
{ \
return {private_init, internal_data(a) op internal_data(b)}; \
} \
template <class T, int N, \
class = typename std::enable_if<!fixed_size_simd<T, N>::is_atomic>::type, \
class = T> \
fixed_size_simd<T, N> operator op(const fixed_size_simd<T, N> &a, \
const fixed_size_simd<T, N> &b) \
{ \
return {internal_data0(a) op internal_data0(b), \
internal_data1(a) op internal_data1(b)}; \
}
Vc_ALL_ARITHMETICS(Vc_FIXED_OP);
Vc_ALL_BINARY(Vc_FIXED_OP);
Vc_ALL_SHIFTS(Vc_FIXED_OP);
#undef Vc_FIXED_OP
#define Vc_FIXED_OP(op) \
template <class T, int N, \
class = typename std::enable_if<fixed_size_simd<T, N>::is_atomic>::type> \
fixed_size_simd_mask<T, N> operator op(const fixed_size_simd<T, N> &a, \
const fixed_size_simd<T, N> &b) \
{ \
return {private_init, internal_data(a) op internal_data(b)}; \
} \
template <class T, int N, \
class = typename std::enable_if<!fixed_size_simd<T, N>::is_atomic>::type, \
class = T> \
fixed_size_simd_mask<T, N> operator op(const fixed_size_simd<T, N> &a, \
const fixed_size_simd<T, N> &b) \
{ \
return {internal_data0(a) op internal_data0(b), \
internal_data1(a) op internal_data1(b)}; \
}
Vc_ALL_COMPARES(Vc_FIXED_OP);
#undef Vc_FIXED_OP
}
namespace result_vector_type_internal
{
template <typename T>
using remove_cvref = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
template <typename T>
using is_integer_larger_than_int = std::integral_constant<
bool, std::is_integral<T>::value &&(sizeof(T) > sizeof(int) ||
std::is_same<T, long>::value ||
std::is_same<T, unsigned long>::value)>;
template <
typename L, typename R,
std::size_t N = Traits::isSimdArray<L>::value ? Traits::simd_vector_size<L>::value
: Traits::simd_vector_size<R>::value,
bool = (Traits::isSimdArray<L>::value ||
Traits::isSimdArray<R>::value) &&
!(Traits::is_fixed_size_simd<L>::value &&
Traits::is_fixed_size_simd<R>::value) &&
((std::is_arithmetic<remove_cvref<L>>::value &&
!is_integer_larger_than_int<remove_cvref<L>>::value) ||
(std::is_arithmetic<remove_cvref<R>>::value &&
!is_integer_larger_than_int<remove_cvref<R>>::value) ||
Traits::simd_vector_size<L>::value == Traits::simd_vector_size<R>::value)>
struct evaluate;
template <typename L, typename R, std::size_t N> struct evaluate<L, R, N, true>
{
private:
using LScalar = Traits::entry_type_of<L>;
using RScalar = Traits::entry_type_of<R>;
template <bool B, typename T, typename F>
using conditional = typename std::conditional<B, T, F>::type;
public:
using type = fixed_size_simd<
conditional<(std::is_integral<LScalar>::value &&std::is_integral<RScalar>::value &&
sizeof(LScalar) < sizeof(int) &&
sizeof(RScalar) < sizeof(int)),
conditional<(sizeof(LScalar) == sizeof(RScalar)),
conditional<std::is_unsigned<LScalar>::value, LScalar, RScalar>,
conditional<(sizeof(LScalar) > sizeof(RScalar)), LScalar, RScalar>>,
decltype(std::declval<LScalar>() + std::declval<RScalar>())>,
N>;
};
}
template <typename L, typename R>
using result_vector_type = typename result_vector_type_internal::evaluate<L, R>::type;
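// result_vector_type<L, R> is the fixed_size_simd type used when a SimdArray is combined with
// a scalar or with another vector of the same size. For two integral entry types smaller than
// int it keeps the wider type (or, at equal width, the unsigned one) instead of promoting to
// int; otherwise it uses decltype(LScalar + RScalar). Illustrative sketch (not part of the
// upstream header):
//   Vc::SimdArray<float, 4> x(1.f);
//   auto y = x * 2.f;  // y is Vc::fixed_size_simd<float, 4>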
#define Vc_BINARY_OPERATORS_(op_) \
\
template <typename L, typename R> \
Vc_INTRINSIC result_vector_type<L, R> operator op_(L &&lhs, R &&rhs) \
{ \
using Return = result_vector_type<L, R>; \
return Vc::Detail::operator op_( \
static_cast<const Return &>(std::forward<L>(lhs)), \
static_cast<const Return &>(std::forward<R>(rhs))); \
}
Vc_ALL_ARITHMETICS(Vc_BINARY_OPERATORS_);
Vc_ALL_BINARY(Vc_BINARY_OPERATORS_);
#undef Vc_BINARY_OPERATORS_
#define Vc_BINARY_OPERATORS_(op_) \
\
template <typename L, typename R> \
Vc_INTRINSIC typename result_vector_type<L, R>::mask_type operator op_(L &&lhs, \
R &&rhs) \
{ \
using Promote = result_vector_type<L, R>; \
return Promote(std::forward<L>(lhs)) op_ Promote(std::forward<R>(rhs)); \
}
Vc_ALL_COMPARES(Vc_BINARY_OPERATORS_);
#undef Vc_BINARY_OPERATORS_
#define Vc_FORWARD_UNARY_OPERATOR(name_) \
\
template <typename T, std::size_t N, typename V, std::size_t M> \
inline fixed_size_simd<T, N> name_(const SimdArray<T, N, V, M> &x) \
{ \
return fixed_size_simd<T, N>::fromOperation( \
Common::Operations::Forward_##name_(), x); \
} \
template <class T, int N> \
fixed_size_simd<T, N> name_(const fixed_size_simd<T, N> &x) \
{ \
return fixed_size_simd<T, N>::fromOperation( \
Common::Operations::Forward_##name_(), x); \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_FORWARD_UNARY_BOOL_OPERATOR(name_) \
\
template <typename T, std::size_t N, typename V, std::size_t M> \
inline fixed_size_simd_mask<T, N> name_(const SimdArray<T, N, V, M> &x) \
{ \
return fixed_size_simd_mask<T, N>::fromOperation( \
Common::Operations::Forward_##name_(), x); \
} \
template <class T, int N> \
fixed_size_simd_mask<T, N> name_(const fixed_size_simd<T, N> &x) \
{ \
return fixed_size_simd_mask<T, N>::fromOperation( \
Common::Operations::Forward_##name_(), x); \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_FORWARD_BINARY_OPERATOR(name_) \
\
template <typename T, std::size_t N, typename V, std::size_t M> \
inline fixed_size_simd<T, N> name_(const SimdArray<T, N, V, M> &x, \
const SimdArray<T, N, V, M> &y) \
{ \
return fixed_size_simd<T, N>::fromOperation( \
Common::Operations::Forward_##name_(), x, y); \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_FORWARD_UNARY_OPERATOR(abs);
Vc_FORWARD_UNARY_OPERATOR(asin);
Vc_FORWARD_UNARY_OPERATOR(atan);
Vc_FORWARD_BINARY_OPERATOR(atan2);
Vc_FORWARD_UNARY_OPERATOR(ceil);
Vc_FORWARD_BINARY_OPERATOR(copysign);
Vc_FORWARD_UNARY_OPERATOR(cos);
Vc_FORWARD_UNARY_OPERATOR(exp);
Vc_FORWARD_UNARY_OPERATOR(exponent);
Vc_FORWARD_UNARY_OPERATOR(floor);
template <typename T, std::size_t N>
inline SimdArray<T, N> fma(const SimdArray<T, N> &a, const SimdArray<T, N> &b,
const SimdArray<T, N> &c)
{
return SimdArray<T, N>::fromOperation(Common::Operations::Forward_fma(), a, b, c);
}
Vc_FORWARD_UNARY_BOOL_OPERATOR(isfinite);
Vc_FORWARD_UNARY_BOOL_OPERATOR(isinf);
Vc_FORWARD_UNARY_BOOL_OPERATOR(isnan);
Vc_FORWARD_UNARY_BOOL_OPERATOR(isnegative);
template <typename T, std::size_t N>
inline SimdArray<T, N> frexp(const SimdArray<T, N> &x, SimdArray<int, N> *e)
{
return SimdArray<T, N>::fromOperation(Common::Operations::Forward_frexp(), x, e);
}
template <typename T, std::size_t N>
inline SimdArray<T, N> ldexp(const SimdArray<T, N> &x, const SimdArray<int, N> &e)
{
return SimdArray<T, N>::fromOperation(Common::Operations::Forward_ldexp(), x, e);
}
Vc_FORWARD_UNARY_OPERATOR(log);
Vc_FORWARD_UNARY_OPERATOR(log10);
Vc_FORWARD_UNARY_OPERATOR(log2);
Vc_FORWARD_UNARY_OPERATOR(reciprocal);
Vc_FORWARD_UNARY_OPERATOR(round);
Vc_FORWARD_UNARY_OPERATOR(rsqrt);
Vc_FORWARD_UNARY_OPERATOR(sin);
template <typename T, std::size_t N>
void sincos(const SimdArray<T, N> &x, SimdArray<T, N> *sin, SimdArray<T, N> *cos)
{
SimdArray<T, N>::callOperation(Common::Operations::Forward_sincos(), x, sin, cos);
}
Vc_FORWARD_UNARY_OPERATOR(sqrt);
Vc_FORWARD_UNARY_OPERATOR(trunc);
Vc_FORWARD_BINARY_OPERATOR(min);
Vc_FORWARD_BINARY_OPERATOR(max);
#undef Vc_FORWARD_UNARY_OPERATOR
#undef Vc_FORWARD_UNARY_BOOL_OPERATOR
#undef Vc_FORWARD_BINARY_OPERATOR
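// Illustrative sketch (not part of the upstream header): the forwarded math functions apply
// element-wise to all N lanes, e.g.
//   Vc::SimdArray<float, 8> x(0.5f);
//   auto s = Vc::sin(x);    // fixed_size_simd<float, 8>
//   auto m = Vc::isnan(x);  // fixed_size_simd_mask<float, 8>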
#ifdef Vc_MSVC
#define Vc_DUMMY_ARG0 , int = 0
#define Vc_DUMMY_ARG1 , long = 0
#define Vc_DUMMY_ARG2 , short = 0
#define Vc_DUMMY_ARG3 , char = '0'
#define Vc_DUMMY_ARG4 , unsigned = 0u
#define Vc_DUMMY_ARG5 , unsigned short = 0u
#else
#define Vc_DUMMY_ARG0
#define Vc_DUMMY_ARG1
#define Vc_DUMMY_ARG2
#define Vc_DUMMY_ARG3
#define Vc_DUMMY_ARG4
#define Vc_DUMMY_ARG5
#endif
template <typename Return, std::size_t N, typename T, typename... From>
Vc_INTRINSIC Vc_CONST enable_if<sizeof...(From) != 0, Return>
simd_cast_impl_smaller_input(const From &... xs, const T &last)
{
Return r = simd_cast<Return>(xs...);
for (size_t i = 0; i < N; ++i) {
r[i + N * sizeof...(From)] = static_cast<typename Return::EntryType>(last[i]);
}
return r;
}
template <typename Return, std::size_t N, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast_impl_smaller_input(const T &last)
{
Return r = Return();
for (size_t i = 0; i < N; ++i) {
r[i] = static_cast<typename Return::EntryType>(last[i]);
}
return r;
}
template <typename Return, std::size_t N, typename T, typename... From>
Vc_INTRINSIC Vc_CONST enable_if<sizeof...(From) != 0, Return> simd_cast_impl_larger_input(
const From &... xs, const T &last)
{
Return r = simd_cast<Return>(xs...);
for (size_t i = N * sizeof...(From); i < Return::Size; ++i) {
r[i] = static_cast<typename Return::EntryType>(last[i - N * sizeof...(From)]);
}
return r;
}
template <typename Return, std::size_t N, typename T>
Vc_INTRINSIC Vc_CONST Return simd_cast_impl_larger_input(const T &last)
{
Return r = Return();
for (size_t i = 0; i < Return::size(); ++i) {
r[i] = static_cast<typename Return::EntryType>(last[i]);
}
return r;
}
template <typename Return, typename T, typename... From>
Vc_INTRINSIC_L Vc_CONST_L Return
simd_cast_without_last(const From &... xs, const T &) Vc_INTRINSIC_R Vc_CONST_R;
template <typename... Ts> struct are_all_types_equal;
template <typename T>
struct are_all_types_equal<T> : public std::integral_constant<bool, true>
{
};
template <typename T0, typename T1, typename... Ts>
struct are_all_types_equal<T0, T1, Ts...>
: public std::integral_constant<
bool, std::is_same<T0, T1>::value && are_all_types_equal<T1, Ts...>::value>
{
};
template <typename Return, typename... Ts>
Vc_INTRINSIC Vc_CONST Return
simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b);
template <typename Return, std::size_t offset, typename From, typename... Froms>
Vc_INTRINSIC Vc_CONST
enable_if<(are_all_types_equal<From, Froms...>::value && offset == 0), Return>
simd_cast_with_offset(const From &x, const Froms &... xs);
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST
enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0), Return>
simd_cast_with_offset(const From &x);
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST
enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
((Traits::isSimdArray<Return>::value &&
!Traits::isAtomicSimdArray<Return>::value) ||
(Traits::isSimdMaskArray<Return>::value &&
!Traits::isAtomicSimdMaskArray<Return>::value))),
Return>
simd_cast_with_offset(const From &x);
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST
enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
((Traits::isSimdArray<Return>::value &&
Traits::isAtomicSimdArray<Return>::value) ||
(Traits::isSimdMaskArray<Return>::value &&
Traits::isAtomicSimdMaskArray<Return>::value))),
Return>
simd_cast_with_offset(const From &x);
template <typename Return, std::size_t offset, typename From, typename... Froms>
Vc_INTRINSIC Vc_CONST enable_if<
(are_all_types_equal<From, Froms...>::value && From::Size <= offset), Return>
simd_cast_with_offset(const From &, const Froms &... xs)
{
return simd_cast_with_offset<Return, offset - From::Size>(xs...);
}
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST enable_if<(From::Size <= offset), Return> simd_cast_with_offset(
const From &)
{
return Return(0);
}
template <typename T, typename... Ts> struct first_type_of_impl
{
using type = T;
};
template <typename... Ts> using first_type_of = typename first_type_of_impl<Ts...>::type;
template <typename Return, typename From>
Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x);
template <typename Return, typename... Froms>
Vc_INTRINSIC Vc_CONST
enable_if<(are_all_types_equal<Froms...>::value &&
sizeof...(Froms) * first_type_of<Froms...>::Size < Return::Size),
Return>
simd_cast_drop_arguments(Froms... xs, first_type_of<Froms...> x);
template <typename Return, typename From, typename... Froms>
Vc_INTRINSIC Vc_CONST enable_if<
(are_all_types_equal<From, Froms...>::value &&
(1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0),
Return>
simd_cast_drop_arguments(Froms... xs, From x, From);
template <typename Return, typename From>
Vc_INTRINSIC Vc_CONST
enable_if<(are_all_types_equal<From>::value && From::Size >= Return::Size), Return>
simd_cast_drop_arguments(From x, From);
namespace
{
#ifdef Vc_DEBUG_SIMD_CAST
void debugDoNothing(const std::initializer_list<void *> &) {}
template <typename T0, typename... Ts>
inline void vc_debug_(const char *prefix, const char *suffix, const T0 &arg0,
const Ts &... args)
{
std::cerr << prefix << arg0;
debugDoNothing({&(std::cerr << ", " << args)...});
std::cerr << suffix;
}
#else
template <typename T0, typename... Ts>
Vc_INTRINSIC void vc_debug_(const char *, const char *, const T0 &, const Ts &...)
{
}
#endif
}
template <size_t A, size_t B>
struct is_less : public std::integral_constant<bool, (A < B)> {
};
template <size_t N>
struct is_power_of_2 : public std::integral_constant<bool, ((N - 1) & N) == 0> {
};
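// simd_cast overloads for SimdArray/SimdMaskArray follow. The enable_if dispatch distinguishes
// atomic targets (cast directly into the single storage member), non-atomic targets that are
// filled as storage_type0/storage_type1 separately, offset casts that select a sub-range of
// the input, and generic fallbacks that copy element-wise (simd_cast_impl_smaller_input /
// simd_cast_impl_larger_input).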
#define Vc_SIMDARRAY_CASTS(SimdArrayType_,NativeType_) \
template <typename Return, typename T, typename A, typename... Froms> \
Vc_INTRINSIC Vc_CONST enable_if< \
(Traits::isAtomic##SimdArrayType_<Return>::value && \
is_less<NativeType_<T, A>::Size * sizeof...(Froms), Return::Size>::value && \
are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
!detail::is_fixed_size_abi<A>::value), \
Return> \
simd_cast(NativeType_<T, A> x, Froms... xs) \
{ \
vc_debug_("simd_cast{1}(", ")\n", x, xs...); \
return {private_init, simd_cast<typename Return::storage_type>(x, xs...)}; \
} \
template <typename Return, typename T, typename A, typename... Froms> \
Vc_INTRINSIC Vc_CONST enable_if< \
(Traits::isAtomic##SimdArrayType_<Return>::value && \
!is_less<NativeType_<T, A>::Size * sizeof...(Froms), Return::Size>::value && \
are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
!detail::is_fixed_size_abi<A>::value), \
Return> \
simd_cast(NativeType_<T, A> x, Froms... xs) \
{ \
vc_debug_("simd_cast{2}(", ")\n", x, xs...); \
return {simd_cast_without_last<Return, NativeType_<T, A>, Froms...>(x, xs...)}; \
} \
template <typename Return, typename T, typename A, typename... Froms> \
Vc_INTRINSIC Vc_CONST \
enable_if<(Traits::is##SimdArrayType_<Return>::value && \
!Traits::isAtomic##SimdArrayType_<Return>::value && \
is_less<Common::left_size<Return::Size>(), \
NativeType_<T, A>::Size *(1 + sizeof...(Froms))>::value && \
are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
!detail::is_fixed_size_abi<A>::value), \
Return> \
simd_cast(NativeType_<T, A> x, Froms... xs) \
{ \
vc_debug_("simd_cast{3}(", ")\n", x, xs...); \
using R0 = typename Return::storage_type0; \
using R1 = typename Return::storage_type1; \
return {simd_cast_drop_arguments<R0, Froms...>(x, xs...), \
simd_cast_with_offset<R1, R0::Size>(x, xs...)}; \
} \
template <typename Return, typename T, typename A, typename... Froms> \
Vc_INTRINSIC Vc_CONST \
enable_if<(Traits::is##SimdArrayType_<Return>::value && \
!Traits::isAtomic##SimdArrayType_<Return>::value && \
!is_less<Common::left_size<Return::Size>(), \
NativeType_<T, A>::Size *(1 + sizeof...(Froms))>::value && \
are_all_types_equal<NativeType_<T, A>, Froms...>::value && \
!detail::is_fixed_size_abi<A>::value), \
Return> \
simd_cast(NativeType_<T, A> x, Froms... xs) \
{ \
vc_debug_("simd_cast{4}(", ")\n", x, xs...); \
using R0 = typename Return::storage_type0; \
using R1 = typename Return::storage_type1; \
return {simd_cast<R0>(x, xs...), R1(0)}; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector);
Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask);
#undef Vc_SIMDARRAY_CASTS
#define Vc_SIMDARRAY_CASTS(SimdArrayType_,NativeType_) \
\
template <typename Return, int offset, typename T, typename A> \
Vc_INTRINSIC Vc_CONST \
enable_if<Traits::isAtomic##SimdArrayType_<Return>::value, Return> \
simd_cast(NativeType_<T, A> x Vc_DUMMY_ARG0) \
{ \
vc_debug_("simd_cast{offset, atomic}(", ")\n", offset, x); \
return {private_init, simd_cast<typename Return::storage_type, offset>(x)}; \
} \
\
template <typename Return, int offset, typename T, typename A> \
Vc_INTRINSIC Vc_CONST \
enable_if<(Traits::is##SimdArrayType_<Return>::value && \
!Traits::isAtomic##SimdArrayType_<Return>::value && \
Return::Size * offset + Common::left_size<Return::Size>() < \
NativeType_<T, A>::Size), \
Return> \
simd_cast(NativeType_<T, A> x Vc_DUMMY_ARG1) \
{ \
vc_debug_("simd_cast{offset, split Return}(", ")\n", offset, x); \
using R0 = typename Return::storage_type0; \
constexpr int entries_offset = offset * Return::Size; \
constexpr int entries_offset_right = entries_offset + R0::Size; \
return { \
simd_cast_with_offset<typename Return::storage_type0, entries_offset>(x), \
simd_cast_with_offset<typename Return::storage_type1, entries_offset_right>( \
x)}; \
} \
\
\
template <typename Return, int offset, typename T, typename A> \
Vc_INTRINSIC Vc_CONST \
enable_if<(Traits::is##SimdArrayType_<Return>::value && \
!Traits::isAtomic##SimdArrayType_<Return>::value && \
Return::Size * offset + Common::left_size<Return::Size>() >= \
NativeType_<T, A>::Size), \
Return> \
simd_cast(NativeType_<T, A> x Vc_DUMMY_ARG2) \
{ \
vc_debug_("simd_cast{offset, R1::Zero}(", ")\n", offset, x); \
using R0 = typename Return::storage_type0; \
using R1 = typename Return::storage_type1; \
constexpr int entries_offset = offset * Return::Size; \
return {simd_cast_with_offset<R0, entries_offset>(x), R1(0)}; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_SIMDARRAY_CASTS(SimdArray, Vc::Vector);
Vc_SIMDARRAY_CASTS(SimdMaskArray, Vc::Mask);
#undef Vc_SIMDARRAY_CASTS
#define Vc_SIMDARRAY_CASTS(SimdArrayType_) \
\
template <typename Return, typename T, std::size_t N, typename V, typename... From> \
Vc_INTRINSIC Vc_CONST \
enable_if<(are_all_types_equal<SimdArrayType_<T, N, V, N>, From...>::value && \
(sizeof...(From) == 0 || N * sizeof...(From) < Return::Size) && \
!std::is_same<Return, SimdArrayType_<T, N, V, N>>::value), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, N> &x0, const From &... xs) \
{ \
vc_debug_("simd_cast{indivisible}(", ")\n", x0, xs...); \
return simd_cast<Return>(internal_data(x0), internal_data(xs)...); \
} \
\
template <typename Return, typename T, std::size_t N, typename V, typename... From> \
Vc_INTRINSIC Vc_CONST \
enable_if<(are_all_types_equal<SimdArrayType_<T, N, V, N>, From...>::value && \
(sizeof...(From) > 0 && (N * sizeof...(From) >= Return::Size)) && \
!std::is_same<Return, SimdArrayType_<T, N, V, N>>::value), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, N> &x0, const From &... xs) \
{ \
vc_debug_("simd_cast{indivisible2}(", ")\n", x0, xs...); \
return simd_cast_without_last<Return, \
typename SimdArrayType_<T, N, V, N>::storage_type, \
typename From::storage_type...>( \
internal_data(x0), internal_data(xs)...); \
} \
\
template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
typename... From> \
Vc_INTRINSIC Vc_CONST enable_if< \
(N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
!std::is_same<Return, SimdArrayType_<T, N, V, M>>::value && \
is_less<N * sizeof...(From), Return::Size>::value && is_power_of_2<N>::value), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
{ \
vc_debug_("simd_cast{bisectable}(", ")\n", x0, xs...); \
return simd_cast_interleaved_argument_order< \
Return, typename SimdArrayType_<T, N, V, M>::storage_type0, \
typename From::storage_type0...>(internal_data0(x0), internal_data0(xs)..., \
internal_data1(x0), internal_data1(xs)...); \
} \
\
template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
typename... From> \
Vc_INTRINSIC Vc_CONST enable_if< \
(N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
!is_less<N * sizeof...(From), Return::Size>::value && is_power_of_2<N>::value), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
{ \
vc_debug_("simd_cast{bisectable2}(", ")\n", x0, xs...); \
return simd_cast_without_last<Return, SimdArrayType_<T, N, V, M>, From...>( \
x0, xs...); \
} \
\
template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
typename... From> \
Vc_INTRINSIC Vc_CONST enable_if< \
(N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
N * (1 + sizeof...(From)) <= Return::Size && !is_power_of_2<N>::value), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
{ \
vc_debug_("simd_cast{remaining}(", ")\n", x0, xs...); \
return simd_cast_impl_smaller_input<Return, N, SimdArrayType_<T, N, V, M>, \
From...>(x0, xs...); \
} \
\
template <typename Return, typename T, std::size_t N, typename V, std::size_t M, \
typename... From> \
Vc_INTRINSIC Vc_CONST enable_if< \
(N != M && are_all_types_equal<SimdArrayType_<T, N, V, M>, From...>::value && \
N * (1 + sizeof...(From)) > Return::Size && !is_power_of_2<N>::value), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x0, const From &... xs) \
{ \
vc_debug_("simd_cast{remaining2}(", ")\n", x0, xs...); \
return simd_cast_impl_larger_input<Return, N, SimdArrayType_<T, N, V, M>, \
From...>(x0, xs...); \
} \
\
template <typename Return, typename T, std::size_t N, typename V, std::size_t M> \
Vc_INTRINSIC Vc_CONST \
enable_if<(N != M && N >= 2 * Return::Size && is_power_of_2<N>::value), Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x) \
{ \
vc_debug_("simd_cast{single bisectable}(", ")\n", x); \
return simd_cast<Return>(internal_data0(x)); \
} \
template <typename Return, typename T, std::size_t N, typename V, std::size_t M> \
Vc_INTRINSIC Vc_CONST enable_if<(N != M && N > Return::Size && \
N < 2 * Return::Size && is_power_of_2<N>::value), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x) \
{ \
vc_debug_("simd_cast{single bisectable2}(", ")\n", x); \
return simd_cast<Return>(internal_data0(x), internal_data1(x)); \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_SIMDARRAY_CASTS(SimdArray);
Vc_SIMDARRAY_CASTS(SimdMaskArray);
#undef Vc_SIMDARRAY_CASTS
template <class Return, class T, int N, class... Ts,
class = enable_if<!std::is_same<Return, fixed_size_simd<T, N>>::value>>
Vc_INTRINSIC Return simd_cast(const fixed_size_simd<T, N> &x, const Ts &... xs)
{
return simd_cast<Return>(static_cast<const SimdArray<T, N> &>(x),
static_cast<const SimdArray<T, N> &>(xs)...);
}
template <class Return, class T, int N, class... Ts,
class = enable_if<!std::is_same<Return, fixed_size_simd_mask<T, N>>::value>>
Vc_INTRINSIC Return simd_cast(const fixed_size_simd_mask<T, N> &x, const Ts &... xs)
{
return simd_cast<Return>(static_cast<const SimdMaskArray<T, N> &>(x),
static_cast<const SimdMaskArray<T, N> &>(xs)...);
}
#define Vc_SIMDARRAY_CASTS(SimdArrayType_) \
\
template <typename Return, int offset, typename T, std::size_t N, typename V, \
std::size_t M> \
Vc_INTRINSIC Vc_CONST enable_if<(offset == 0), Return> simd_cast( \
const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG0) \
{ \
vc_debug_("simd_cast{offset == 0}(", ")\n", offset, x); \
return simd_cast<Return>(x); \
} \
\
template <typename Return, int offset, typename T, std::size_t N, typename V> \
Vc_INTRINSIC Vc_CONST enable_if<(offset != 0), Return> simd_cast( \
const SimdArrayType_<T, N, V, N> &x Vc_DUMMY_ARG1) \
{ \
vc_debug_("simd_cast{offset, forward}(", ")\n", offset, x); \
return simd_cast<Return, offset>(internal_data(x)); \
} \
\
template <typename Return, int offset, typename T, std::size_t N, typename V, \
std::size_t M> \
Vc_INTRINSIC Vc_CONST \
enable_if<(N != M && offset * Return::Size >= Common::left_size<N>() && \
offset != 0 && Common::left_size<N>() % Return::Size == 0), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG2) \
{ \
vc_debug_("simd_cast{offset, right}(", ")\n", offset, x); \
return simd_cast<Return, offset - Common::left_size<N>() / Return::Size>( \
internal_data1(x)); \
} \
\
template <typename Return, int offset, typename T, std::size_t N, typename V, \
std::size_t M> \
Vc_INTRINSIC Vc_CONST \
enable_if<(N != M && offset * Return::Size >= Common::left_size<N>() && \
offset != 0 && Common::left_size<N>() % Return::Size != 0), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG3) \
{ \
vc_debug_("simd_cast{offset, right, nofit}(", ")\n", offset, x); \
return simd_cast_with_offset<Return, \
offset * Return::Size - Common::left_size<N>()>( \
internal_data1(x)); \
} \
\
template <typename Return, int offset, typename T, std::size_t N, typename V, \
std::size_t M> \
Vc_INTRINSIC Vc_CONST enable_if< \
(N != M && \
offset != 0 && (offset + 1) * Return::Size <= Common::left_size<N>()), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG4) \
{ \
vc_debug_("simd_cast{offset, left}(", ")\n", offset, x); \
return simd_cast<Return, offset>(internal_data0(x)); \
} \
\
template <typename Return, int offset, typename T, std::size_t N, typename V, \
std::size_t M> \
Vc_INTRINSIC Vc_CONST \
enable_if<(N != M && (offset * Return::Size < Common::left_size<N>()) && \
offset != 0 && (offset + 1) * Return::Size > Common::left_size<N>()), \
Return> \
simd_cast(const SimdArrayType_<T, N, V, M> &x Vc_DUMMY_ARG5) \
{ \
vc_debug_("simd_cast{offset, copy scalars}(", ")\n", offset, x); \
using R = typename Return::EntryType; \
Return r = Return(0); \
for (std::size_t i = offset * Return::Size; \
i < std::min(N, (offset + 1) * Return::Size); ++i) { \
r[i - offset * Return::Size] = static_cast<R>(x[i]); \
} \
return r; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_SIMDARRAY_CASTS(SimdArray);
Vc_SIMDARRAY_CASTS(SimdMaskArray);
#undef Vc_SIMDARRAY_CASTS
template <typename Return, typename From>
Vc_INTRINSIC Vc_CONST Return simd_cast_drop_arguments(From x)
{
return simd_cast<Return>(x);
}
template <typename Return, typename... Froms>
Vc_INTRINSIC Vc_CONST
enable_if<(are_all_types_equal<Froms...>::value &&
sizeof...(Froms) * first_type_of<Froms...>::Size < Return::Size),
Return>
simd_cast_drop_arguments(Froms... xs, first_type_of<Froms...> x)
{
return simd_cast<Return>(xs..., x);
}
template <typename Return, typename From, typename... Froms>
Vc_INTRINSIC Vc_CONST enable_if<
(are_all_types_equal<From, Froms...>::value &&
(1 + sizeof...(Froms)) * From::Size >= Return::Size && sizeof...(Froms) != 0),
Return>
simd_cast_drop_arguments(Froms... xs, From x, From)
{
return simd_cast_drop_arguments<Return, Froms...>(xs..., x);
}
template <typename Return, typename From>
Vc_INTRINSIC Vc_CONST
enable_if<(are_all_types_equal<From>::value && From::Size >= Return::Size), Return>
simd_cast_drop_arguments(From x, From)
{
return simd_cast_drop_arguments<Return>(x);
}
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST
enable_if<(From::Size > offset && offset > 0 && offset % Return::Size == 0),
Return> simd_cast_with_offset(const From &x)
{
return simd_cast<Return, offset / Return::Size>(x);
}
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST
enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
((Traits::isSimdArray<Return>::value &&
!Traits::isAtomicSimdArray<Return>::value) ||
(Traits::isSimdMaskArray<Return>::value &&
!Traits::isAtomicSimdMaskArray<Return>::value))),
Return>
simd_cast_with_offset(const From &x)
{
using R0 = typename Return::storage_type0;
using R1 = typename Return::storage_type1;
return {simd_cast_with_offset<R0, offset>(x),
simd_cast_with_offset<R1, offset + R0::Size>(x)};
}
template <typename Return, std::size_t offset, typename From>
Vc_INTRINSIC Vc_CONST
enable_if<(From::Size > offset && offset > 0 && offset % Return::Size != 0 &&
((Traits::isSimdArray<Return>::value &&
Traits::isAtomicSimdArray<Return>::value) ||
(Traits::isSimdMaskArray<Return>::value &&
Traits::isAtomicSimdMaskArray<Return>::value))),
Return>
simd_cast_with_offset(const From &x)
{
return simd_cast<Return, offset / Return::Size>(x.shifted(offset % Return::Size));
}
template <typename Return, std::size_t offset, typename From, typename... Froms>
Vc_INTRINSIC Vc_CONST
enable_if<(are_all_types_equal<From, Froms...>::value && offset == 0), Return>
simd_cast_with_offset(const From &x, const Froms &... xs)
{
return simd_cast<Return>(x, xs...);
}
template <typename Return, typename T, typename... From>
Vc_INTRINSIC Vc_CONST Return simd_cast_without_last(const From &... xs, const T &)
{
return simd_cast<Return>(xs...);
}
#ifdef Vc_MSVC
template <std::size_t I, typename T0>
Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0, const T0 &)
{
return a0;
}
template <std::size_t I, typename T0>
Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &, const T0 &b0)
{
return b0;
}
#endif
template <std::size_t I, typename T0, typename... Ts>
Vc_INTRINSIC Vc_CONST enable_if<(I == 0), T0> extract_interleaved(const T0 &a0,
const Ts &...,
const T0 &,
const Ts &...)
{
return a0;
}
template <std::size_t I, typename T0, typename... Ts>
Vc_INTRINSIC Vc_CONST enable_if<(I == 1), T0> extract_interleaved(const T0 &,
const Ts &...,
const T0 &b0,
const Ts &...)
{
return b0;
}
template <std::size_t I, typename T0, typename... Ts>
Vc_INTRINSIC Vc_CONST enable_if<(I > 1), T0> extract_interleaved(const T0 &,
const Ts &... a,
const T0 &,
const Ts &... b)
{
return extract_interleaved<I - 2, Ts...>(a..., b...);
}
template <typename Return, typename... Ts, std::size_t... Indexes>
Vc_INTRINSIC Vc_CONST Return
simd_cast_interleaved_argument_order_1(index_sequence<Indexes...>, const Ts &... a,
const Ts &... b)
{
return simd_cast<Return>(extract_interleaved<Indexes, Ts...>(a..., b...)...);
}
template <typename Return, typename... Ts>
Vc_INTRINSIC Vc_CONST Return
simd_cast_interleaved_argument_order(const Ts &... a, const Ts &... b)
{
using seq = make_index_sequence<sizeof...(Ts)*2>;
return simd_cast_interleaved_argument_order_1<Return, Ts...>(seq(), a..., b...);
}
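// conditional_assign applies an operation only to the lanes selected by the mask by forwarding
// to the write-masked view, i.e. "lhs(mask) op= rhs" for the compound assignments and
// "lhs(mask)++" etc. for the increment/decrement forms.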
#define Vc_CONDITIONAL_ASSIGN(name_,op_) \
template <Operator O, typename T, std::size_t N, typename V, size_t VN, typename M, \
typename U> \
Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
SimdArray<T, N, V, VN> &lhs, M &&mask, U &&rhs) \
{ \
lhs(mask) op_ rhs; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN( Assign, =);
Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN
#define Vc_CONDITIONAL_ASSIGN(name_,expr_) \
template <Operator O, typename T, std::size_t N, typename V, size_t VN, typename M> \
Vc_INTRINSIC enable_if<O == Operator::name_, SimdArray<T, N, V, VN>> \
conditional_assign(SimdArray<T, N, V, VN> &lhs, M &&mask) \
{ \
return expr_; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN
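// transpose_impl specializations for SimdArray: where the array wraps a single
// native vector, the 4x4 transpose is forwarded to the vector-level implementation;
// the generic fixed_size case splits the rows into low/high halves and handles them
// through the 2x4 transpose, which assigns the individual elements directly.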
namespace Common
{
template <typename T, size_t N, typename V>
inline void transpose_impl(
TransposeTag<4, 4>, SimdArray<T, N, V, N> *Vc_RESTRICT r[],
const TransposeProxy<SimdArray<T, N, V, N>, SimdArray<T, N, V, N>,
SimdArray<T, N, V, N>, SimdArray<T, N, V, N>> &proxy)
{
V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]),
&internal_data(*r[2]), &internal_data(*r[3])};
transpose_impl(TransposeTag<4, 4>(), &r2[0],
TransposeProxy<V, V, V, V>{internal_data(std::get<0>(proxy.in)),
internal_data(std::get<1>(proxy.in)),
internal_data(std::get<2>(proxy.in)),
internal_data(std::get<3>(proxy.in))});
}
template <typename T, typename V>
inline void transpose_impl(
TransposeTag<2, 4>, SimdArray<T, 4, V, 1> *Vc_RESTRICT r[],
const TransposeProxy<SimdArray<T, 2, V, 1>, SimdArray<T, 2, V, 1>,
SimdArray<T, 2, V, 1>, SimdArray<T, 2, V, 1>> &proxy)
{
auto &lo = *r[0];
auto &hi = *r[1];
internal_data0(internal_data0(lo)) = internal_data0(std::get<0>(proxy.in));
internal_data1(internal_data0(lo)) = internal_data0(std::get<1>(proxy.in));
internal_data0(internal_data1(lo)) = internal_data0(std::get<2>(proxy.in));
internal_data1(internal_data1(lo)) = internal_data0(std::get<3>(proxy.in));
internal_data0(internal_data0(hi)) = internal_data1(std::get<0>(proxy.in));
internal_data1(internal_data0(hi)) = internal_data1(std::get<1>(proxy.in));
internal_data0(internal_data1(hi)) = internal_data1(std::get<2>(proxy.in));
internal_data1(internal_data1(hi)) = internal_data1(std::get<3>(proxy.in));
}
template <typename T, typename V>
inline void transpose_impl(
TransposeTag<4, 4>, SimdArray<T, 1, V, 1> *Vc_RESTRICT r[],
const TransposeProxy<SimdArray<T, 1, V, 1>, SimdArray<T, 1, V, 1>,
SimdArray<T, 1, V, 1>, SimdArray<T, 1, V, 1>> &proxy)
{
V *Vc_RESTRICT r2[4] = {&internal_data(*r[0]), &internal_data(*r[1]),
&internal_data(*r[2]), &internal_data(*r[3])};
transpose_impl(TransposeTag<4, 4>(), &r2[0],
TransposeProxy<V, V, V, V>{internal_data(std::get<0>(proxy.in)),
internal_data(std::get<1>(proxy.in)),
internal_data(std::get<2>(proxy.in)),
internal_data(std::get<3>(proxy.in))});
}
template <typename T, size_t N, typename V>
inline void transpose_impl(
TransposeTag<4, 4>, SimdArray<T, N, V, 1> *Vc_RESTRICT r[],
const TransposeProxy<SimdArray<T, N, V, 1>, SimdArray<T, N, V, 1>,
SimdArray<T, N, V, 1>, SimdArray<T, N, V, 1>> &proxy)
{
SimdArray<T, N, V, 1> *Vc_RESTRICT r0[4 / 2] = {r[0], r[1]};
SimdArray<T, N, V, 1> *Vc_RESTRICT r1[4 / 2] = {r[2], r[3]};
using H = SimdArray<T, 2>;
transpose_impl(TransposeTag<2, 4>(), &r0[0],
TransposeProxy<H, H, H, H>{internal_data0(std::get<0>(proxy.in)),
internal_data0(std::get<1>(proxy.in)),
internal_data0(std::get<2>(proxy.in)),
internal_data0(std::get<3>(proxy.in))});
transpose_impl(TransposeTag<2, 4>(), &r1[0],
TransposeProxy<H, H, H, H>{internal_data1(std::get<0>(proxy.in)),
internal_data1(std::get<1>(proxy.in)),
internal_data1(std::get<2>(proxy.in)),
internal_data1(std::get<3>(proxy.in))});
}
}
namespace Detail
{
template <class T, size_t N, class V, size_t VSizeof>
struct InterleaveImpl<SimdArray<T, N, V, N>, N, VSizeof> {
template <class I, class... VV>
static Vc_INTRINSIC void interleave(T *const data, const I &i, const VV &... vv)
{
InterleaveImpl<V, N, VSizeof>::interleave(data, i, internal_data(vv)...);
}
template <class I, class... VV>
static Vc_INTRINSIC void deinterleave(T const *const data, const I &i, VV &... vv)
{
InterleaveImpl<V, N, VSizeof>::deinterleave(data, i, internal_data(vv)...);
}
};
}
}
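// std::numeric_limits is specialized for SimdArray by broadcasting the scalar
// limits of T; everything else is inherited from numeric_limits<T>.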
namespace std
{
template <typename T, size_t N, typename V, size_t VN>
struct numeric_limits<Vc::SimdArray<T, N, V, VN>> : public numeric_limits<T> {
private:
using R = Vc::SimdArray<T, N, V, VN>;
public:
static Vc_ALWAYS_INLINE Vc_CONST R max() noexcept { return numeric_limits<T>::max(); }
static Vc_ALWAYS_INLINE Vc_CONST R min() noexcept { return numeric_limits<T>::min(); }
static Vc_ALWAYS_INLINE Vc_CONST R lowest() noexcept
{
return numeric_limits<T>::lowest();
}
static Vc_ALWAYS_INLINE Vc_CONST R epsilon() noexcept
{
return numeric_limits<T>::epsilon();
}
static Vc_ALWAYS_INLINE Vc_CONST R round_error() noexcept
{
return numeric_limits<T>::round_error();
}
static Vc_ALWAYS_INLINE Vc_CONST R infinity() noexcept
{
return numeric_limits<T>::infinity();
}
static Vc_ALWAYS_INLINE Vc_CONST R quiet_NaN() noexcept
{
return numeric_limits<T>::quiet_NaN();
}
static Vc_ALWAYS_INLINE Vc_CONST R signaling_NaN() noexcept
{
return numeric_limits<T>::signaling_NaN();
}
static Vc_ALWAYS_INLINE Vc_CONST R denorm_min() noexcept
{
return numeric_limits<T>::denorm_min();
}
};
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename T, typename Abi, typename U>
enable_if<!std::is_same<T, U>::value, U> is_convertible_to_any_vector(Vector<U, Abi>);
template <typename T, typename Abi> T is_convertible_to_any_vector(Vector<T, Abi>);
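// FundamentalReturnType computes the element type of the result of a mixed
// Vector/scalar binary operator, mirroring the usual arithmetic conversions: the
// larger type wins, floating point wins over integral, and for equally sized
// integers the higher conversion rank wins, with unsignedness propagating to the
// result (see higher_conversion_rank below).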
template <typename T, typename U, bool = std::is_integral<T>::value,
bool = std::is_integral<U>::value>
struct FundamentalReturnType;
template <class T, class U>
using fundamental_return_t = typename FundamentalReturnType<T, U>::type;
template <typename T, typename U> struct FundamentalReturnType<T, U, false, false> {
using type = typename std::conditional<
std::is_arithmetic<U>::value,
typename std::conditional<(sizeof(T) < sizeof(U)), U, T>::type,
T>::type;
};
template <typename T, typename U> struct FundamentalReturnType<T, U, true, false> {
using type = typename std::conditional<
std::is_arithmetic<U>::value, U,
T>::type;
};
template <typename T, typename U> struct FundamentalReturnType<T, U, false, true> {
using type = T;
};
template <typename T> struct my_make_signed : public std::make_signed<T> {
};
template <> struct my_make_signed<bool> {
using type = bool;
};
template <typename TT, typename UU>
struct higher_conversion_rank {
template <typename A>
using fix_sign =
typename std::conditional<(std::is_unsigned<TT>::value ||
std::is_unsigned<UU>::value),
typename std::make_unsigned<A>::type, A>::type;
using T = typename my_make_signed<TT>::type;
using U = typename my_make_signed<UU>::type;
template <typename Test, typename Otherwise>
using c = typename std::conditional<std::is_same<T, Test>::value ||
std::is_same<U, Test>::value,
Test, Otherwise>::type;
using type = fix_sign<c<long long, c<long, c<int, c<short, c<signed char, void>>>>>>;
};
template <typename T, typename U> struct FundamentalReturnType<T, U, true, true> {
template <bool B, class Then, class E>
using c = typename std::conditional<B, Then, E>::type;
using type =
c<(sizeof(T) > sizeof(U)), T,
c<(sizeof(T) < sizeof(U)), U, typename higher_conversion_rank<T, U>::type>>;
};
template <class V, class T, class Tq, class = void> struct ReturnTypeImpl {
};
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, Vector<U, Abi>, Uq, void> {
using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
};
template <class T, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, int, Uq, void> {
using type = Vc::Vector<T, Abi>;
};
template <class T, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, uint, Uq, void> {
using type = Vc::Vector<
typename std::conditional<std::is_integral<T>::value, std::make_unsigned<T>,
std::enable_if<true, T>>::type::type,
Abi>;
};
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<
Vector<T, Abi>, U, Uq,
enable_if<!std::is_class<U>::value && !std::is_same<U, int>::value &&
!std::is_same<U, uint>::value &&
Traits::is_valid_vector_argument<fundamental_return_t<T, U>>::value,
void>> {
using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
};
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<
Vector<T, Abi>, U, Uq,
enable_if<std::is_class<U>::value && !Traits::is_simd_vector<U>::value &&
Traits::is_valid_vector_argument<decltype(
is_convertible_to_any_vector<T, Abi>(std::declval<Uq>()))>::value,
void>> {
using type =
Vc::Vector<fundamental_return_t<T, decltype(is_convertible_to_any_vector<T, Abi>(
std::declval<Uq>()))>,
Abi>;
};
template <class V, class Tq, class T = remove_cvref_t<Tq>>
using ReturnType = typename ReturnTypeImpl<V, T, Tq>::type;
template <class T> struct is_a_type : public std::true_type {
};
#ifdef Vc_ENABLE_FLOAT_BIT_OPERATORS
#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) true
#else
#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) \
Detail::is_a_type<decltype(std::declval<typename R::value_type>() \
op_ std::declval<typename R::value_type>())>::value
#endif
}
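// The operator macros below generate the non-member binary (arithmetic/bitwise),
// logical, and comparison operators for Vector<T, Abi> mixed with other vector or
// scalar operands, plus the corresponding compound assignments. Each overload is
// SFINAE-restricted to operand combinations that convert to the common return type
// R deduced by Detail::ReturnType and then forwards to the homogeneous
// Detail::operator implementation on R.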
#define Vc_GENERIC_OPERATOR(op_) \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
R> \
operator op_(Vector<T, Abi> x, U &&y) \
{ \
return Detail::operator op_(R(x), R(std::forward<U>(y))); \
} \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
!Traits::is_simd_vector<U>::value && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
R> \
operator op_(U &&x, Vector<T, Abi> y) \
{ \
return Detail::operator op_(R(std::forward<U>(x)), R(y)); \
} \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
Vector<T, Abi> &> \
operator op_##=(Vector<T, Abi> &x, U &&y) \
{ \
x = Detail::operator op_(R(x), R(std::forward<U>(y))); \
return x; \
}
#define Vc_LOGICAL_OPERATOR(op_) \
template <class T, class Abi> \
Vc_ALWAYS_INLINE typename Vector<T, Abi>::Mask operator op_(Vector<T, Abi> x, \
Vector<T, Abi> y) \
{ \
return !!x op_ !!y; \
} \
template <class T, class Abi, class U> \
Vc_ALWAYS_INLINE \
enable_if<std::is_convertible<Vector<T, Abi>, Vector<U, Abi>>::value && \
std::is_convertible<Vector<U, Abi>, Vector<T, Abi>>::value, \
typename Detail::ReturnType<Vector<T, Abi>, Vector<U, Abi>>::Mask> \
operator op_(Vector<T, Abi> x, Vector<U, Abi> y) \
{ \
return !!x op_ !!y; \
} \
template <class T, class Abi, class U> \
Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
typename Vector<T, Abi>::Mask> \
operator op_(Vector<T, Abi> x, U &&y) \
{ \
using M = typename Vector<T, Abi>::Mask; \
return !!x op_ M(!!std::forward<U>(y)); \
} \
template <class T, class Abi, class U> \
Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
typename Vector<T, Abi>::Mask> \
operator op_(U &&x, Vector<T, Abi> y) \
{ \
using M = typename Vector<T, Abi>::Mask; \
return M(!!std::forward<U>(x)) op_ !!y; \
}
#define Vc_COMPARE_OPERATOR(op_) \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
typename R::Mask> \
operator op_(Vector<T, Abi> x, U &&y) \
{ \
return Detail::operator op_(R(x), R(std::forward<U>(y))); \
} \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE \
enable_if<!Traits::is_simd_vector_internal<remove_cvref_t<U>>::value && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
typename R::Mask> \
operator op_(U &&x, Vector<T, Abi> y) \
{ \
return Detail::operator op_(R(std::forward<U>(x)), R(y)); \
}
Vc_ALL_LOGICAL (Vc_LOGICAL_OPERATOR);
Vc_ALL_BINARY (Vc_GENERIC_OPERATOR);
Vc_ALL_ARITHMETICS(Vc_GENERIC_OPERATOR);
Vc_ALL_COMPARES (Vc_COMPARE_OPERATOR);
#undef Vc_LOGICAL_OPERATOR
#undef Vc_GENERIC_OPERATOR
#undef Vc_COMPARE_OPERATOR
#undef Vc_INVALID_OPERATOR
}
#endif
#ifndef VC_COMMON_ALIGNEDBASE_H_
#define VC_COMMON_ALIGNEDBASE_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename T> constexpr T max(T a) { return a; }
template <typename T, typename... Ts> constexpr T max(T a, T b, Ts... rest)
{
return a > b ? max(a, rest...) : max(b, rest...);
}
}
namespace Common
{
template <std::size_t> Vc_INTRINSIC void *aligned_malloc(std::size_t);
Vc_ALWAYS_INLINE void free(void *);
}
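// AlignedBase is an empty, over-aligned base class that also provides matching
// aligned operator new/delete via Vc_FREE_STORE_OPERATORS_ALIGNED. VectorAlignedBase
// and MemoryAlignedBase below instantiate it with the maximum alignment required by
// any vector type and by aligned vector loads/stores (MemoryAlignment), respectively.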
template <std::size_t Alignment> struct alignas(Alignment) AlignedBase
{
Vc_FREE_STORE_OPERATORS_ALIGNED(Alignment);
};
using VectorAlignedBase = AlignedBase<
Detail::max(alignof(Vector<float>), alignof(Vector<double>), alignof(Vector<ullong>),
alignof(Vector<llong>), alignof(Vector<ulong>), alignof(Vector<long>),
alignof(Vector<uint>), alignof(Vector<int>), alignof(Vector<ushort>),
alignof(Vector<short>), alignof(Vector<uchar>), alignof(Vector<schar>))>;
template <typename V> using VectorAlignedBaseT = AlignedBase<alignof(V)>;
using MemoryAlignedBase = AlignedBase<
Detail::max(Vector<float>::MemoryAlignment, Vector<double>::MemoryAlignment,
Vector<ullong>::MemoryAlignment, Vector<llong>::MemoryAlignment,
Vector<ulong>::MemoryAlignment, Vector<long>::MemoryAlignment,
Vector<uint>::MemoryAlignment, Vector<int>::MemoryAlignment,
Vector<ushort>::MemoryAlignment, Vector<short>::MemoryAlignment,
Vector<uchar>::MemoryAlignment, Vector<schar>::MemoryAlignment)>;
template <typename V> using MemoryAlignedBaseT = AlignedBase<V::MemoryAlignment>;
}
#endif
namespace Vc_VERSIONED_NAMESPACE {
constexpr std::size_t VectorAlignment = alignof(VectorAlignedBase);
constexpr std::size_t MemoryAlignment = alignof(MemoryAlignedBase);
}
#define Vc_VECTOR_DECLARED_ 1
#ifndef VC_SCALAR_DEINTERLEAVE_H_
#define VC_SCALAR_DEINTERLEAVE_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE void deinterleave(Scalar::Vector<T> &a, Scalar::Vector<T> &b,
const M *mem, A)
{
a = mem[0];
b = mem[1];
}
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *, VectorAbi::Scalar) {}
Vc_ALWAYS_INLINE void prefetchForModify(const void *, VectorAbi::Scalar) {}
Vc_ALWAYS_INLINE void prefetchClose(const void *, VectorAbi::Scalar) {}
Vc_ALWAYS_INLINE void prefetchMid(const void *, VectorAbi::Scalar) {}
Vc_ALWAYS_INLINE void prefetchFar(const void *, VectorAbi::Scalar) {}
}
}
#endif
#ifndef VC_SCALAR_MATH_H_
#define VC_SCALAR_MATH_H_
#include <cstdlib>
namespace Vc_VERSIONED_NAMESPACE
{
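// Scalar copysign: the sign bit of `sign` is combined with the magnitude bits of
// `mag` by reinterpreting the IEEE-754 bit pattern through a union.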
Vc_INTRINSIC Scalar::float_v copysign(Scalar::float_v mag, Scalar::float_v sign)
{
union {
float f;
unsigned int i;
} value, s;
value.f = mag.data();
s.f = sign.data();
value.i = (s.i & 0x80000000u) | (value.i & 0x7fffffffu);
return Scalar::float_v{value.f};
}
Vc_INTRINSIC Vc_CONST Scalar::double_v copysign(Scalar::double_v mag,
Scalar::double_v sign)
{
union {
double f;
unsigned long long i;
} value, s;
value.f = mag.data();
s.f = sign.data();
value.i = (s.i & 0x8000000000000000ull) | (value.i & 0x7fffffffffffffffull);
return Scalar::double_v{value.f};
}
#define Vc_MINMAX(V) \
static Vc_ALWAYS_INLINE Scalar::V min(const Scalar::V &x, const Scalar::V &y) \
{ \
return Scalar::V(std::min(x.data(), y.data())); \
} \
static Vc_ALWAYS_INLINE Scalar::V max(const Scalar::V &x, const Scalar::V &y) \
{ \
return Scalar::V(std::max(x.data(), y.data())); \
}
Vc_ALL_VECTOR_TYPES(Vc_MINMAX);
#undef Vc_MINMAX
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> sqrt (const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::sqrt(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> rsqrt(const Scalar::Vector<T> &x)
{
    const typename Vector<T, VectorAbi::Scalar>::EntryType one = 1;
    return Scalar::Vector<T>(one / std::sqrt(x.data()));
}
template <typename T,
typename = enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value ||
std::is_same<T, int>::value>>
Vc_ALWAYS_INLINE Vc_PURE Scalar::Vector<T> abs(Scalar::Vector<T> x)
{
return std::abs(x.data());
}
Vc_ALWAYS_INLINE Vc_PURE Scalar::Vector<short> abs(Scalar::Vector<short> x)
{
return std::abs(static_cast<int>(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE void sincos(const Scalar::Vector<T> &x, Scalar::Vector<T> *sin, Scalar::Vector<T> *cos)
{
#if defined(_WIN32) || defined(__APPLE__)
sin->data() = std::sin(x.data());
cos->data() = std::cos(x.data());
#elif Vc_HAS_BUILTIN(__builtin_sincosf) || defined Vc_GCC
__builtin_sincosf(x.data(), &sin->data(), &cos->data());
#else
sincosf(x.data(), &sin->data(), &cos->data());
#endif
}
template<> Vc_ALWAYS_INLINE void sincos(const Scalar::Vector<double> &x, Scalar::Vector<double> *sin, Scalar::Vector<double> *cos)
{
#if defined(_WIN32) || defined(__APPLE__)
sin->data() = std::sin(x.data());
cos->data() = std::cos(x.data());
#elif Vc_HAS_BUILTIN(__builtin_sincos) || defined Vc_GCC
__builtin_sincos(x.data(), &sin->data(), &cos->data());
#else
::sincos(x.data(), &sin->data(), &cos->data());
#endif
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> sin (const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::sin(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> asin (const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::asin(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> cos (const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::cos(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log (const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::log(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log10(const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::log10(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> log2(const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::log2(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> exp (const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::exp(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> atan (const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::atan( x.data() ));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> atan2(const Scalar::Vector<T> &x, const Scalar::Vector<T> &y)
{
return Scalar::Vector<T>(std::atan2( x.data(), y.data() ));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> trunc(const Scalar::Vector<T> &x)
{
return std::trunc(x.data());
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> floor(const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::floor(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> ceil(const Scalar::Vector<T> &x)
{
return Scalar::Vector<T>(std::ceil(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> round(const Scalar::Vector<T> &x)
{
return x;
}
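// round() for float and double implements round-half-to-even: floor(x + 0.5) would
// round exact halves away from zero, so _realIsEvenHalf detects the "even integer
// plus one half" case and the result is corrected downwards by 1 there.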
namespace
{
template<typename T> bool _realIsEvenHalf(T x) {
const T two = 2;
const T half = 0.5;
const T f = std::floor(x * half) * two;
return (x - f) == half;
}
}
template<> Vc_ALWAYS_INLINE Scalar::Vector<float> round(const Scalar::Vector<float> &x)
{
return Scalar::float_v(std::floor(x.data() + 0.5f) - (_realIsEvenHalf(x.data()) ? 1.f : 0.f));
}
template<> Vc_ALWAYS_INLINE Scalar::Vector<double> round(const Scalar::Vector<double> &x)
{
return Scalar::double_v(std::floor(x.data() + 0.5 ) - (_realIsEvenHalf(x.data()) ? 1. : 0. ));
}
template<typename T> static Vc_ALWAYS_INLINE Scalar::Vector<T> reciprocal(const Scalar::Vector<T> &x)
{
    const typename Vector<T, VectorAbi::Scalar>::EntryType one = 1;
    return Scalar::Vector<T>(one / x.data());
}
#ifdef isfinite
#undef isfinite
#endif
#ifdef isnan
#undef isnan
#endif
template<typename T> static Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Scalar>::Mask isfinite(const Scalar::Vector<T> &x)
{
return typename Vector<T, VectorAbi::Scalar>::Mask(
#ifdef _MSC_VER
!!_finite(x.data())
#elif defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1500
::isfinite(x.data())
#else
std::isfinite(x.data())
#endif
);
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Scalar>::Mask isinf(const Scalar::Vector<T> &x)
{
return typename Vector<T, VectorAbi::Scalar>::Mask(std::isinf(x.data()));
}
template<typename T> static Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Scalar>::Mask isnan(const Scalar::Vector<T> &x)
{
return typename Vector<T, VectorAbi::Scalar>::Mask(
#ifdef _MSC_VER
!!_isnan(x.data())
#elif defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1500
::isnan(x.data())
#else
std::isnan(x.data())
#endif
);
}
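// Scalar frexp/ldexp forward to the std:: functions; the exponent is exchanged
// through the single element of a SimdArray<int, 1> wrapping a scalar int vector.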
Vc_ALWAYS_INLINE Scalar::Vector<float> frexp(Scalar::Vector<float> x, SimdArray<int, 1, Scalar::Vector<int>, 1> *e) {
return Scalar::float_v(std::frexp(x.data(), &internal_data(*e).data()));
}
Vc_ALWAYS_INLINE Scalar::Vector<double> frexp(Scalar::Vector<double> x, SimdArray<int, 1, Scalar::Vector<int>, 1> *e) {
return Scalar::double_v(std::frexp(x.data(), &internal_data(*e).data()));
}
Vc_ALWAYS_INLINE Scalar::Vector<float> ldexp(Scalar::Vector<float> x, const SimdArray<int, 1, Scalar::Vector<int>, 1> &e) {
return Scalar::float_v(std::ldexp(x.data(), internal_data(e).data()));
}
Vc_ALWAYS_INLINE Scalar::Vector<double> ldexp(Scalar::Vector<double> x, const SimdArray<int, 1, Scalar::Vector<int>, 1> &e) {
return Scalar::double_v(std::ldexp(x.data(), internal_data(e).data()));
}
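// Scalar fma: for integral element types this is the exact a * b + c; for floating
// point it forwards to std::fma to keep the single-rounding semantics.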
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Scalar> fma(Vector<T, VectorAbi::Scalar> a,
Vector<T, VectorAbi::Scalar> b,
Vector<T, VectorAbi::Scalar> c)
{
if (std::is_integral<T>::value) {
return a * b + c;
} else {
return std::fma(a.data(), b.data(), c.data());
}
}
}
#endif
#ifndef Vc_SCALAR_SIMD_CAST_CALLER_TCC_
#define Vc_SCALAR_SIMD_CAST_CALLER_TCC_
namespace Vc_VERSIONED_NAMESPACE
{
#if Vc_IS_VERSION_1
template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Scalar>::Mask(
U &&rhs, Common::enable_if_mask_converts_explicitly<T, U>)
: Mask(simd_cast<Mask>(std::forward<U>(rhs)))
{
}
#endif
}
#endif
#if defined(Vc_IMPL_SSE)
#ifndef VC_SSE_DEINTERLEAVE_H_
#define VC_SSE_DEINTERLEAVE_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename A>
inline void deinterleave(SSE::float_v &, SSE::float_v &, const float *, A);
template <typename A>
inline void deinterleave(SSE::float_v &, SSE::float_v &, const short *, A);
template <typename A>
inline void deinterleave(SSE::float_v &, SSE::float_v &, const ushort *, A);
template <typename A>
inline void deinterleave(SSE::double_v &, SSE::double_v &, const double *, A);
template <typename A>
inline void deinterleave(SSE::int_v &, SSE::int_v &, const int *, A);
template <typename A>
inline void deinterleave(SSE::int_v &, SSE::int_v &, const short *, A);
template <typename A>
inline void deinterleave(SSE::uint_v &, SSE::uint_v &, const uint *, A);
template <typename A>
inline void deinterleave(SSE::uint_v &, SSE::uint_v &, const ushort *, A);
template <typename A>
inline void deinterleave(SSE::short_v &, SSE::short_v &, const short *, A);
template <typename A>
inline void deinterleave(SSE::ushort_v &, SSE::ushort_v &, const ushort *, A);
Vc_ALWAYS_INLINE_L void prefetchForOneRead(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L void prefetchForModify(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L void prefetchClose(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L void prefetchMid(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L void prefetchFar(const void *addr, VectorAbi::Sse) Vc_ALWAYS_INLINE_R;
}
}
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
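// Two-vector deinterleave: after two unpacklo/unpackhi passes, a holds the
// even-indexed and b the odd-indexed elements of the concatenated a,b element
// stream, i.e. interleaved pairs are separated again. The overloads taking a
// short/ushort source vector split each 32-bit lane into its low and high 16-bit
// half and convert them into the two results.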
inline void deinterleave(Vector<float> &a, Vector<float> &b)
{
const __m128 tmp0 = _mm_unpacklo_ps(a.data(), b.data());
const __m128 tmp1 = _mm_unpackhi_ps(a.data(), b.data());
a.data() = _mm_unpacklo_ps(tmp0, tmp1);
b.data() = _mm_unpackhi_ps(tmp0, tmp1);
}
inline void deinterleave(Vector<float> &a, Vector<float> &b, Vector<short>::AsArg tmp)
{
a.data() = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16));
b.data() = _mm_cvtepi32_ps(_mm_srai_epi32(tmp.data(), 16));
}
inline void deinterleave(Vector<float> &a, Vector<float> &b, Vector<unsigned short>::AsArg tmp)
{
a.data() = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16));
b.data() = _mm_cvtepi32_ps(_mm_srli_epi32(tmp.data(), 16));
}
inline void deinterleave(Vector<double> &a, Vector<double> &b)
{
__m128d tmp = _mm_unpacklo_pd(a.data(), b.data());
b.data() = _mm_unpackhi_pd(a.data(), b.data());
a.data() = tmp;
}
inline void deinterleave(Vector<int> &a, Vector<int> &b)
{
const __m128i tmp0 = _mm_unpacklo_epi32(a.data(), b.data());
const __m128i tmp1 = _mm_unpackhi_epi32(a.data(), b.data());
a.data() = _mm_unpacklo_epi32(tmp0, tmp1);
b.data() = _mm_unpackhi_epi32(tmp0, tmp1);
}
inline void deinterleave(Vector<unsigned int> &a, Vector<unsigned int> &b)
{
const __m128i tmp0 = _mm_unpacklo_epi32(a.data(), b.data());
const __m128i tmp1 = _mm_unpackhi_epi32(a.data(), b.data());
a.data() = _mm_unpacklo_epi32(tmp0, tmp1);
b.data() = _mm_unpackhi_epi32(tmp0, tmp1);
}
inline void deinterleave(Vector<short> &a, Vector<short> &b)
{
__m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data());
__m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data());
__m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
__m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
a.data() = _mm_unpacklo_epi16(tmp2, tmp3);
b.data() = _mm_unpackhi_epi16(tmp2, tmp3);
}
inline void deinterleave(Vector<unsigned short> &a, Vector<unsigned short> &b)
{
__m128i tmp0 = _mm_unpacklo_epi16(a.data(), b.data());
__m128i tmp1 = _mm_unpackhi_epi16(a.data(), b.data());
__m128i tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
__m128i tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
a.data() = _mm_unpacklo_epi16(tmp2, tmp3);
b.data() = _mm_unpackhi_epi16(tmp2, tmp3);
}
inline void deinterleave(Vector<int> &a, Vector<int> &b, Vector<short>::AsArg tmp)
{
a.data() = _mm_srai_epi32(_mm_slli_epi32(tmp.data(), 16), 16);
b.data() = _mm_srai_epi32(tmp.data(), 16);
}
inline void deinterleave(Vector<unsigned int> &a, Vector<unsigned int> &b, Vector<unsigned short>::AsArg tmp)
{
a.data() = _mm_srli_epi32(_mm_slli_epi32(tmp.data(), 16), 16);
b.data() = _mm_srli_epi32(tmp.data(), 16);
}
}
}
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
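// These Detail overloads load two vectors' worth of interleaved memory and then
// deinterleave in registers; the overloads reading short/ushort memory into wider
// element types load once and convert while splitting the 16-bit halves.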
template<typename A> inline void deinterleave(
SSE::float_v &a, SSE::float_v &b, const float *m, A align)
{
a.load(m, align);
b.load(m + SSE::float_v::Size, align);
Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
SSE::float_v &a, SSE::float_v &b, const short *m, A align)
{
SSE::short_v tmp(m, align);
Vc::SSE::deinterleave(a, b, tmp);
}
template<typename A> inline void deinterleave(
SSE::float_v &a, SSE::float_v &b, const unsigned short *m, A align)
{
SSE::ushort_v tmp(m, align);
Vc::SSE::deinterleave(a, b, tmp);
}
template<typename A> inline void deinterleave(
SSE::double_v &a, SSE::double_v &b, const double *m, A align)
{
a.load(m, align);
b.load(m + SSE::double_v::Size, align);
Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
SSE::int_v &a, SSE::int_v &b, const int *m, A align)
{
a.load(m, align);
b.load(m + SSE::int_v::Size, align);
Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
SSE::int_v &a, SSE::int_v &b, const short *m, A align)
{
SSE::short_v tmp(m, align);
Vc::SSE::deinterleave(a, b, tmp);
}
template<typename A> inline void deinterleave(
SSE::uint_v &a, SSE::uint_v &b, const unsigned int *m, A align)
{
a.load(m, align);
b.load(m + SSE::uint_v::Size, align);
Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
SSE::uint_v &a, SSE::uint_v &b, const unsigned short *m, A align)
{
SSE::ushort_v tmp(m, align);
Vc::SSE::deinterleave(a, b, tmp);
}
template<typename A> inline void deinterleave(
SSE::short_v &a, SSE::short_v &b, const short *m, A align)
{
a.load(m, align);
b.load(m + SSE::short_v::Size, align);
Vc::SSE::deinterleave(a, b);
}
template<typename A> inline void deinterleave(
SSE::ushort_v &a, SSE::ushort_v &b, const unsigned short *m, A align)
{
a.load(m, align);
b.load(m + SSE::ushort_v::Size, align);
Vc::SSE::deinterleave(a, b);
}
}
}
#ifndef VC_SSE_PREFETCHES_TCC_
#define VC_SSE_PREFETCHES_TCC_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
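// The prefetch helpers map directly onto _mm_prefetch with the NTA/T0/T1/T2
// locality hints; prefetchForModify uses the 3dNow! prefetchw instruction where
// available and otherwise falls back to a plain T0 prefetch.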
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Sse)
{
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_NTA);
}
Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Sse)
{
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
}
Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Sse)
{
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T1);
}
Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Sse)
{
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T2);
}
Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Sse)
{
#ifdef __3dNOW__
_m_prefetchw(const_cast<void *>(addr));
#else
_mm_prefetch(static_cast<char *>(const_cast<void *>(addr)), _MM_HINT_T0);
#endif
}
}
}
#endif
#endif
#ifndef VC_SSE_MATH_H_
#define VC_SSE_MATH_H_
#ifndef VC_SSE_CONST_H_
#define VC_SSE_CONST_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
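// SSE::Const<T> provides access to the precomputed trigonometric and logarithm
// constant tables (c_trig / c_log). Stride is the number of T elements per 16-byte
// SSE register, so each named constant occupies one full register in the table.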
template<typename T> struct Const
{
typedef Vector<T> V;
typedef Mask<T> M;
enum Constants { Stride = 16 / sizeof(T) };
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return load(&c_trig<T>::data[0 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return load(&c_trig<T>::data[1 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return load(&c_trig<T>::data[2 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return load(&c_trig<T>::data[3 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return load(&c_trig<T>::data[4 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V _16() { return load(&c_trig<T>::data[5 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return load(&c_trig<T>::data[(12 + i) * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return load(&c_trig<T>::data[(17 + i) * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return load(&c_trig<T>::data[22 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return load(&c_trig<T>::data[23 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return load(&c_trig<T>::data[24 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return load(&c_trig<T>::data[8 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return load(&c_trig<T>::data[9 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return load(&c_trig<T>::data[10 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return load(&c_trig<T>::data[11 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return load(&c_trig<T>::data[(28 + i) * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return load(&c_trig<T>::data[(33 + i) * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return load(&c_trig<T>::data[(37 + i) * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return load(&c_trig<T>::data[(43 + i) * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return load(&c_trig<T>::data[25 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return load(&c_trig<T>::data[26 * Stride]); }
static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(load(c_log<T>::d(1)).data()); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return load(c_log<T>::d(18)); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return load(c_log<T>::d(15)); }
static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return load(c_log<T>::d(2 + i)); }
static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return load(c_log<T>::d(8 + i)); }
static Vc_ALWAYS_INLINE Vc_CONST V min() { return load(c_log<T>::d(14)); }
static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return load(c_log<T>::d(17)); }
static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return load(c_log<T>::d(16)); }
static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return load(c_log<T>::d(13)); }
static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return load(c_log<T>::d(19)); }
static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return load(c_log<T>::d(20)); }
static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R;
static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R;
private:
static Vc_ALWAYS_INLINE_L Vc_CONST_L V load(const T *mem) Vc_ALWAYS_INLINE_R Vc_CONST_R;
};
template<typename T> Vc_ALWAYS_INLINE Vc_CONST Vector<T> Const<T>::load(const T *mem) { return V(mem); }
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask()
{
return Vector<float>(reinterpret_cast<const float *>(&c_general::highMaskFloat));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask()
{
return Vector<double>(
reinterpret_cast<const double *>(&c_general::highMaskDouble));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask(int bits)
{
return _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask(int bits)
{
return _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits));
}
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
Vc_INTRINSIC Vc_CONST SSE::float_v copysign(SSE::float_v mag, SSE::float_v sign)
{
return _mm_or_ps(_mm_and_ps(sign.data(), SSE::_mm_setsignmask_ps()),
_mm_and_ps(mag.data(), SSE::_mm_setabsmask_ps()));
}
Vc_INTRINSIC Vc_CONST SSE::double_v copysign(SSE::double_v mag, SSE::double_v sign)
{
return _mm_or_pd(_mm_and_pd(sign.data(), SSE::_mm_setsignmask_pd()),
_mm_and_pd(mag.data(), SSE::_mm_setabsmask_pd()));
}
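// SSE frexp: the biased exponent is extracted with integer shifts, the mantissa is
// renormalised into [0.5, 1) by forcing the exponent field (OR with the exponent
// mask, AND with frexpMask); zero, NaN and infinity inputs are returned unchanged,
// and the exponent is forced to zero for zero inputs.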
inline SSE::double_v frexp(const SSE::double_v &v,
SimdArray<int, 2, Scalar::int_v, 1> *e)
{
const __m128i exponentBits = SSE::Const<double>::exponentMask().dataI();
const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits);
SSE::int_v exponent =
_mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe));
const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits));
SSE::double_v ret = _mm_and_pd(
exponentMaximized,
_mm_load_pd(reinterpret_cast<const double *>(&SSE::c_general::frexpMask[0])));
SSE::double_m zeroMask = v == SSE::double_v::Zero();
ret(isnan(v) || !isfinite(v) || zeroMask) = v;
exponent.setZero(zeroMask.data());
(*e)[0] = exponent[0];
(*e)[1] = exponent[2];
return ret;
}
inline SSE::float_v frexp(const SSE::float_v &v, SimdArray<int, 4, SSE::int_v, 4> *e)
{
const __m128i exponentBits = SSE::Const<float>::exponentMask().dataI();
const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits);
internal_data(*e) =
_mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e));
const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits));
SSE::float_v ret =
_mm_and_ps(exponentMaximized, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu)));
ret(isnan(v) || !isfinite(v) || v == SSE::float_v::Zero()) = v;
e->setZero(v == SSE::float_v::Zero());
return ret;
}
inline SSE::double_v ldexp(SSE::double_v::AsArg v,
const SimdArray<int, 2, Scalar::int_v, 1> &_e)
{
SSE::int_v e = _mm_setr_epi32(_e[0], 0, _e[1], 0);
e.setZero((v == SSE::double_v::Zero()).dataI());
const __m128i exponentBits = _mm_slli_epi64(e.data(), 52);
return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits));
}
inline SSE::float_v ldexp(SSE::float_v::AsArg v,
const SimdArray<int, 4, SSE::int_v, 4> &_e)
{
SSE::int_v e = internal_data(_e);
e.setZero(simd_cast<SSE::int_m>(v == SSE::float_v::Zero()));
return reinterpret_components_cast<SSE::float_v>(
reinterpret_components_cast<SSE::int_v>(v) + (e << 23));
}
#ifdef Vc_IMPL_SSE4_1
inline SSE::double_v trunc(SSE::double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); }
inline SSE::float_v trunc(SSE::float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); }
inline SSE::double_v floor(SSE::double_v::AsArg v) { return _mm_floor_pd(v.data()); }
inline SSE::float_v floor(SSE::float_v::AsArg v) { return _mm_floor_ps(v.data()); }
inline SSE::double_v ceil(SSE::double_v::AsArg v) { return _mm_ceil_pd(v.data()); }
inline SSE::float_v ceil(SSE::float_v::AsArg v) { return _mm_ceil_ps(v.data()); }
#else
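// Fallback for targets without the SSE4.1 round instructions: float trunc converts
// to int32 and back, keeping the original value where |x| >= 2^23 (no fractional
// bits remain there, and the int32 conversion could overflow); double trunc uses
// the add-and-subtract-2^52 trick on |x|, corrects for a possible round-up, and
// then restores the sign bit.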
inline SSE::Vector<float> trunc(SSE::Vector<float> x)
{
const auto truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x.data()));
const auto no_fractional_values = _mm_castsi128_ps(_mm_cmplt_epi32(
_mm_and_si128(_mm_castps_si128(x.data()), _mm_set1_epi32(0x7f800000u)),
_mm_set1_epi32(0x4b000000)));
return _mm_or_ps(_mm_andnot_ps(no_fractional_values, x.data()),
_mm_and_ps(no_fractional_values, truncated));
}
inline SSE::Vector<double> trunc(SSE::Vector<double> x)
{
const auto abs_x = Vc::abs(x).data();
const auto min_no_fractional_bits =
_mm_castsi128_pd(_mm_set1_epi64x(0x4330000000000000ull));
__m128d truncated =
_mm_sub_pd(_mm_add_pd(abs_x, min_no_fractional_bits), min_no_fractional_bits);
truncated = _mm_sub_pd(truncated,
_mm_and_pd(_mm_cmplt_pd(abs_x, truncated), _mm_set1_pd(1.)));
return _mm_or_pd(
_mm_and_pd(_mm_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ull)), x.data()),
truncated);
}
template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> x)
{
auto y = trunc(x);
y(!(y == x) && x < 0) -= 1;
return y;
}
template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> x)
{
auto y = trunc(x);
y(!(y == x || x < 0)) += 1;
return y;
}
#endif
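// SSE fma: VectorHelper<T>::fma updates its first argument in place (a.data() is a
// reference), so returning a yields a * b + c, fused where the hardware supports it.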
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> fma(Vector<T, VectorAbi::Sse> a,
Vector<T, VectorAbi::Sse> b,
Vector<T, VectorAbi::Sse> c)
{
SSE::VectorHelper<T>::fma(a.data(), b.data(), c.data());
return a;
}
}
#endif
#ifndef Vc_SSE_SIMD_CAST_CALLER_TCC_
#define Vc_SSE_SIMD_CAST_CALLER_TCC_
namespace Vc_VERSIONED_NAMESPACE
{
#if Vc_IS_VERSION_1
template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Sse>::Mask(U &&rhs, Common::enable_if_mask_converts_explicitly<T, U>)
: Mask(Vc::simd_cast<Mask>(std::forward<U>(rhs)))
{
}
#endif
}
#endif
#endif
#if defined(Vc_IMPL_AVX)
#ifndef VC_AVX_HELPERIMPL_H_
#define VC_AVX_HELPERIMPL_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const float *, A);
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const ushort *, A);
template <typename A>
inline void deinterleave(AVX2::double_v &, AVX2::double_v &, const double *, A);
template <typename A>
inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const int *, A);
template <typename A>
inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const uint *, A);
template <typename A>
inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const ushort *, A);
template <typename A>
inline void deinterleave(AVX2::short_v &, AVX2::short_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::ushort_v &, AVX2::ushort_v &, const ushort *, A);
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
const M *Vc_RESTRICT memory,
A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
AVX2::Vector<T> &Vc_RESTRICT d,
const M *Vc_RESTRICT memory,
A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
AVX2::Vector<T> &Vc_RESTRICT d,
AVX2::Vector<T> &Vc_RESTRICT e,
const M *Vc_RESTRICT memory,
A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(
AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(
AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
AVX2::Vector<T> &Vc_RESTRICT g, AVX2::Vector<T> &Vc_RESTRICT h,
const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Avx)
{
prefetchForOneRead(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Avx)
{
prefetchForModify(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Avx)
{
prefetchClose(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Avx)
{
prefetchMid(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Avx)
{
prefetchFar(addr, VectorAbi::Sse());
}
}
}
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX2
{
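// Three-vector deinterleave for interleaved triples loaded into a, b, c: 128-bit
// lane shuffles and blends regroup the elements so that afterwards a holds the
// first, b the second and c the third member of each triple.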
inline void deinterleave(double_v &Vc_RESTRICT a, double_v &Vc_RESTRICT b, double_v &Vc_RESTRICT c)
{
const m256d tmp0 = Mem::shuffle128<X0, Y1>(a.data(), b.data());
const m256d tmp1 = Mem::shuffle128<X1, Y0>(a.data(), c.data());
const m256d tmp2 = Mem::shuffle128<X0, Y1>(b.data(), c.data());
a.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp0, tmp1);
b.data() = Mem::shuffle<X1, Y0, X3, Y2>(tmp0, tmp2);
c.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp1, tmp2);
}
inline void deinterleave(float_v &Vc_RESTRICT a, float_v &Vc_RESTRICT b, float_v &Vc_RESTRICT c)
{
const m256 ac0 = Mem::shuffle128<X0, Y0>(a.data(), c.data());
const m256 ac1 = Mem::shuffle128<X1, Y1>(a.data(), c.data());
m256 tmp0 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>( ac0, b.data());
tmp0 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(tmp0, ac1);
m256 tmp1 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>( ac0, b.data());
tmp1 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(tmp1, ac1);
m256 tmp2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>( ac0, b.data());
tmp2 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>(tmp2, ac1);
a.data() = Mem::permute<X0, X3, X2, X1>(tmp0);
b.data() = Mem::permute<X1, X0, X3, X2>(tmp1);
c.data() = Mem::permute<X2, X1, X0, X3>(tmp2);
}
inline void deinterleave(int_v &Vc_RESTRICT a, int_v &Vc_RESTRICT b, int_v &Vc_RESTRICT c)
{
deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
reinterpret_cast<float_v &>(c));
}
inline void deinterleave(uint_v &Vc_RESTRICT a, uint_v &Vc_RESTRICT b, uint_v &Vc_RESTRICT c)
{
deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
reinterpret_cast<float_v &>(c));
}
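// Note: the three-vector deinterleave for short below is a no-op; it appears to be
// an unimplemented placeholder in this code path.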
inline void deinterleave(Vector<short> &Vc_RESTRICT , Vector<short> &Vc_RESTRICT ,
Vector<short> &Vc_RESTRICT )
{
return;
}
inline void deinterleave(Vector<unsigned short> &Vc_RESTRICT a, Vector<unsigned short> &Vc_RESTRICT b,
Vector<unsigned short> &Vc_RESTRICT c)
{
deinterleave(reinterpret_cast<Vector<short> &>(a), reinterpret_cast<Vector<short> &>(b),
reinterpret_cast<Vector<short> &>(c));
}
inline void deinterleave(Vector<float> &a, Vector<float> &b)
{
const m256 tmp0 = Reg::permute128<Y0, X0>(a.data(), b.data());
const m256 tmp1 = Reg::permute128<Y1, X1>(a.data(), b.data());
const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
a.data() = _mm256_unpacklo_ps(tmp2, tmp3);
b.data() = _mm256_unpackhi_ps(tmp2, tmp3);
}
inline void deinterleave(Vector<short> &a,
Vector<short> &b)
{
auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
auto v2 = AVX::unpacklo_epi16(v0, v1);
auto v3 = AVX::unpackhi_epi16(v0, v1);
v0 = AVX::unpacklo_epi16(v2, v3);
v1 = AVX::unpackhi_epi16(v2, v3);
a.data() = AVX::unpacklo_epi16(v0, v1);
b.data() = AVX::unpackhi_epi16(v0, v1);
}
inline void deinterleave(Vector<ushort> &a, Vector<ushort> &b)
{
auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
auto v2 = AVX::unpacklo_epi16(v0, v1);
auto v3 = AVX::unpackhi_epi16(v0, v1);
v0 = AVX::unpacklo_epi16(v2, v3);
v1 = AVX::unpackhi_epi16(v2, v3);
a.data() = AVX::unpacklo_epi16(v0, v1);
b.data() = AVX::unpackhi_epi16(v0, v1);
}
}
namespace Detail
{
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const float *m, Flags align)
{
a.load(m, align);
b.load(m + AVX2::float_v::Size, align);
Vc::AVX2::deinterleave(a, b);
}
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const short *m, Flags f)
{
using namespace Vc::AVX2;
const auto tmp = Detail::load32(m, f);
a.data() =
_mm256_cvtepi32_ps(concat(_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
_mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)));
b.data() = _mm256_cvtepi32_ps(
concat(_mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)));
}
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const unsigned short *m, Flags f)
{
using namespace Vc::AVX2;
const auto tmp = Detail::load32(m, f);
a.data() = _mm256_cvtepi32_ps(
concat(_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa),
_mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa)));
b.data() = _mm256_cvtepi32_ps(
concat(_mm_srli_epi32(lo128(tmp), 16), _mm_srli_epi32(hi128(tmp), 16)));
}
template <typename Flags>
inline void deinterleave(AVX2::double_v &a, AVX2::double_v &b, const double *m, Flags align)
{
using namespace Vc::AVX2;
a.load(m, align);
b.load(m + AVX2::double_v::Size, align);
m256d tmp0 = Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data());
m256d tmp1 = Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data());
a.data() = _mm256_unpacklo_pd(tmp0, tmp1);
b.data() = _mm256_unpackhi_pd(tmp0, tmp1);
}
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const int *m, Flags align)
{
using namespace AVX;
a.load(m, align);
b.load(m + AVX2::int_v::Size, align);
const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3));
b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3));
}
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const short *m, Flags f)
{
using namespace Vc::AVX;
const AVX2::short_v tmp0(m, f);
const m256i tmp = tmp0.data();
a.data() = concat(
_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
_mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
b.data() = concat(
_mm_srai_epi32(lo128(tmp), 16),
_mm_srai_epi32(hi128(tmp), 16));
}
template <typename Flags>
inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned int *m, Flags align)
{
using namespace AVX;
a.load(m, align);
b.load(m + AVX2::uint_v::Size, align);
const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1);
const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1);
a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3));
b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3));
}
template <typename Flags>
inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned short *m, Flags f)
{
using namespace Vc::AVX;
const AVX2::ushort_v tmp0(m, f);
const m256i tmp = tmp0.data();
a.data() = concat(
_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
_mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
b.data() = concat(
_mm_srai_epi32(lo128(tmp), 16),
_mm_srai_epi32(hi128(tmp), 16));
}
template <typename Flags>
inline void deinterleave(AVX2::short_v &a, AVX2::short_v &b, const short *m, Flags align)
{
a.load(m, align);
b.load(m + AVX2::short_v::Size, align);
Vc::AVX2::deinterleave(a, b);
}
template <typename Flags>
inline void deinterleave(AVX2::ushort_v &a, AVX2::ushort_v &b, const unsigned short *m, Flags align)
{
a.load(m, align);
b.load(m + AVX2::ushort_v::Size, align);
Vc::AVX2::deinterleave(a, b);
}
template <typename T, typename M, typename Flags>
Vc_ALWAYS_INLINE void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
const M *Vc_RESTRICT memory, Flags align)
{
using V = AVX2::Vector<T>;
a.load(&memory[0 * V::Size], align);
b.load(&memory[1 * V::Size], align);
c.load(&memory[2 * V::Size], align);
Vc::AVX2::deinterleave(a, b, c);
}
}
}
#endif
#ifndef VC_AVX_MATH_H_
#define VC_AVX_MATH_H_
namespace Vc_VERSIONED_NAMESPACE
{
#ifdef Vc_IMPL_AVX2
Vc_ALWAYS_INLINE AVX2::int_v min(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_min_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v min(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v min(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_min_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::int_v max(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_max_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v max(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v max(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_max_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); }
#endif
Vc_ALWAYS_INLINE AVX2::float_v min(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_min_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::float_v max(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_max_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v max(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_max_pd(x.data(), y.data()); }
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> sqrt(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::sqrt(x.data());
}
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> rsqrt(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::rsqrt(x.data());
}
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> reciprocal(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::reciprocal(x.data());
}
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> round(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::round(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::double_v abs(AVX2::double_v x)
{
return Detail::and_(x.data(), AVX::setabsmask_pd());
}
Vc_INTRINSIC Vc_CONST AVX2::float_v abs(AVX2::float_v x)
{
return Detail::and_(x.data(), AVX::setabsmask_ps());
}
#ifdef Vc_IMPL_AVX2
Vc_INTRINSIC Vc_CONST AVX2::int_v abs(AVX2::int_v x)
{
return _mm256_abs_epi32(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::short_v abs(AVX2::short_v x)
{
return _mm256_abs_epi16(x.data());
}
#endif
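// Classification helpers: x is finite iff x and 0 * x compare "ordered" (0 * inf
// and 0 * NaN are NaN); isinf compares the bit pattern of |x| against that of
// +infinity (the stored exponent-mask constant); isnan uses the unordered
// self-comparison.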
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isfinite(const AVX2::double_v &x)
{
return AVX::cmpord_pd(x.data(), _mm256_mul_pd(Detail::zero<__m256d>(), x.data()));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isfinite(const AVX2::float_v &x)
{
return AVX::cmpord_ps(x.data(), _mm256_mul_ps(Detail::zero<__m256>(), x.data()));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isinf(const AVX2::double_v &x)
{
return _mm256_castsi256_pd(AVX::cmpeq_epi64(
_mm256_castpd_si256(abs(x).data()),
_mm256_castpd_si256(Detail::avx_broadcast(AVX::c_log<double>::d(1)))));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isinf(const AVX2::float_v &x)
{
return _mm256_castsi256_ps(
AVX::cmpeq_epi32(_mm256_castps_si256(abs(x).data()),
_mm256_castps_si256(Detail::avx_broadcast(AVX::c_log<float>::d(1)))));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isnan(const AVX2::double_v &x)
{
return AVX::cmpunord_pd(x.data(), x.data());
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isnan(const AVX2::float_v &x)
{
return AVX::cmpunord_ps(x.data(), x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::float_v copysign(AVX2::float_v mag, AVX2::float_v sign)
{
return _mm256_or_ps(_mm256_and_ps(sign.data(), AVX::setsignmask_ps()),
_mm256_and_ps(mag.data(), AVX::setabsmask_ps()));
}
Vc_INTRINSIC Vc_CONST AVX2::double_v copysign(AVX2::double_v::AsArg mag,
AVX2::double_v::AsArg sign)
{
return _mm256_or_pd(_mm256_and_pd(sign.data(), AVX::setsignmask_pd()),
_mm256_and_pd(mag.data(), AVX::setabsmask_pd()));
}
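// AVX frexp/ldexp follow the same scheme as the SSE versions: the biased exponent
// is extracted (or re-inserted, for ldexp) per lane with integer shifts, the
// mantissa is renormalised into [0.5, 1) via the frexpMask constant, and zero, NaN
// and infinity inputs are passed through unchanged.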
inline AVX2::double_v frexp(AVX2::double_v::AsArg v, SimdArray<int, 4> *e)
{
const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
const __m256d exponentPart = _mm256_and_pd(v.data(), exponentBits);
auto lo = AVX::avx_cast<__m128i>(AVX::lo128(exponentPart));
auto hi = AVX::avx_cast<__m128i>(AVX::hi128(exponentPart));
lo = _mm_sub_epi32(_mm_srli_epi64(lo, 52), _mm_set1_epi64x(0x3fe));
hi = _mm_sub_epi32(_mm_srli_epi64(hi, 52), _mm_set1_epi64x(0x3fe));
SSE::int_v exponent = Mem::shuffle<X0, X2, Y0, Y2>(lo, hi);
const __m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits);
AVX2::double_v ret =
_mm256_and_pd(exponentMaximized,
_mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask)));
const double_m zeroMask = v == AVX2::double_v::Zero();
ret(isnan(v) || !isfinite(v) || zeroMask) = v;
exponent.setZero(simd_cast<SSE::int_m>(zeroMask));
internal_data(*e) = exponent;
return ret;
}
#ifdef Vc_IMPL_AVX2
inline SimdArray<double, 8> frexp(const SimdArray<double, 8> &v, SimdArray<int, 8> *e)
{
const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
const __m256d w[2] = {internal_data(internal_data0(v)).data(),
internal_data(internal_data1(v)).data()};
const __m256i exponentPart[2] = {
_mm256_castpd_si256(_mm256_and_pd(w[0], exponentBits)),
_mm256_castpd_si256(_mm256_and_pd(w[1], exponentBits))};
const __m256i lo = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[0], 52),
_mm256_set1_epi32(0x3fe));
const __m256i hi = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[1], 52),
_mm256_set1_epi32(0x3fe));
const __m256i a = _mm256_unpacklo_epi32(lo, hi);
const __m256i b = _mm256_unpackhi_epi32(lo, hi);
const __m256i tmp = _mm256_unpacklo_epi32(a, b);
const __m256i exponent =
AVX::concat(_mm_unpacklo_epi64(AVX::lo128(tmp), AVX::hi128(tmp)),
_mm_unpackhi_epi64(AVX::lo128(tmp), AVX::hi128(tmp)));
const __m256d exponentMaximized[2] = {_mm256_or_pd(w[0], exponentBits),
_mm256_or_pd(w[1], exponentBits)};
const auto frexpMask =
_mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask));
fixed_size_simd<double, 8> ret = {
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[0], frexpMask))),
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[1], frexpMask)))};
const auto zeroMask = v == v.Zero();
ret(isnan(v) || !isfinite(v) || zeroMask) = v;
internal_data(*e) =
Detail::andnot_(simd_cast<AVX2::int_m>(zeroMask).dataI(), exponent);
return ret;
}
#endif
namespace Detail
{
Vc_INTRINSIC AVX2::float_v::IndexType extractExponent(__m256 e)
{
SimdArray<uint, float_v::Size> exponentPart;
const auto ee = AVX::avx_cast<__m256i>(e);
#ifdef Vc_IMPL_AVX2
exponentPart = AVX2::uint_v(ee);
#else
internal_data(internal_data0(exponentPart)) = AVX::lo128(ee);
internal_data(internal_data1(exponentPart)) = AVX::hi128(ee);
#endif
return (exponentPart >> 23) - 0x7e;
}
}
inline AVX2::float_v frexp(AVX2::float_v::AsArg v, SimdArray<int, 8> *e)
{
using namespace Detail;
using namespace AVX2;
const __m256 exponentBits = Const<float>::exponentMask().data();
*e = extractExponent(and_(v.data(), exponentBits));
const __m256 exponentMaximized = or_(v.data(), exponentBits);
AVX2::float_v ret = _mm256_and_ps(exponentMaximized, avx_cast<__m256>(set1_epi32(0xbf7fffffu)));
ret(isnan(v) || !isfinite(v) || v == AVX2::float_v::Zero()) = v;
e->setZero(simd_cast<decltype(*e == *e)>(v == AVX2::float_v::Zero()));
return ret;
}
inline AVX2::double_v ldexp(AVX2::double_v::AsArg v, const SimdArray<int, 4> &_e)
{
SSE::int_v e = internal_data(_e);
e.setZero(simd_cast<SSE::int_m>(v == AVX2::double_v::Zero()));
const __m256i exponentBits =
AVX::concat(_mm_slli_epi64(_mm_unpacklo_epi32(e.data(), e.data()), 52),
_mm_slli_epi64(_mm_unpackhi_epi32(e.data(), e.data()), 52));
return AVX::avx_cast<__m256d>(
AVX::add_epi64(AVX::avx_cast<__m256i>(v.data()), exponentBits));
}
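// ldexp for AVX float_v: shift e into the exponent position (<< 23) and add it to the
// bit pattern of v. The two preprocessor branches differ only in how the
// SimdArray<int, 8> argument is stored internally (one AVX2 vector vs. two SSE vectors).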
inline AVX2::float_v ldexp(AVX2::float_v::AsArg v, SimdArray<int, 8> e)
{
e.setZero(simd_cast<decltype(e == e)>(v == AVX2::float_v::Zero()));
e <<= 23;
#ifdef Vc_IMPL_AVX2
return {AVX::avx_cast<__m256>(
AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
AVX::lo128(internal_data(e).data())),
_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
AVX::hi128(internal_data(e).data()))))};
#else
return {AVX::avx_cast<__m256>(
AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
internal_data(internal_data0(e)).data()),
_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
internal_data(internal_data1(e)).data())))};
#endif
}
Vc_ALWAYS_INLINE AVX2::float_v trunc(AVX2::float_v::AsArg v)
{
return _mm256_round_ps(v.data(), 0x3);
}
Vc_ALWAYS_INLINE AVX2::double_v trunc(AVX2::double_v::AsArg v)
{
return _mm256_round_pd(v.data(), 0x3);
}
Vc_ALWAYS_INLINE AVX2::float_v floor(AVX2::float_v::AsArg v)
{
return _mm256_floor_ps(v.data());
}
Vc_ALWAYS_INLINE AVX2::double_v floor(AVX2::double_v::AsArg v)
{
return _mm256_floor_pd(v.data());
}
Vc_ALWAYS_INLINE AVX2::float_v ceil(AVX2::float_v::AsArg v)
{
return _mm256_ceil_ps(v.data());
}
Vc_ALWAYS_INLINE AVX2::double_v ceil(AVX2::double_v::AsArg v)
{
return _mm256_ceil_pd(v.data());
}
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx> fma(Vector<T, VectorAbi::Avx> a,
Vector<T, VectorAbi::Avx> b,
Vector<T, VectorAbi::Avx> c)
{
return Detail::fma(a.data(), b.data(), c.data(), T());
}
}
#endif
#ifndef Vc_AVX_SIMD_CAST_CALLER_TCC_
#define Vc_AVX_SIMD_CAST_CALLER_TCC_
namespace Vc_VERSIONED_NAMESPACE
{
#if Vc_IS_VERSION_1
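// Out-of-class definitions of the generic converting constructors declared in the AVX
// Vector and Mask class bodies; they appear to be deferred to this point so that the
// simd_cast overloads they forward to are fully declared.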
template <typename T>
template <typename U, typename>
Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(U &&x)
: d(simd_cast<Vector>(std::forward<U>(x)).data())
{
}
template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Avx>::Mask(U &&rhs,
Common::enable_if_mask_converts_explicitly<T, U>)
: Mask(simd_cast<Mask>(std::forward<U>(rhs)))
{
}
#endif
}
#endif
#endif
#ifndef VC_COMMON_MATH_H_
#define VC_COMMON_MATH_H_
#define Vc_COMMON_MATH_H_INTERNAL 1
#ifndef VC_COMMON_TRIGONOMETRIC_H_
#define VC_COMMON_TRIGONOMETRIC_H_
#ifdef Vc_HAVE_LIBMVEC
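// Forward declarations of glibc's libmvec entry points (x86_64 vector-ABI mangled
// names): the _ZGVbN*v_* symbols are the 128-bit (SSE) variants and _ZGVdN*v_* the
// 256-bit (AVX2) variants of sinf/sin/cosf/cos.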
extern "C" {
__m128 _ZGVbN4v_sinf(__m128);
__m128d _ZGVbN2v_sin(__m128d);
__m128 _ZGVbN4v_cosf(__m128);
__m128d _ZGVbN2v_cos(__m128d);
__m256 _ZGVdN8v_sinf(__m256);
__m256d _ZGVdN4v_sin(__m256d);
__m256 _ZGVdN8v_cosf(__m256);
__m256d _ZGVdN4v_cos(__m256d);
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
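// Maps the active implementation onto the instruction set the precompiled
// trigonometric kernels were built for; SSE4.2 falls back to the SSE4.1 build.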
template<Vc::Implementation Impl> struct MapImpl { enum Dummy { Value = Impl }; };
template<> struct MapImpl<Vc::SSE42Impl> { enum Dummy { Value = MapImpl<Vc::SSE41Impl>::Value }; };
template<Vc::Implementation Impl> using TrigonometricImplementation =
ImplementationT<MapImpl<Impl>::Value
#if defined(Vc_IMPL_XOP) && defined(Vc_IMPL_FMA4)
+ Vc::XopInstructions
+ Vc::Fma4Instructions
#endif
>;
}
namespace Common
{
template<typename Impl> struct Trigonometric
{
template<typename T> static T Vc_VDECL sin(const T &_x);
template<typename T> static T Vc_VDECL cos(const T &_x);
template<typename T> static void Vc_VDECL sincos(const T &_x, T *_sin, T *_cos);
template<typename T> static T Vc_VDECL asin (const T &_x);
template<typename T> static T Vc_VDECL atan (const T &_x);
template<typename T> static T Vc_VDECL atan2(const T &y, const T &x);
};
}
#if defined Vc_IMPL_SSE || defined DOXYGEN
namespace Detail
{
template <typename T, typename Abi>
using Trig = Common::Trigonometric<Detail::TrigonometricImplementation<
(std::is_same<Abi, VectorAbi::Sse>::value
? SSE42Impl
: std::is_same<Abi, VectorAbi::Avx>::value ? AVXImpl : ScalarImpl)>>;
}
#ifdef Vc_HAVE_LIBMVEC
Vc_INTRINSIC __m128 sin_dispatch(__m128 x) { return ::_ZGVbN4v_sinf(x); }
Vc_INTRINSIC __m128d sin_dispatch(__m128d x) { return ::_ZGVbN2v_sin (x); }
Vc_INTRINSIC __m128 cos_dispatch(__m128 x) { return ::_ZGVbN4v_cosf(x); }
Vc_INTRINSIC __m128d cos_dispatch(__m128d x) { return ::_ZGVbN2v_cos (x); }
#ifdef Vc_IMPL_AVX
Vc_INTRINSIC __m256 sin_dispatch(__m256 x) { return ::_ZGVdN8v_sinf(x); }
Vc_INTRINSIC __m256d sin_dispatch(__m256d x) { return ::_ZGVdN4v_sin (x); }
Vc_INTRINSIC __m256 cos_dispatch(__m256 x) { return ::_ZGVdN8v_cosf(x); }
Vc_INTRINSIC __m256d cos_dispatch(__m256d x) { return ::_ZGVdN4v_cos (x); }
#endif
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
{
return sin_dispatch(x.data());
}
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
{
return cos_dispatch(x.data());
}
#else
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::sin(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::cos(x);
}
#endif
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> asin(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::asin(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::atan(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan2(const Vector<T, Abi> &y,
const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::atan2(y, x);
}
template <typename T, typename Abi>
Vc_INTRINSIC void sincos(const Vector<T, Abi> &x,
Vector<T, detail::not_fixed_size_abi<Abi>> *sin,
Vector<T, Abi> *cos)
{
Detail::Trig<T, Abi>::sincos(x, sin, cos);
}
#endif
}
#endif
#ifndef VC_COMMON_CONST_H_
#define VC_COMMON_CONST_H_
#include <type_traits>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
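// Compile-time helpers for building IEEE-754 constants: doubleConstant<sign, mantissa,
// exponent>() and floatConstant<...>() evaluate to sign * 1.mantissa * 2^exponent.
// exponentToFloat recurses by factors of 2 towards the explicitly specialised anchor
// exponents (0, +/-32, +/-64).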
template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, true>);
template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, false>);
template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, true>)
{
return 1.;
}
template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, false>)
{
return 1.;
}
template <> constexpr double exponentToFloat<-32>(std::integral_constant<bool, true>)
{
return 1. / (65536. * 65536.);
}
template <> constexpr double exponentToFloat<32>(std::integral_constant<bool, false>)
{
return 65536. * 65536.;
}
template <> constexpr double exponentToFloat<-64>(std::integral_constant<bool, true>)
{
return 1. / (65536. * 65536. * 65536. * 65536.);
}
template <> constexpr double exponentToFloat<64>(std::integral_constant<bool, false>)
{
return 65536. * 65536. * 65536. * 65536.;
}
template <int exponent>
constexpr double exponentToFloat(std::integral_constant<bool, false> negative)
{
return exponentToFloat<exponent - 1>(negative) * 2.0;
}
template <int exponent>
constexpr double exponentToFloat(std::integral_constant<bool, true> negative)
{
return exponentToFloat<exponent + 1>(negative) * 0.5;
}
template <int sign, unsigned long long mantissa, int exponent> constexpr double doubleConstant()
{
return (static_cast<double>((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) /
0x0010000000000000ull) *
exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>()) * sign;
}
template <int sign, unsigned int mantissa, int exponent> constexpr float floatConstant()
{
return (static_cast<float>((mantissa & 0x007fffffu) | 0x00800000u) / 0x00800000u) *
static_cast<float>(
exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>())) *
sign;
}
}
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
template <class T, class Abi>
SimdArray<int, Vector<T, Abi>::size()> fpclassify(const Vector<T, Abi> &x)
{
return SimdArray<int, Vector<T, Abi>::size()>(
[&](std::size_t i) { return std::fpclassify(x[i]); });
}
template <class T, size_t N> SimdArray<int, N> fpclassify(const SimdArray<T, N> &x)
{
return SimdArray<int, N>([&](std::size_t i) { return std::fpclassify(x[i]); });
}
#ifdef Vc_IMPL_SSE
#ifdef Vc_COMMON_MATH_H_INTERNAL
enum LogarithmBase {
BaseE, Base10, Base2
};
namespace Detail
{
template <typename T, typename Abi>
using Const = typename std::conditional<std::is_same<Abi, VectorAbi::Avx>::value,
AVX::Const<T>, SSE::Const<T>>::type;
template<LogarithmBase Base>
struct LogImpl
{
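// Series part of the logarithm: on entry x holds the reduced mantissa minus 1 and
// `exponent` the extracted power of two. The selected Base folds in exponent*ln2 (and
// rescales by log10(e) or log2(e) as needed) to leave the final result in x. The
// Vc_LOG_ILP variants only reorder the polynomial evaluation for more instruction-level
// parallelism.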
template<typename T, typename Abi> static Vc_ALWAYS_INLINE void log_series(Vector<T, Abi> &Vc_RESTRICT x, typename Vector<T, Abi>::AsArg exponent) {
typedef Vector<T, Abi> V;
typedef Detail::Const<T, Abi> C;
const V x2 = x * x;
#ifdef Vc_LOG_ILP
V y2 = (C::P(6) * x2 + C::P(7) * x) + C::P(8);
V y0 = (C::P(0) * x2 + C::P(1) * x) + C::P(2);
V y1 = (C::P(3) * x2 + C::P(4) * x) + C::P(5);
const V x3 = x2 * x;
const V x6 = x3 * x3;
const V x9 = x6 * x3;
V y = (y0 * x9 + y1 * x6) + y2 * x3;
#elif defined Vc_LOG_ILP2
const V x3 = x2 * x;
const V x4 = x2 * x2;
const V x5 = x2 * x3;
const V x6 = x3 * x3;
const V x7 = x4 * x3;
const V x8 = x4 * x4;
const V x9 = x5 * x4;
const V x10 = x5 * x5;
const V x11 = x5 * x6;
V y = C::P(0) * x11 + C::P(1) * x10 + C::P(2) * x9 + C::P(3) * x8 + C::P(4) * x7
+ C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3;
#else
V y = C::P(0);
Vc::Common::unrolled_loop<int, 1, 9>([&](int i) { y = y * x + C::P(i); });
y *= x * x2;
#endif
switch (Base) {
case BaseE:
y += exponent * C::ln2_small();
y -= x2 * C::_1_2();
x += y;
x += exponent * C::ln2_large();
break;
case Base10:
y += exponent * C::ln2_small();
y -= x2 * C::_1_2();
x += y;
x += exponent * C::ln2_large();
x *= C::log10_e();
break;
case Base2:
{
const V x_ = x;
x *= C::log2_e();
y *= C::log2_e();
y -= x_ * x * C::_1_2();
x += y;
x += exponent;
break;
}
}
}
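// Double-precision variant: uses a rational (P/Q) approximation instead of a plain
// polynomial, then applies the same per-base reconstruction.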
template <typename Abi>
static Vc_ALWAYS_INLINE void log_series(Vector<double, Abi> &Vc_RESTRICT x,
typename Vector<double, Abi>::AsArg exponent)
{
typedef Vector<double, Abi> V;
typedef Detail::Const<double, Abi> C;
const V x2 = x * x;
V y = C::P(0);
V y2 = C::Q(0) + x;
Vc::Common::unrolled_loop<int, 1, 5>([&](int i) {
y = y * x + C::P(i);
y2 = y2 * x + C::Q(i);
});
y2 = x / y2;
y = y * x + C::P(5);
y = x2 * y * y2;
switch (Base) {
case BaseE:
y += exponent * C::ln2_small();
y -= x2 * C::_1_2();
x += y;
x += exponent * C::ln2_large();
break;
case Base10:
y += exponent * C::ln2_small();
y -= x2 * C::_1_2();
x += y;
x += exponent * C::ln2_large();
x *= C::log10_e();
break;
case Base2:
{
const V x_ = x;
x *= C::log2_e();
y *= C::log2_e();
y -= x_ * x * C::_1_2();
x += y;
x += exponent;
break;
}
}
}
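// Shared driver for log/log10/log2: flag negative (-> NaN) and zero (-> -inf) inputs,
// rescale denormals by 2^54, split x into an exponent and a mantissa folded into
// [1/sqrt(2), sqrt(2)) minus 1, run log_series, then patch the flagged lanes.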
template <typename T, typename Abi, typename V = Vector<T, Abi>>
static inline Vector<T, Abi> calc(V _x)
{
typedef typename V::Mask M;
typedef Detail::Const<T, Abi> C;
V x(_x);
const M invalidMask = x < V::Zero();
const M infinityMask = x == V::Zero();
const M denormal = x <= C::min();
x(denormal) *= V(Vc::Detail::doubleConstant<1, 0, 54>());
V exponent = Detail::exponent(x.data());
exponent(denormal) -= 54;
x.setZero(C::exponentMask());
x = Detail::operator|(x, C::_1_2());
const M smallX = x < C::_1_sqrt2();
x(smallX) += x;
x -= V::One();
exponent(!smallX) += V::One();
log_series(x, exponent);
x.setQnan(invalidMask);
x(infinityMask) = C::neginf();
return x;
}
};
}
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log(
const Vector<T, Abi> &x)
{
return Detail::LogImpl<BaseE>::calc<T, Abi>(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log10(
const Vector<T, Abi> &x)
{
return Detail::LogImpl<Base10>::calc<T, Abi>(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log2(
const Vector<T, Abi> &x)
{
return Detail::LogImpl<Base2>::calc<T, Abi>(x);
}
#endif
#ifdef Vc_COMMON_MATH_H_INTERNAL
constexpr float log2_e = 1.44269504088896341f;
constexpr float MAXLOGF = 88.722831726074219f;
constexpr float MINLOGF = -88.029685974121094f;
constexpr float MAXNUMF = 3.4028234663852885981170418348451692544e38f;
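// Single-precision exp for SSE/AVX, Cephes-style: range reduction x = n*ln2 + r with
// n ~ round(x*log2(e)) (ln2 split into a large and a small part for accuracy), a short
// polynomial for exp(r), and reconstruction via ldexp(poly, n). Inputs above MAXLOGF
// overflow to +inf, inputs below MINLOGF flush to zero.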
template <typename Abi, typename = enable_if<std::is_same<Abi, VectorAbi::Sse>::value ||
std::is_same<Abi, VectorAbi::Avx>::value>>
inline Vector<float, detail::not_fixed_size_abi<Abi>> exp(Vector<float, Abi> x)
{
using V = Vector<float, Abi>;
typedef typename V::Mask M;
typedef Detail::Const<float, Abi> C;
const M overflow = x > MAXLOGF;
const M underflow = x < MINLOGF;
V z = floor(C::log2_e() * x + 0.5f);
const auto n = static_cast<Vc::SimdArray<int, V::Size>>(z);
x -= z * C::ln2_large();
x -= z * C::ln2_small();
z = ((((( 1.9875691500E-4f * x
+ 1.3981999507E-3f) * x
+ 8.3334519073E-3f) * x
+ 4.1665795894E-2f) * x
+ 1.6666665459E-1f) * x
+ 5.0000001201E-1f) * (x * x)
+ x
+ 1.0f;
x = ldexp(z, n);
x(overflow) = std::numeric_limits<typename V::EntryType>::infinity();
x.setZero(underflow);
return x;
}
#endif
#ifdef Vc_IMPL_AVX
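// Double-precision exp (AVX): same range reduction, followed by a Pade-style rational
// approximation exp(r) ~ 1 + 2*r*P(r^2) / (Q(r^2) - r*P(r^2)).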
inline AVX::double_v exp(AVX::double_v _x)
{
AVX::Vector<double> x = _x;
typedef AVX::Vector<double> V;
typedef V::Mask M;
typedef AVX::Const<double> C;
const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>();
const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>();
V px = floor(C::log2_e() * x + 0.5);
__m128i tmp = _mm256_cvttpd_epi32(px.data());
const SimdArray<int, V::Size> n = SSE::int_v{tmp};
x -= px * C::ln2_large();
x -= px * C::ln2_small();
const double P[] = {
Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
};
const double Q[] = {
Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
};
const V x2 = x * x;
px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
x = V::One() + 2.0 * x;
x = ldexp(x, n);
x(overflow) = std::numeric_limits<double>::infinity();
x.setZero(underflow);
return x;
}
#endif
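// SSE variant of the same algorithm; it differs only in how the rounded integer n is
// moved into the SimdArray<int, 2> consumed by ldexp.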
inline SSE::double_v exp(SSE::double_v::AsArg _x) {
SSE::Vector<double> x = _x;
typedef SSE::Vector<double> V;
typedef V::Mask M;
typedef SSE::Const<double> C;
const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>();
const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>();
V px = floor(C::log2_e() * x + 0.5);
SimdArray<int, V::Size> n;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&n), _mm_cvttpd_epi32(px.data()));
x -= px * C::ln2_large();
x -= px * C::ln2_small();
const double P[] = {
Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
};
const double Q[] = {
Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
};
const V x2 = x * x;
px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
x = V::One() + 2.0 * x;
x = ldexp(x, n);
x(overflow) = std::numeric_limits<double>::infinity();
x.setZero(underflow);
return x;
}
#endif
}
#undef Vc_COMMON_MATH_H_INTERNAL
#endif
#ifdef isfinite
#undef isfinite
#endif
#ifdef isnan
#undef isnan
#endif
#ifndef VC_COMMON_VECTORTUPLE_H_
#define VC_COMMON_VECTORTUPLE_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template<size_t StructSize, typename V, typename I, bool Readonly = true> struct InterleavedMemoryReadAccess;
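// VectorReferenceArray collects references to Length vectors (built via Vc::tie or the
// deprecated comma operator) so that deinterleaving loads, gathers and transposes can
// assign into several vectors in one statement.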
template <int Length, typename V> class VectorReferenceArray
{
typedef typename V::EntryType T;
typedef V &Vc_RESTRICT Reference;
std::array<V * Vc_RESTRICT, Length> r;
typedef make_index_sequence<Length> IndexSequence;
template <typename VV, std::size_t... Indexes>
constexpr VectorReferenceArray<Length + 1, VV> appendOneReference(
VV &a, index_sequence<Indexes...>) const
{
return {*r[Indexes]..., a};
}
template <typename A, std::size_t... Indexes>
Vc_INTRINSIC void callDeinterleave(const A &access, index_sequence<Indexes...>) const
{
access.deinterleave(*r[Indexes]...);
}
public:
template <typename... Us, typename = enable_if<(sizeof...(Us) == Length)>>
constexpr VectorReferenceArray(Us &&... args)
: r{{std::addressof(std::forward<Us>(args))...}}
{
}
template <typename VV, typename = enable_if<!std::is_const<V>::value &&
std::is_same<VV, V>::value>>
Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr VectorReferenceArray<
Length + 1, V>
operator,(VV &a) const &&
{
return appendOneReference(a, IndexSequence());
}
Vc_DEPRECATED("build the tuple with Vc::tie instead") constexpr VectorReferenceArray<
Length + 1, const V>
operator,(const V &a) const &&
{
return appendOneReference(a, IndexSequence());
}
template <size_t StructSize, typename I, bool RO>
Vc_ALWAYS_INLINE enable_if<(Length <= StructSize), void> operator=(
const InterleavedMemoryReadAccess<StructSize, V, I, RO> &access) &&
{
callDeinterleave(access, IndexSequence());
}
template <size_t StructSize, typename I, bool RO>
enable_if<(Length > StructSize), void> operator=(
const InterleavedMemoryReadAccess<StructSize, V, I, RO> &access) && =
delete;
template <typename... Inputs> void operator=(TransposeProxy<Inputs...> &&proxy) &&
{
transpose_impl(TransposeTag<Length, sizeof...(Inputs)>(), &r[0], proxy);
}
template <typename T, typename IndexVector, typename Scale, bool Flag>
void operator=(SubscriptOperation<T, IndexVector, Scale, Flag> &&sub) &&
{
const auto &args = std::move(sub).gatherArguments();
Common::InterleavedMemoryReadAccess<1, V, Traits::decay<decltype(args.indexes)>>
deinterleaver(args.address, args.indexes);
callDeinterleave(deinterleaver, IndexSequence());
}
Vc_ALWAYS_INLINE Reference operator[](std::size_t i) { return *r[i]; }
};
}
template <typename T, typename Abi>
Vc_DEPRECATED("build the tuple with Vc::tie instead")
constexpr Common::VectorReferenceArray<2, Vc::Vector<T, Abi>>
operator,(Vc::Vector<T, Abi> &a, Vc::Vector<T, Abi> &b)
{
return {a, b};
}
template <typename T, typename Abi>
Vc_DEPRECATED("build the tuple with Vc::tie instead")
constexpr Common::VectorReferenceArray<2, const Vc::Vector<T, Abi>>
operator,(const Vc::Vector<T, Abi> &a, const Vc::Vector<T, Abi> &b)
{
return {a, b};
}
template <typename V, typename... Vs>
constexpr Common::VectorReferenceArray<sizeof...(Vs) + 1,
typename std::remove_reference<V>::type>
tie(V &&a, Vs &&... b)
{
return {std::forward<V>(a), std::forward<Vs>(b)...};
}
}
#endif
#ifndef VC_COMMON_IIF_H_
#define VC_COMMON_IIF_H_
#ifndef VC_TYPE_TRAITS_
#define VC_TYPE_TRAITS_
#include <type_traits>
namespace Vc_VERSIONED_NAMESPACE
{
using Traits::is_simd_mask;
using Traits::is_simd_vector;
using Traits::is_integral;
using Traits::is_floating_point;
using Traits::is_arithmetic;
using Traits::is_signed;
using Traits::is_unsigned;
template<typename T>
struct memory_alignment : public std::integral_constant<size_t, alignof(T)> {};
template<> struct memory_alignment<short_v> : public std::integral_constant<size_t, short_v::MemoryAlignment> {};
template<> struct memory_alignment<ushort_v> : public std::integral_constant<size_t, ushort_v::MemoryAlignment> {};
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
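// iif(mask, a, b): per-lane selection for SIMD types implemented with where(); the
// scalar bool overload below reduces to the plain conditional operator.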
template <typename Mask, typename T>
Vc_ALWAYS_INLINE enable_if<is_simd_mask<Mask>::value && is_simd_vector<T>::value, T> iif(
const Mask &condition, const T &trueValue, const T &falseValue)
{
T result(falseValue);
Vc::where(condition) | result = trueValue;
return result;
}
template <typename Mask, typename T>
enable_if<is_simd_mask<Mask>::value && !is_simd_vector<T>::value, T> iif(
const Mask &, const T &, const T &) = delete;
template<typename T> constexpr T iif (bool condition, const T &trueValue, const T &falseValue)
{
return condition ? trueValue : falseValue;
}
}
#endif
#ifndef Vc_NO_STD_FUNCTIONS
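// Unless Vc_NO_STD_FUNCTIONS is defined, pull the Vc math overloads into namespace std
// so that qualified calls such as std::sin(v) also resolve for Vc vector types.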
namespace std
{
using Vc::min;
using Vc::max;
using Vc::abs;
using Vc::asin;
using Vc::atan;
using Vc::atan2;
using Vc::ceil;
using Vc::cos;
using Vc::exp;
using Vc::fma;
using Vc::trunc;
using Vc::floor;
using Vc::frexp;
using Vc::ldexp;
using Vc::log;
using Vc::log10;
using Vc::log2;
using Vc::round;
using Vc::sin;
using Vc::sqrt;
using Vc::isfinite;
using Vc::isnan;
}
#endif
Vc_RESET_DIAGNOSTICS
#endif