Compare commits

...

No commits in common. "pristine-tar" and "openkylin/yangtze" have entirely different histories.

313 changed files with 92292 additions and 1 deletion

.clang-format (new file, 130 lines)
@@ -0,0 +1,130 @@
BasedOnStyle: Google
# The extra indent or outdent of access modifiers, e.g. public:.
AccessModifierOffset: -4
# If true, aligns escaped newlines as far left as possible. Otherwise puts them into the right-most column.
AlignEscapedNewlinesLeft: false
# If true, aligns trailing comments.
AlignTrailingComments: true
# Allow putting all parameters of a function declaration onto the next line even if BinPackParameters is false.
AllowAllParametersOfDeclarationOnNextLine: false
# If true, if (a) return; can be put on a single line.
AllowShortIfStatementsOnASingleLine: false
# If true, while (true) continue; can be put on a single line.
AllowShortLoopsOnASingleLine: false
AllowShortFunctionsOnASingleLine: true
# If true, always break before multiline string literals.
AlwaysBreakBeforeMultilineStrings: false
# If true, always break after the template<...> of a template declaration.
AlwaysBreakTemplateDeclarations: false
# If false, a function call's or function definition's parameters will either all be on the same line or will have one line each.
BinPackParameters: true
# If true, binary operators will be placed after line breaks.
BreakBeforeBinaryOperators: false
# The brace breaking style to use.
# Possible values:
# BS_Attach (in configuration: Attach) Always attach braces to surrounding context.
# BS_Linux (in configuration: Linux) Like Attach, but break before braces on function, namespace and class definitions.
# BS_Stroustrup (in configuration: Stroustrup) Like Attach, but break before function definitions.
# BS_Allman (in configuration: Allman) Always break before braces.
BreakBeforeBraces: Linux
# Always break constructor initializers before commas and align the commas with the colon.
BreakConstructorInitializersBeforeComma: true
# The column limit.
# A column limit of 0 means that there is no column limit. In this case, clang-format will respect the input's line breaking decisions within statements.
ColumnLimit: 90
# If the constructor initializers don't fit on a line, put each initializer on its own line.
#ConstructorInitializerAllOnOneLineOrOnePerLine (bool)
# The number of characters to use for indentation of constructor initializer lists.
#ConstructorInitializerIndentWidth (unsigned)
# If true, format braced lists as best suited for C++11 braced lists.
# Important differences: - No spaces inside the braced list. - No line break before the closing brace. - Indentation with the continuation indent, not with the block indent.
# Fundamentally, C++11 braced lists are formatted exactly like function calls would be formatted in their place. If the braced list follows a name (e.g. a type or variable name), clang-format formats as if the {} were the parentheses of a function call with that name. If there is no name, a zero-length name is assumed.
Cpp11BracedListStyle: true
# If true, analyze the formatted file for the most common binding.
#DerivePointerBinding (bool)
# If true, clang-format detects whether function calls and definitions are formatted with one parameter per line.
# Each call can be bin-packed, one-per-line or inconclusive. If it is inconclusive, e.g. completely on one line, but a decision needs to be made, clang-format analyzes whether there are other bin-packed cases in the input file and acts accordingly.
# NOTE: This is an experimental flag, that might go away or be renamed. Do not use this in config files, etc. Use at your own risk.
#ExperimentalAutoDetectBinPacking (bool)
# Indent case labels one level from the switch statement.
# When false, use the same indentation level as for the switch statement. Switch statement body is always indented one level more than case labels.
IndentCaseLabels: false
# If true, indent when breaking function declarations which are not also definitions after the type.
#IndentFunctionDeclarationAfterType (bool)
# The number of characters to use for indentation.
IndentWidth: 4
# The maximum number of consecutive empty lines to keep.
MaxEmptyLinesToKeep: 1
# The indentation used for namespaces.
# Possible values:
# NI_None (in configuration: None) Don't indent in namespaces.
# NI_Inner (in configuration: Inner) Indent only in inner namespaces (nested in other namespaces).
# NI_All (in configuration: All) Indent in all namespaces.
NamespaceIndentation: None
# Add a space in front of an Objective-C protocol list, i.e. use Foo <Protocol> instead of Foo<Protocol>.
#ObjCSpaceBeforeProtocolList (bool)
# The penalty for each line break introduced inside a comment.
#PenaltyBreakComment (unsigned)
# The penalty for breaking before the first <<.
#PenaltyBreakFirstLessLess (unsigned)
# The penalty for each line break introduced inside a string literal.
#PenaltyBreakString (unsigned)
# The penalty for each character outside of the column limit.
#PenaltyExcessCharacter (unsigned)
# Penalty for putting the return type of a function onto its own line.
#PenaltyReturnTypeOnItsOwnLine (unsigned)
# Set whether & and * bind to the type as opposed to the variable.
#PointerBindsToType: false
# If true, spaces will be inserted between for/if/while/... and (.
#SpaceAfterControlStatementKeyword: true
# If false, spaces will be removed before =, +=, etc.
#SpaceBeforeAssignmentOperators: true
# If false, spaces may be inserted into ().
#SpaceInEmptyParentheses: false
# The number of spaces before trailing line comments.
#SpacesBeforeTrailingComments (unsigned)
# If false, spaces may be inserted into C style casts.
#SpacesInCStyleCastParentheses (bool)
# If true, spaces will be inserted after every ( and before every ).
SpacesInParentheses: false
# Format compatible with this standard, e.g. use A<A<int> > instead of A<A<int>> for LS_Cpp03.
# Possible values:
# LS_Cpp03 (in configuration: Cpp03) Use C++03-compatible syntax.
# LS_Cpp11 (in configuration: Cpp11) Use features of C++11 (e.g. A<A<int>> instead of A<A<int> >).
# LS_Auto (in configuration: Auto) Automatic detection based on the input.
Standard: Cpp11
# If true, IndentWidth consecutive spaces will be replaced with tab characters.
UseTab: false
# vim: ft=yaml

.github/CONTRIBUTING.md (new vendored file, 77 lines)
@@ -0,0 +1,77 @@
## Copyright and License
Vc is licensed with the [3-clause BSD license](http://opensource.org/licenses/BSD-3-Clause).
Your contributions to Vc must be released under the same license. You must add
your copyright information to the files you modified/added.
## Code Formatting & Style
The recommended way is to format the code according to `clang-format` using the
`.clang-format` file in the repository.
In addition to the `clang-format` style, `if`, `else`, `for`, `while`, and `do`
*must* use braces.
If, for some reason, you cannot use `clang-format`, here's a quick overview of
the style rules:
* Constrain the code to no more than 90 characters per line.
* Use four spaces for indent. No tabs.
* Opening braces attach to the preceding expression, except for functions,
namespaces, and classes/structs/unions/enums.
* Namespaces introduce no additional indent
* `case` labels are aligned with the `switch` statement
* No more than one empty line.
* No spaces in parentheses, but spaces between keywords and opening paren, i.e.
`if (foo) { bar(); }`
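For illustration only (this snippet is not part of the repository), code following these rules looks roughly like this:
```cpp
namespace Example  // namespaces introduce no additional indent
{
class AngleHelper  // braces of classes, functions, and namespaces go on their own line
{
public:
    float wrapAngle(float angle) const
    {
        while (angle > 360.0f) {  // braces attach to control statements and are mandatory
            angle -= 360.0f;
        }
        switch (static_cast<int>(angle)) {
        case 0:  // case labels are aligned with the switch
            return 0.0f;
        default:
            return angle;
        }
    }
};
}  // namespace Example
```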
### Naming Rules
* Naming is very important. Take time to choose a name that clearly explains the
intended functionality & usage of the entity.
* Type names typically use `CamelCase`. No underscores.
* Function and variable names use `camelCase`. No underscores.
* Acronyms that appear in camel case names must use lowercase letters for all
characters after the first character. (e.g. `SimdArray`, `simdFunction`)
* Traits use `lower_case_with_underscores`.
* Macros are prefixed with `Vc_` and use `Vc_ALL_CAPITALS_WITH_UNDERSCORES`.
Macro arguments use a single underscore suffix.
Include guards are prefixed with `VC_` instead.
* File names use `alllowercasewithoutunderscores`. Basically, it is the type name
declared/defined in the file with all letters in lower case.
* There are exceptions and inconsistencies in the code. Don't bother.
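A hypothetical snippet (all names invented for illustration) applying these naming rules:
```cpp
#include <type_traits>

// Type names use CamelCase; functions and variables use camelCase.
class DataBuffer
{
public:
    int entryCount() const { return bufferSize; }

private:
    int bufferSize = 0;
};

// Traits use lower_case_with_underscores.
template <typename T> struct is_data_buffer : public std::is_same<T, DataBuffer> {
};

// Macros use the Vc_ prefix; macro arguments get a single underscore suffix.
#define Vc_IS_BUFFER(Type_) is_data_buffer<Type_>::value
```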
### Design Guidelines
* *Avoid out parameters.* Use the return value instead. Use `std::tuple` if you
need to return multiple values.
* *Look for alternatives to in-out parameters.* An obvious exception (and thus
design alternative) is the implicit `this` parameter to non-static member
functions.
* Consequently, *pass function parameters by const-ref or by value.*
Use const-ref for types that (potentially) require more than two CPU
registers. (Consider fundamental types and the fundamental `Vector<T>` types
to require one register, each.)
By value otherwise.
* *Ensure const-correctness.* Member functions use the `const` qualifier if they
do not modify observable state. Use `mutable` members for unobservable state.
* *Avoid macros.* Possible alternatives are constexpr variables and template
code.
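A minimal sketch of these guidelines in code (the function and class names are made up for illustration; `Vc::float_v`, `Vc::min`, and `Vc::max` are part of Vc):
```cpp
#include <tuple>
#include <Vc/Vc>

// Return multiple values via std::tuple instead of using out parameters.
std::tuple<Vc::float_v, Vc::float_v> minAndMax(Vc::float_v a, Vc::float_v b)
{
    return std::make_tuple(Vc::min(a, b), Vc::max(a, b));
}

// Pass cheap types (fundamental types, Vector<T>) by value; mark member functions
// that do not modify observable state as const.
class Accumulator
{
public:
    void add(Vc::float_v x) { total += x; }
    Vc::float_v sum() const { return total; }

private:
    Vc::float_v total = Vc::float_v::Zero();
};

// Prefer constexpr variables (or templates) over macros.
constexpr int prefetchDistance = 512;
```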
## Git History
Git history should be flat, if feasible. Feel free to use merges on your private
branch. However, once you submit a pull request, the history should apply
cleanly on top of master. Use `git rebase [-i]` to straighten the history.
Use different branches for different issues.
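For example (illustrative commands, assuming the remote is called `origin`):
```sh
git fetch origin
git rebase -i origin/master    # flatten and clean up the branch history
git push --force-with-lease    # update your pull-request branch
```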
## Git Commit Logs
1. Write meaningful summaries and strive to use no more than 50 characters
1. Use imperative mood in the subject line (and possibly in bullet points in the
summary)
1. Wrap the body at 72 characters
1. Use the body to explain *what* and *why* (normally it is irrelevant *how* you
did it)
See also [Chris Beams' article](http://chris.beams.io/posts/git-commit/).
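An invented example message that follows these rules:
```
Add missing braces around single-statement ifs

The coding guidelines require braces for all if/else/for/while/do
bodies. Add them where they were missing so that clang-format output
stays stable and reviews stay consistent.
```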

.github/ISSUE_TEMPLATE.md (new vendored file, 18 lines)
@@ -0,0 +1,18 @@
<!--
Vc is now in maintenance mode and no longer actively developed.
However, we continue to review pull requests with bugfixes from the community.
If your issue is trivial to fix, we might be able to address it.
Otherwise, please provide a pull request in addition to your issue.
-->
Vc version / revision | Operating System | Compiler & Version | Compiler Flags | Assembler & Version | CPU
----------------------|------------------|--------------------|----------------|---------------------|----
| | | | |
## Testcase
```cpp
```
## Actual Results
## Expected Results

.github/workflows/ci.yaml (new vendored file, 85 lines)
@@ -0,0 +1,85 @@
name: CI
on:
push:
pull_request:
schedule:
- cron: '0 3 * * *'
jobs:
#clang-format:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v2
# - uses: DoozyX/clang-format-lint-action@v0.12
# with:
# exclude: './thirdparty'
# clangFormatVersion: 12
build-ubuntu:
runs-on: ubuntu-latest
env:
dashboard_model: Experimental
build_type: ${{ matrix.build_type }}
NUMBER_OF_PROCESSORS: 2
CXX: ${{ matrix.cxx }}
strategy:
fail-fast: false
matrix:
build_type: [Debug, Release]
cxx: [g++-9, g++-10, g++-11, clang++-10, clang++-11, clang++-12, icpc]
include:
- cxx: g++-11
INSTALL_EXTRA: g++-11
- cxx: clang++-11
INSTALL_EXTRA: clang-11
- cxx: clang++-12
INSTALL_EXTRA: clang-12
- cxx: icpc
INSTALL_ONEAPI: true
exclude:
# icpc in debug mode runs out of memory in CI
- cxx: icpc
build_type: Debug
steps:
- uses: actions/checkout@v2
with:
submodules: true
- name: install OneAPI
if: ${{ matrix.INSTALL_ONEAPI }}
run: |
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic
- name: install extras
if: ${{ matrix.INSTALL_EXTRA }}
run: |
sudo apt update
sudo apt install ${{ matrix.INSTALL_EXTRA }}
- name: ctest
run: |
if [ ${{ matrix.INSTALL_ONEAPI }} ]
then
source /opt/intel/oneapi/setvars.sh
export LC_ALL=en_US.utf8
fi
$CXX --version
ctest -VV -S test.cmake
build-windows:
runs-on: ${{ matrix.os }}
env:
build_type: ${{ matrix.build_type }}
strategy:
fail-fast: false
matrix:
build_type: [Debug, Release]
os: [windows-2019]
steps:
- uses: actions/checkout@v2
with:
submodules: true
- uses: egor-tensin/vs-shell@v2
- name: ctest
run: |
ctest -VV -S test.cmake

.gitignore (new vendored file, 11 lines)
@@ -0,0 +1,11 @@
doc/html
doc/latex
doc/man
vc-benchmarks
*.swp
*~
.makeApidox.stamp
.makeApidox.stamp.new
build-*
.vs
out

.gitmodules (new vendored file, 6 lines)
@@ -0,0 +1,6 @@
[submodule "tests/testdata"]
path = tests/testdata
url = https://github.com/VcDevel/vc-testdata
[submodule "tests/virtest"]
path = tests/virtest
url = https://github.com/mattkretz/virtest

CMakeLists.txt (new file, 275 lines)
@@ -0,0 +1,275 @@
cmake_minimum_required(VERSION 3.0)
cmake_policy(SET CMP0028 NEW) # Double colon in target name means ALIAS or IMPORTED target.
cmake_policy(SET CMP0048 NEW) # The ``project()`` command manages VERSION variables.
if(PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
message(FATAL_ERROR "You don't want to configure in the source directory!")
endif()
if(NOT DEFINED CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING
"Choose the type of build, options are: None Debug Release RelWithDebug RelWithDebInfo MinSizeRel."
FORCE)
endif()
# read version parts from version.h
file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/Vc/version.h _version_lines REGEX "^#define Vc_VERSION_STRING ")
string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _version_matches "${_version_lines}")
project(Vc VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}" LANGUAGES C CXX)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
set(disabled_targets)
include (VcMacros)
include (AddTargetProperty)
include (OptimizeForArchitecture)
vc_determine_compiler()
if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(i686|x86|AMD64|amd64)")
set(Vc_X86 TRUE)
elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(arm|aarch32|aarch64)")
message(WARNING "No optimized implementation of the Vc types available for ${CMAKE_SYSTEM_PROCESSOR}")
set(Vc_ARM TRUE)
else()
message(WARNING "No optimized implementation of the Vc types available for ${CMAKE_SYSTEM_PROCESSOR}")
endif()
option(USE_CCACHE "If enabled, ccache will be used (if it exists on the system) to speed up recompiles." OFF)
if(USE_CCACHE)
find_program(CCACHE_COMMAND ccache)
if(CCACHE_COMMAND)
mark_as_advanced(CCACHE_COMMAND)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_COMMAND}")
endif()
endif()
if(NOT Vc_COMPILER_IS_MSVC)
AddCompilerFlag("-std=c++14" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
if(NOT _ok)
AddCompilerFlag("-std=c++1y" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
if(NOT _ok)
AddCompilerFlag("-std=c++11" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
if(NOT _ok)
AddCompilerFlag("-std=c++0x" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
if(NOT _ok)
message(FATAL_ERROR "Vc 1.x requires C++11, better even C++14. It seems this is not available. If this was incorrectly determined please notify vc-devel@compeng.uni-frankfurt.de")
endif()
endif()
endif()
endif()
elseif(MSVC_VERSION LESS 1920)
message(FATAL_ERROR "Vc 1.x requires at least Visual Studio 2019.")
AddCompilerFlag("/std:c++14" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
endif()
if(MSVC AND (NOT DEFINED Vc_USE_MSVC_SSA_OPTIMIZER_DESPITE_BUGGY_EXP OR NOT Vc_USE_MSVC_SSA_OPTIMIZER_DESPITE_BUGGY_EXP))
# bug report: https://developercommunity.visualstudio.com/t/AVX-codegen-bug-on-Vc-with-MSVC-2019/1470844#T-N1521672
message(STATUS "WARNING! MSVC starting with 19.20 uses a new optimizer that has a bug causing Vc::exp() to return slighly wrong results.\
You can set Vc_USE_MSVC_SSA_OPTIMIZER_DESPITE_BUGGY_EXP=ON to still use the new optimizer on the affected MSVC versions.")
AddCompilerFlag("/d2SSAOptimizer-" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
endif()
if(Vc_COMPILER_IS_GCC)
if(Vc_GCC_VERSION VERSION_GREATER "5.0.0" AND Vc_GCC_VERSION VERSION_LESS "6.0.0")
UserWarning("GCC 5 goes into an endless loop comiling example_scaling_scalar. Therefore, this target is disabled.")
list(APPEND disabled_targets
example_scaling_scalar
)
endif()
elseif(Vc_COMPILER_IS_MSVC)
# Disable warning "C++ exception specification ignored except to indicate a function is not __declspec(nothrow)"
# MSVC emits the warning for the _UnitTest_Compare destructor which needs the throw declaration so that it doesn't std::terminate
AddCompilerFlag("/wd4290")
endif()
vc_set_preferred_compiler_flags(WARNING_FLAGS BUILDTYPE_FLAGS)
add_definitions(${Vc_DEFINITIONS})
add_compile_options(${Vc_COMPILE_FLAGS})
if(Vc_COMPILER_IS_INTEL)
# per default icc is not IEEE compliant, but we need that for verification
AddCompilerFlag("-fp-model source")
endif()
if(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]")
message(STATUS "WARNING! It seems you are compiling without optimization. Please set CMAKE_BUILD_TYPE.")
endif(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # ${CMAKE_CURRENT_SOURCE_DIR}/include)
add_custom_target(other VERBATIM)
add_custom_target(Scalar COMMENT "build Scalar code" VERBATIM)
add_custom_target(SSE COMMENT "build SSE code" VERBATIM)
add_custom_target(AVX COMMENT "build AVX code" VERBATIM)
add_custom_target(AVX2 COMMENT "build AVX2 code" VERBATIM)
AddCompilerFlag(-ftemplate-depth=128 CXX_FLAGS CMAKE_CXX_FLAGS)
set(libvc_compile_flags "-DVc_COMPILE_LIB")
AddCompilerFlag("-fPIC" CXX_FLAGS libvc_compile_flags)
# -fstack-protector is the default of GCC, but at least Ubuntu changes the default to -fstack-protector-strong, which is crazy
AddCompilerFlag("-fstack-protector" CXX_FLAGS libvc_compile_flags)
set(_srcs src/const.cpp)
if(Vc_X86)
list(APPEND _srcs src/cpuid.cpp src/support_x86.cpp)
vc_compile_for_all_implementations(_srcs src/trigonometric.cpp ONLY SSE2 SSE3 SSSE3 SSE4_1 AVX AVX+FMA AVX2+FMA+BMI2)
if(NOT Vc_XOP_INTRINSICS_BROKEN)
vc_compile_for_all_implementations(_srcs src/trigonometric.cpp ONLY AVX+XOP+FMA)
if(NOT Vc_FMA4_INTRINSICS_BROKEN)
vc_compile_for_all_implementations(_srcs src/trigonometric.cpp ONLY SSE+XOP+FMA4 AVX+XOP+FMA4)
endif()
endif()
vc_compile_for_all_implementations(_srcs src/sse_sorthelper.cpp ONLY SSE2 SSE4_1 AVX AVX2+FMA+BMI2)
vc_compile_for_all_implementations(_srcs src/avx_sorthelper.cpp ONLY AVX AVX2+FMA+BMI2)
elseif(Vc_ARM)
list(APPEND _srcs src/support_dummy.cpp)
else()
list(APPEND _srcs src/support_dummy.cpp)
endif()
add_library(Vc STATIC ${_srcs})
set_property(TARGET Vc APPEND PROPERTY COMPILE_OPTIONS ${libvc_compile_flags})
add_target_property(Vc LABELS "other")
if(XCODE)
# TODO: document what this does and why it has no counterpart in the non-XCODE logic
set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_GCC_INLINES_ARE_PRIVATE_EXTERN "NO")
set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "YES")
set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++0x")
set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++")
elseif(UNIX AND Vc_COMPILER_IS_CLANG)
# On UNIX (Linux) the standard library used by default typically is libstdc++ (GCC).
# To get the full clang experience we would rather build against libc++. This additionally
# requires the libc++abi and libsupc++ libraries in all linker invocations.
option(USE_LIBC++ "Use libc++ instead of the system default C++ standard library." OFF)
if(USE_LIBC++)
AddCompilerFlag(-stdlib=libc++ CXX_FLAGS CMAKE_CXX_FLAGS CXX_RESULT _use_libcxx)
if(_use_libcxx)
find_library(LIBC++ABI c++abi)
mark_as_advanced(LIBC++ABI)
if(LIBC++ABI)
set(CMAKE_REQUIRED_LIBRARIES "${LIBC++ABI};supc++")
CHECK_CXX_SOURCE_COMPILES("#include <stdexcept>
#include <iostream>
void foo() {
std::cout << 'h' << std::flush << std::endl;
throw std::exception();
}
int main() {
try { foo(); }
catch (int) { return 0; }
return 1;
}" libcxx_compiles)
unset(CMAKE_REQUIRED_LIBRARIES)
if(libcxx_compiles)
link_libraries(${LIBC++ABI} supc++)
endif()
endif()
endif()
else()
CHECK_CXX_SOURCE_COMPILES("#include <tuple>
std::tuple<int> f() { std::tuple<int> r; return r; }
int main() { return 0; }
" tuple_sanity)
if (NOT tuple_sanity)
message(FATAL_ERROR "Clang and std::tuple brokenness detected. Please update your compiler.")
endif()
endif()
endif()
add_dependencies(other Vc)
target_include_directories(Vc
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>
$<INSTALL_INTERFACE:include>
)
option(Vc_ENABLE_INSTALL "Whether to install the library." ON)
if (Vc_ENABLE_INSTALL)
install(TARGETS Vc EXPORT VcTargets DESTINATION lib${LIB_SUFFIX})
install(DIRECTORY Vc/ DESTINATION include/Vc FILES_MATCHING REGEX "/*.(h|tcc|def)$")
install(FILES
Vc/Allocator
Vc/IO
Vc/Memory
Vc/SimdArray
Vc/Utils
Vc/Vc
Vc/algorithm
Vc/array
Vc/iterators
Vc/limits
Vc/simdize
Vc/span
Vc/type_traits
Vc/vector
DESTINATION include/Vc)
# Generate and install CMake package and modules
include(CMakePackageConfigHelpers)
set(PACKAGE_INSTALL_DESTINATION
lib${LIB_SUFFIX}/cmake/${PROJECT_NAME}
)
install(EXPORT ${PROJECT_NAME}Targets
NAMESPACE ${PROJECT_NAME}::
DESTINATION ${PACKAGE_INSTALL_DESTINATION}
EXPORT_LINK_INTERFACE_LIBRARIES
)
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}ConfigVersion.cmake
VERSION ${PROJECT_VERSION}
COMPATIBILITY AnyNewerVersion
)
configure_package_config_file(
${PROJECT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}Config.cmake
INSTALL_DESTINATION ${PACKAGE_INSTALL_DESTINATION}
PATH_VARS CMAKE_INSTALL_PREFIX
)
install(FILES
cmake/UserWarning.cmake
cmake/VcMacros.cmake
cmake/AddCompilerFlag.cmake
cmake/CheckCCompilerFlag.cmake
cmake/CheckCXXCompilerFlag.cmake
cmake/OptimizeForArchitecture.cmake
cmake/FindVc.cmake
${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}Config.cmake
${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}ConfigVersion.cmake
DESTINATION ${PACKAGE_INSTALL_DESTINATION}
)
endif()
option(BUILD_TESTING "Build the testing tree." OFF)
include (CTest)
configure_file(${PROJECT_SOURCE_DIR}/CTestCustom.cmake ${PROJECT_BINARY_DIR}/CTestCustom.cmake COPYONLY)
if(BUILD_TESTING)
add_custom_target(build_tests ALL VERBATIM)
add_subdirectory(tests)
endif()
set(BUILD_EXAMPLES FALSE CACHE BOOL "Build examples.")
if(BUILD_EXAMPLES)
add_subdirectory(examples)
endif(BUILD_EXAMPLES)
# Hide Vc_IMPL as it is only meant for users of Vc
mark_as_advanced(Vc_IMPL)
find_program(BIN_CAT cat)
mark_as_advanced(BIN_CAT)
if(BIN_CAT)
file(REMOVE ${PROJECT_BINARY_DIR}/help.txt)
add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/help.txt
COMMAND ${CMAKE_MAKE_PROGRAM} help > ${PROJECT_BINARY_DIR}/help.txt
VERBATIM
)
add_custom_target(cached_help
${BIN_CAT} ${PROJECT_BINARY_DIR}/help.txt
DEPENDS ${PROJECT_BINARY_DIR}/help.txt
VERBATIM
)
endif()

CTestConfig.cmake (new file, 15 lines)
@@ -0,0 +1,15 @@
set(CTEST_PROJECT_NAME "Vc")
set(CTEST_NIGHTLY_START_TIME "00:00:00 CEST")
set(CTEST_DROP_METHOD "http")
set(CTEST_DROP_SITE "cdash.cern.ch")
set(CTEST_DROP_LOCATION "/submit.php?project=Vc")
set(CTEST_DROP_SITE_CDASH TRUE)
set(CTEST_UPDATE_TYPE "git")
find_program(GITCOMMAND git)
set(CTEST_UPDATE_COMMAND "${GITCOMMAND}")
mark_as_advanced(GITCOMMAND)

CTestCustom.cmake (new file, 21 lines)
@@ -0,0 +1,21 @@
set(CTEST_CUSTOM_WARNING_EXCEPTION ${CTEST_CUSTOM_WARNING_EXCEPTION}
" C4723: " # MSVC 2012 can't suppress this warning
" C4756: " # MSVC 2012 can't suppress this warning
"used uninitialized in this function"
"Skipping compilation of tests gatherStruct and gather2dim because of clang bug" # Not a helpful warning for the dashboard
"warning is a GCC extension"
"^-- " # Ignore output from cmake
"AVX disabled per default because of old/broken compiler" # This warning is meant for users not the dashboard
"WARNING non-zero return value in ctest from: make" # Ignore output from ctest
"ipo: warning #11010:" # Ignore warning about incompatible libraries with ICC -m32 on 64-bit system
"include/qt4" # -Wuninitialized in QWeakPointer(X *ptr)
" note: " # Notes are additional lines from errors (or warnings) that we don't want to count as additional warnings
"clang: warning: argument unused during compilation: '-stdlib=libc"
"clang 3.6.x miscompiles AVX code" # a preprocessor warning for users of Vc, irrelevant for the dashboard
)
set(CTEST_CUSTOM_ERROR_EXCEPTION ${CTEST_CUSTOM_ERROR_EXCEPTION}
"^ICECC"
"^make\\[[1-9]\\]: "
"^collect2: ld returned . exit status"
"^make: \\*\\*\\* \\[.*\\] Error ")

INSTALL (new file, 1 line)
@@ -0,0 +1 @@
See README.md.

LICENSE (new file, 23 lines)
@@ -0,0 +1,23 @@
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Makefile (new file, 27 lines)
@@ -0,0 +1,27 @@
CXX ?= c++
build_dir := $(shell which $(CXX))
tmp := "case $$(readlink -f $(build_dir)) in *icecc) which $${ICECC_CXX:-g++};; *) echo $(build_dir);; esac"
build_dir := $(shell sh -c $(tmp))
build_dir := $(realpath $(build_dir))
build_dir := build-$(subst /,-,$(build_dir:/%=%)$(CXXFLAGS))
all:
%:: $(build_dir)/CMakeCache.txt
$(MAKE) --no-print-directory -C "$(build_dir)" $(MAKECMDGOALS)
$(build_dir)/CMakeCache.txt:
@test -n "$(build_dir)"
@mkdir -p "$(build_dir)"
@test -e "$(build_dir)/CMakeCache.txt" || cmake -H. -B"$(build_dir)"
print_build_dir:
@echo "$(PWD)/$(build_dir)"
clean_builddir:
rm -rf "$(build_dir)"
# the following rule works around %:: grabbing the Makefile rule and thus stops it from running every time
Makefile:
@true
.PHONY: print_build_dir clean_builddir

README.md (new file, 194 lines)
@@ -0,0 +1,194 @@
**Vc is now in maintenance mode and no longer actively developed.
However, we continue to review pull requests with bugfixes from the community.**
**You may be interested in switching to [std-simd](https://github.com/VcDevel/std-simd).**
GCC 11 includes an experimental version of `std::simd` as part of libstdc++, which also works with clang.
Features present in Vc 1.4 and not present in *std-simd* will eventually turn into Vc 2.0, which then depends on *std-simd*.
# Vc: portable, zero-overhead C++ types for explicitly data-parallel programming
Recent generations of CPUs, and GPUs in particular, require data-parallel codes
for full efficiency. Data parallelism requires that the same sequence of
operations is applied to different input data. CPUs and GPUs can thus reduce
the necessary hardware for instruction decoding and scheduling in favor of more
arithmetic and logic units, which execute the same instructions synchronously.
On CPU architectures this is implemented via SIMD registers and instructions.
A single SIMD register can store N values and a single SIMD instruction can
execute N operations on those values. On GPU architectures N threads run in
perfect sync, fed by a single instruction decoder/scheduler. Each thread has
local memory and a given index to calculate the offsets in memory for loads and
stores.
Current C++ compilers can do automatic transformation of scalar codes to SIMD
instructions (auto-vectorization). However, the compiler must reconstruct an
intrinsic property of the algorithm that was lost when the developer wrote a
purely scalar implementation in C++. Consequently, C++ compilers cannot
vectorize any given code to its most efficient data-parallel variant.
Especially larger data-parallel loops, spanning over multiple functions or even
translation units, will often not be transformed into efficient SIMD code.
The Vc library provides the missing link. Its types enable explicitly stating
data-parallel operations on multiple values. The parallelism is therefore added
via the type system. Competing approaches state the parallelism via new control
structures and consequently new semantics inside the body of these control
structures.
Vc is a free software library to ease explicit vectorization of C++ code. It
has an intuitive API and provides portability between different compilers and
compiler versions as well as portability between different vector instruction
sets. Thus an application written with Vc can be compiled for:
* AVX and AVX2
* SSE2 up to SSE4.2 or SSE4a
* Scalar
* ~~AVX-512 (Vc 2 development)~~
* ~~NEON (in development)~~
* ~~NVIDIA GPUs / CUDA (research)~~
After Intel dropped MIC support with ICC 18, Vc 1.4 also removed support for it.
## Examples
### Usage on Compiler Explorer
* [Simdize Example](https://godbolt.org/z/JVEM2j)
* [Total momentum and time stepping of `std::vector<Particle>`](https://godbolt.org/z/JNdkL9)
* [Matrix Example](https://godbolt.org/z/fFEkuX): This uses vertical
vectorization which does not scale to different vector sizes. However, the
example is instructive to compare it with similar solutions of other languages
or libraries.
* [N-vortex solver](https://godbolt.org/z/4o1cg_) showing `simdize`d iteration
over many `std::vector<float>`. Note how [important the `-march` flag is, compared
to plain `-mavx2 -mfma`](https://godbolt.org/z/hKiOjr).
### Scalar Product
Let's start from the code for calculating a 3D scalar product using builtin floats:
```cpp
using Vec3D = std::array<float, 3>;
float scalar_product(Vec3D a, Vec3D b) {
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}
```
Using Vc, we can easily vectorize the code using the `float_v` type:
```cpp
using Vc::float_v;
using Vec3D = std::array<float_v, 3>;
float_v scalar_product(Vec3D a, Vec3D b) {
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}
```
The above will scale to 1, 4, 8, 16, etc. scalar products calculated in parallel, depending
on the target hardware's capabilities.
For comparison, the same vectorization using Intel SSE intrinsics is more verbose and uses
prefix notation (i.e. function calls):
```cpp
using Vec3D = std::array<__m128, 3>;
__m128 scalar_product(Vec3D a, Vec3D b) {
return _mm_add_ps(_mm_add_ps(_mm_mul_ps(a[0], b[0]), _mm_mul_ps(a[1], b[1])),
_mm_mul_ps(a[2], b[2]));
}
```
The above will neither scale to AVX, AVX-512, etc. nor is it portable to other SIMD ISAs.
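As a rough idea of how the Vc version is used in practice, here is a self-contained sketch (not taken from the library's examples):
```cpp
#include <Vc/Vc>
#include <Vc/IO>   // stream output for Vc vectors
#include <array>
#include <iostream>

using Vc::float_v;
using Vec3D = std::array<float_v, 3>;

float_v scalar_product(Vec3D a, Vec3D b) {
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}

int main() {
    // Each lane of a float_v holds one point, so a single call computes
    // float_v::size() scalar products in parallel.
    Vec3D a = {float_v(1.f), float_v(2.f), float_v(3.f)};
    Vec3D b = {float_v(4.f), float_v(5.f), float_v(6.f)};
    std::cout << scalar_product(a, b) << '\n';  // prints e.g. [32, 32, 32, 32] with SSE
    return 0;
}
```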
## Build Requirements
cmake >= 3.0
C++11 Compiler:
* GCC >= 4.8.1
* clang >= 3.4
* ICC >= 18.0.5
* Visual Studio 2019 (64-bit target)
## Building and Installing Vc
* Clone Vc and initialize Vc's git submodules:
```sh
git clone https://github.com/VcDevel/Vc.git
cd Vc
git submodule update --init
```
* Create a build directory:
```sh
$ mkdir build
$ cd build
```
* Configure with cmake and add relevant options:
```sh
$ cmake ..
```
Optionally, specify an installation directory:
```sh
$ cmake -DCMAKE_INSTALL_PREFIX=/opt/Vc ..
```
Optionally, include building the unit tests:
```sh
$ cmake -DBUILD_TESTING=ON ..
```
On Windows, if you have multiple versions of Visual Studio installed, you can select one:
```sh
$ cmake -G "Visual Studio 16 2019" ..
```
See `cmake --help` for a list of possible generators.
* Build and install:
```sh
$ cmake --build . -j 16
$ cmake --install . # may require permissions
```
On Windows, you can also open `Vc.sln` in Visual Studio and build/install from the IDE.
## Documentation
The documentation is generated via [doxygen](http://doxygen.org). You can build
the documentation by running `doxygen` in the `doc` subdirectory.
Alternatively, you can find nightly builds of the documentation at:
* [1.4 branch](https://vcdevel.github.io/Vc-1.4/)
* [1.4.3 release](https://vcdevel.github.io/Vc-1.4.3/)
* [1.4.2 release](https://vcdevel.github.io/Vc-1.4.2/)
* [1.4.1 release](https://vcdevel.github.io/Vc-1.4.1/)
* [1.4.0 release](https://vcdevel.github.io/Vc-1.4.0/)
* [1.3 branch](https://vcdevel.github.io/Vc-1.3/)
* [1.3.0 release](https://vcdevel.github.io/Vc-1.3.0/)
* [1.2.0 release](https://vcdevel.github.io/Vc-1.2.0/)
* [1.1.0 release](https://vcdevel.github.io/Vc-1.1.0/)
* [0.7 branch](https://vcdevel.github.io/Vc-0.7/)
## Publications
* [M. Kretz, "Extending C++ for Explicit Data-Parallel Programming via SIMD
Vector Types", Goethe University Frankfurt, Dissertation,
2015.](http://publikationen.ub.uni-frankfurt.de/frontdoor/index/index/docId/38415)
* [M. Kretz and V. Lindenstruth, "Vc: A C++ library for explicit
vectorization", Software: Practice and Experience,
2011.](http://dx.doi.org/10.1002/spe.1149)
* [M. Kretz, "Efficient Use of Multi- and Many-Core Systems with Vectorization
and Multithreading", University of Heidelberg,
2009.](http://code.compeng.uni-frankfurt.de/attachments/13/Diplomarbeit.pdf)
[Work on integrating the functionality of Vc in the C++ standard library.](
https://github.com/VcDevel/Vc/wiki/ISO-Standardization-of-the-Vector-classes)
## License
Vc is released under the terms of the [3-clause BSD license](http://opensource.org/licenses/BSD-3-Clause).

Test_all_compilers.sh (new executable file, 140 lines)
@@ -0,0 +1,140 @@
#!/bin/sh -e
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games"
export LANG="en_US.UTF-8"
export LANGUAGE="en_US.UTF-8"
export LC_CTYPE="en_US.UTF-8"
export LC_NUMERIC="en_US.UTF-8"
export LC_TIME="en_US.UTF-8"
export LC_MESSAGES="en_US.UTF-8"
unset CFLAGS CXXFLAGS
cd "`dirname "$0"`"
test -z "dashboard_model" && export dashboard_model=Experimental
runTest() {
libpath="$LD_LIBRARY_PATH"
test -n "$1" && libpath="$(dirname $(realpath $($CXX $1 -print-file-name=libstdc++.so)))${libpath:+:}${libpath}"
LD_LIBRARY_PATH="$libpath" CFLAGS="$1" CXXFLAGS="$1" ctest -S test.cmake || true
}
tested_compilers="lsakdfjwowleqirjodfisj"
runAllTests() {
# first make sure we don't test a compiler a second time
id="`which $CXX`"
id="`readlink -f $id`"
echo "$id"|grep -qF "$tested_compilers" && return
tested_compilers="$tested_compilers
$id"
# alright run the ctest script
runTest
supports32Bit && runTest -m32 || true
supportsx32 && runTest -mx32 || true
}
supports32Bit() {
test `uname -m` = "x86_64" || return 1
CXX=${CXX:-c++}
cat > /tmp/m32test.cpp <<END
#include <algorithm>
#include <string>
#include <iostream>
#include <cerrno>
void foo(int x) { switch (x) { case 0x0A: break; case 0x0B: break; case 0x0C: break; case 0x0D: break; case 0x0E: break; } }
int main() { std::cout << "Hello World!\n"; return 0; }
END
$CXX -m32 -o /tmp/m32test /tmp/m32test.cpp >/dev/null 2>&1 || return 1
rm /tmp/m32test*
return 0
}
supportsx32() {
test `uname -m` = "x86_64" || return 1
CXX=${CXX:-c++}
cat > /tmp/mx32test.cpp <<END
#include <algorithm>
#include <string>
#include <iostream>
#include <cerrno>
void foo(int x) { switch (x) { case 0x0A: break; case 0x0B: break; case 0x0C: break; case 0x0D: break; case 0x0E: break; } }
int main() { std::cout << "Hello World!\n"; return 0; }
END
$CXX -mx32 -o /tmp/mx32test /tmp/mx32test.cpp >/dev/null 2>&1 || return 1
rm /tmp/mx32test*
return 0
}
system_compilers() {
cxxlist="`find /usr/bin/ /usr/local/bin/ -name '*++-[0-9]*'|grep -v -- -linux-gnu`"
if test -z "$cxxlist"; then
cxxlist="`find /usr/bin/ /usr/local/bin/ -name '*++'|grep -v -- -linux-gnu`"
fi
if test -z "$cxxlist"; then
# default compiler
runAllTests
else
for CXX in $cxxlist; do
CC=`echo "$CXX"|sed 's/clang++/clang/;s/g++/gcc/'`
if test -x "$CC" -a -x "$CXX"; then
export CC
export CXX
runAllTests
fi
done
fi
}
modules_compilers() {
if test -r /etc/profile.d/modules.sh; then
source /etc/profile.d/modules.sh
for mod in `module avail -t 2>&1`; do
case `echo $mod|tr '[:upper:]' '[:lower:]'` in
*intel*|*icc*) export CC=icc CXX=icpc;;
*gnu*|*gcc*) export CC=gcc CXX=g++;;
*llvm*|*clang*) export CC=clang CXX=clang++;;
*) continue;;
esac
module load $mod
runAllTests
module unload $mod
done
fi
}
gccbuild_compilers() {
for VcEnv in `find /opt/ -mindepth 2 -maxdepth 2 -name Vc.env`; do (
. "$VcEnv"
case "$VcEnv" in
*-snapshot/Vc.env)
( cd $HOME/src/gcc-build && ./update.sh "`dirname "$VcEnv"`" )
;;
esac
runAllTests
) done
}
icc_compilers() {
test -d /opt/intel || return
export CC=icc
export CXX=icpc
icclist="`find /opt/intel/compiler* -name 'iccvars.sh' | xargs readlink -e | sort -ur`"
case `uname -m` in
x86_64)
COMPILERVARS_ARCHITECTURE=intel64
;;
i[345678]86)
COMPILERVARS_ARCHITECTURE=ia32
;;
esac
export COMPILERVARS_ARCHITECTURE
test -n "$icclist" && for IccEnv in $icclist; do (
. $IccEnv $COMPILERVARS_ARCHITECTURE
runAllTests
) done
}
system_compilers
modules_compilers
gccbuild_compilers
icc_compilers

Test_vc.sh (new executable file, 22 lines)
@@ -0,0 +1,22 @@
#!/bin/bash
case "$1" in
Experimental|Nightly|Continuous)
export dashboard_model=$1
case "$2" in
None|Debug|Release|RelWithDebug|RelWithDebInfo|MinSizeRel)
export build_type=$2
;;
esac
;;
*)
echo "Usage: $0 <model> [<build type>]"
echo
echo "Possible arguments for model are Nightly, Continuous, or Experimental."
echo "Build type may be one of: None Debug Release RelWithDebug RelWithDebInfo MinSizeRel."
echo
exit 1
;;
esac
ctest -S "`dirname $0`/test.cmake"

Vc/Allocator (new file, 284 lines)
@@ -0,0 +1,284 @@
/* This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_ALLOCATOR_H_
#define VC_ALLOCATOR_H_
#include <new>
#include <cstddef>
#include <cstdlib>
#include <utility>
#include "global.h"
#include "common/macros.h"
/**
* \ingroup Utilities
*
* Convenience macro to set the default allocator for a given \p Type to
* Vc::Allocator.
*
* \param Type Your type that you want to use with STL containers.
*
* \note You have to use this macro in the global namespace.
*/
#ifdef Vc_MSVC
#define Vc_DECLARE_ALLOCATOR(Type) \
namespace std \
{ \
template <> class allocator<Type> : public ::Vc::Allocator<Type> \
{ \
public: \
template <typename U> struct rebind { \
typedef ::std::allocator<U> other; \
}; \
/* MSVC brokenness: the following function is optional - just doesn't compile \
* without it */ \
const allocator &select_on_container_copy_construction() const { return *this; } \
}; \
}
#else
#define Vc_DECLARE_ALLOCATOR(Type) \
namespace std \
{ \
template <> class allocator<Type> : public ::Vc::Allocator<Type> \
{ \
public: \
template <typename U> struct rebind { \
typedef ::std::allocator<U> other; \
}; \
}; \
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
using std::size_t;
using std::ptrdiff_t;
/**
* \headerfile Allocator <Vc/Allocator>
* An allocator that uses global new and supports over-aligned types, as per [C++11 20.6.9].
*
* Meant as a simple replacement for the allocator defined in the C++ Standard.
* Allocation is done using the global new/delete operators. But if the alignment property of \p
* T is larger than the size of a pointer, the allocate function allocates slightly more memory
* to adjust the pointer for correct alignment.
*
* If \p T does not require over-alignment, no additional memory will be allocated.
*
* \tparam T The type of objects to allocate.
*
* Example:
* \code
* struct Data {
* Vc::float_v x, y, z;
* };
*
* void fun()
* {
* std::vector<Data> dat0; // this will use std::allocator<Data>, which probably ignores the
* // alignment requirements for Data. Thus any access to dat0 may
* // crash your program.
*
* std::vector<Data, Vc::Allocator<Data> > dat1; // now std::vector will get correctly aligned
* // memory. Accesses to dat1 are safe.
* ...
* \endcode
*
* %Vc ships a macro to conveniently tell STL to use Vc::Allocator per default for a given type:
* \code
* struct Data {
* Vc::float_v x, y, z;
* };
* Vc_DECLARE_ALLOCATOR(Data)
*
* void fun()
* {
* std::vector<Data> dat0; // good now
* ...
* \endcode
*
* \ingroup Utilities
*/
template<typename T> class Allocator
{
private:
enum Constants {
#ifdef Vc_HAVE_STD_MAX_ALIGN_T
NaturalAlignment = alignof(std::max_align_t),
#elif defined(Vc_HAVE_MAX_ALIGN_T)
NaturalAlignment = alignof(::max_align_t),
#else
NaturalAlignment = sizeof(void *) > alignof(long double) ? sizeof(void *) :
(alignof(long double) > alignof(long long) ? alignof(long double) : alignof(long long)),
#endif
#if defined Vc_IMPL_AVX
SimdAlignment = 32,
#elif defined Vc_IMPL_SSE
SimdAlignment = 16,
#else
SimdAlignment = 1,
#endif
Alignment = alignof(T) > SimdAlignment ? alignof(T) : SimdAlignment,
/* The number of extra bytes allocated must be large enough to put a pointer right
* before the adjusted address. This pointer stores the original address, which is
* required to call ::operator delete in deallocate.
*
* The address we get from ::operator new is a multiple of NaturalAlignment:
* p = N * NaturalAlignment
*
* Since all alignments are powers of two, Alignment is a multiple of NaturalAlignment:
* Alignment = k * NaturalAlignment
*
* two cases:
* 1. If p is already aligned to Alignment then allocate will return p + Alignment. In
* this case there are Alignment Bytes available to store a pointer.
* 2. If p is not aligned then p + (k - (N modulo k)) * NaturalAlignment will be
* returned. Since NaturalAlignment >= sizeof(void*) the pointer fits.
*/
ExtraBytes = Alignment > NaturalAlignment ? Alignment : 0,
AlignmentMask = Alignment - 1
};
public:
typedef size_t size_type;
typedef ptrdiff_t difference_type;
typedef T* pointer;
typedef const T* const_pointer;
typedef T& reference;
typedef const T& const_reference;
typedef T value_type;
template<typename U> struct rebind { typedef Allocator<U> other; };
Allocator() throw() { }
Allocator(const Allocator&) throw() { }
template<typename U> Allocator(const Allocator<U>&) throw() { }
pointer address(reference x) const { return &x; }
const_pointer address(const_reference x) const { return &x; }
pointer allocate(size_type n, const void* = 0)
{
if (n > this->max_size()) {
throw std::bad_alloc();
}
char *p = static_cast<char *>(::operator new(n * sizeof(T) + ExtraBytes));
if (ExtraBytes > 0) {
char *const pp = p;
p += ExtraBytes;
const char *null = 0;
p -= ((p - null) & AlignmentMask); // equivalent to p &= ~AlignmentMask;
reinterpret_cast<char **>(p)[-1] = pp;
}
return reinterpret_cast<pointer>(p);
}
void deallocate(pointer p, size_type)
{
if (ExtraBytes > 0) {
p = reinterpret_cast<pointer *>(p)[-1];
}
::operator delete(p);
}
size_type max_size() const throw() { return size_t(-1) / sizeof(T); }
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const Allocator &select_on_container_copy_construction() const { return *this; }
// MSVC also requires a function that neither C++98 nor C++11 mentions
// but it doesn't support variadic templates... otherwise the Vc_CXX11 clause would be nice
void construct(pointer p) { ::new(p) T(); }
// we still need the C++98 version:
void construct(pointer p, const T& val) { ::new(p) T(val); }
void destroy(pointer p) { p->~T(); }
#else
template<typename U, typename... Args> void construct(U* p, Args&&... args)
{
::new(p) U(std::forward<Args>(args)...);
}
template<typename U> void destroy(U* p) { p->~U(); }
#endif
};
template<typename T> inline bool operator==(const Allocator<T>&, const Allocator<T>&) { return true; }
template<typename T> inline bool operator!=(const Allocator<T>&, const Allocator<T>&) { return false; }
}
#include "vector.h"
namespace std
{
template<typename T, typename Abi>
class allocator<Vc::Vector<T, Abi> > : public ::Vc::Allocator<Vc::Vector<T, Abi> >
{
public:
template<typename U> struct rebind { typedef ::std::allocator<U> other; };
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const allocator &select_on_container_copy_construction() const { return *this; }
#endif
};
template <typename T, typename Abi>
class allocator<Vc::Mask<T, Abi>> : public ::Vc::Allocator<Vc::Mask<T, Abi>>
{
public:
template<typename U> struct rebind { typedef ::std::allocator<U> other; };
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const allocator &select_on_container_copy_construction() const { return *this; }
#endif
};
template <typename T, std::size_t N, typename V, std::size_t M>
class allocator<Vc::SimdArray<T, N, V, M>> : public ::Vc::Allocator<Vc::SimdArray<T, N, V, M>>
{
public:
template<typename U> struct rebind { typedef ::std::allocator<U> other; };
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const allocator &select_on_container_copy_construction() const { return *this; }
#endif
};
template <typename T, std::size_t N, typename V, std::size_t M>
class allocator<Vc::SimdMaskArray<T, N, V, M>> : public ::Vc::Allocator<Vc::SimdMaskArray<T, N, V, M>>
{
public:
template<typename U> struct rebind { typedef ::std::allocator<U> other; };
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const allocator &select_on_container_copy_construction() const { return *this; }
#endif
};
}
#endif // VC_ALLOCATOR_H_
// vim: ft=cpp et sw=4 sts=4

Vc/IO (new file, 268 lines)
@@ -0,0 +1,268 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_IO_
#define VC_IO_
#include "common/types.h"
#include "common/simdarrayfwd.h"
#include "common/memoryfwd.h"
#include <iostream>
#if defined(__GNUC__) && !defined(_WIN32) && defined(_GLIBCXX_OSTREAM)
#define Vc_HACK_OSTREAM_FOR_TTY 1
#endif
#ifdef Vc_HACK_OSTREAM_FOR_TTY
#include <unistd.h>
#include <ext/stdio_sync_filebuf.h>
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
#ifdef Vc_HACK_OSTREAM_FOR_TTY
class hacked_ostream : public std::ostream
{
public:
using std::ostream::_M_streambuf;
};
bool mayUseColor(const std::ostream &os) __attribute__((__const__));
bool mayUseColor(const std::ostream &os)
{
std::basic_streambuf<char> *hack1 =
const_cast<std::basic_streambuf<char> *>(os.*(&hacked_ostream::_M_streambuf));
__gnu_cxx::stdio_sync_filebuf<char> *hack =
dynamic_cast<__gnu_cxx::stdio_sync_filebuf<char> *>(hack1);
if (!hack) {
return false;
}
FILE *file = hack->file();
return 1 == isatty(fileno(file));
}
#else
bool mayUseColor(const std::ostream &) { return false; }
#endif
} // anonymous namespace
namespace AnsiColor
{
struct Type
{
const char *data;
};
static const Type green = {"\033[1;40;32m"};
static const Type yellow = {"\033[1;40;33m"};
static const Type blue = {"\033[1;40;34m"};
static const Type normal = {"\033[0m"};
inline std::ostream &operator<<(std::ostream &out, const Type &c)
{
if (mayUseColor(out)) {
out << c.data;
}
return out;
}
} // namespace AnsiColor
/**
* \ingroup Vectors
* \headerfile IO <Vc/IO>
*
* Prints the contents of a vector into a stream object.
*
* \code
* const Vc::int_v v(Vc::IndexesFromZero);
* std::cout << v << std::endl;
* \endcode
* will output (with SSE):
\verbatim
[0, 1, 2, 3]
\endverbatim
*
* \param out Any standard C++ ostream object. For example std::cout or a
* std::stringstream object.
* \param v Any Vc::Vector object.
* \return The ostream object: to chain multiple stream operations.
*
* \note With the GNU standard library this function will check whether the
* output stream is a tty in which case it colorizes the output.
*/
template <typename T, typename Abi>
inline std::ostream &operator<<(std::ostream &out, const Vc::Vector<T, Abi> &v)
{
using TT = typename std::conditional<std::is_same<T, char>::value ||
std::is_same<T, unsigned char>::value ||
std::is_same<T, signed char>::value,
int,
T>::type;
out << AnsiColor::green << '[';
out << TT(v[0]);
for (size_t i = 1; i < v.Size; ++i) {
out << ", " << TT(v[i]);
}
out << ']' << AnsiColor::normal;
return out;
}
/**
* \ingroup Masks
* \headerfile IO <Vc/IO>
*
* Prints the contents of a mask into a stream object.
*
* \code
* const Vc::short_m m = Vc::short_v::IndexesFromZero() < 3;
* std::cout << m << std::endl;
* \endcode
* will output (with SSE):
\verbatim
m[1110 0000]
\endverbatim
*
* \param out Any standard C++ ostream object. For example std::cout or a
* std::stringstream object.
* \param m Any Vc::Mask object.
* \return The ostream object: to chain multiple stream operations.
*
* \note With the GNU standard library this function will check whether the
* output stream is a tty in which case it colorizes the output.
*/
template <typename T, typename Abi>
inline std::ostream &operator<<(std::ostream &out, const Vc::Mask<T, Abi> &m)
{
out << AnsiColor::blue << "m[";
for (unsigned int i = 0; i < m.Size; ++i) {
if (i > 0 && (i % 4) == 0) {
out << ' ';
}
if (m[i]) {
out << AnsiColor::yellow << '1';
} else {
out << AnsiColor::blue << '0';
}
}
out << AnsiColor::blue << ']' << AnsiColor::normal;
return out;
}
namespace Common
{
#ifdef DOXYGEN
/**
* \ingroup Utilities
* \headerfile dox.h <Vc/IO>
*
* Prints the contents of a Memory object into a stream object.
*
* \code
* Vc::Memory<int_v, 10> m;
* for (int i = 0; i < m.entriesCount(); ++i) {
* m[i] = i;
* }
* std::cout << m << std::endl;
* \endcode
* will output (with SSE):
\verbatim
{[0, 1, 2, 3] [4, 5, 6, 7] [8, 9, 0, 0]}
\endverbatim
*
* \param s Any standard C++ ostream object. For example std::cout or a std::stringstream object.
* \param m Any Vc::Memory object.
* \return The ostream object: to chain multiple stream operations.
*
* \note With the GNU standard library this function will check whether the
* output stream is a tty in which case it colorizes the output.
*
* \warning Please do not forget that printing a large memory object can take a long time.
*/
template<typename V, typename Parent, typename Dimension, typename RM>
inline std::ostream &operator<<(std::ostream &s, const Vc::MemoryBase<V, Parent, Dimension, RM> &m);
#endif
template<typename V, typename Parent, typename RM>
inline std::ostream &operator<<(std::ostream &out, const MemoryBase<V, Parent, 1, RM> &m )
{
out << AnsiColor::blue << '{' << AnsiColor::normal;
for (unsigned int i = 0; i < m.vectorsCount(); ++i) {
out << V(m.vector(i));
}
out << AnsiColor::blue << '}' << AnsiColor::normal;
return out;
}
template<typename V, typename Parent, typename RM>
inline std::ostream &operator<<(std::ostream &out, const MemoryBase<V, Parent, 2, RM> &m )
{
out << AnsiColor::blue << '{' << AnsiColor::normal;
for (size_t i = 0; i < m.rowsCount(); ++i) {
if (i > 0) {
out << "\n ";
}
const size_t vcount = m[i].vectorsCount();
for (size_t j = 0; j < vcount; ++j) {
out << V(m[i].vector(j));
}
}
out << AnsiColor::blue << '}' << AnsiColor::normal;
return out;
}
} // namespace Common
template<typename T, std::size_t N>
inline std::ostream &operator<<(std::ostream &out, const SimdArray<T, N> &v)
{
out << AnsiColor::green << '<' << v[0];
for (size_t i = 1; i < N; ++i) {
if (i % 4 == 0) out << " |";
out << ' ' << v[i];
}
return out << '>' << AnsiColor::normal;
}
template<typename T, std::size_t N>
inline std::ostream &operator<<(std::ostream &out, const SimdMaskArray<T, N> &m)
{
out << AnsiColor::blue << "«";
for (size_t i = 0; i < N; ++i) {
if (i > 0 && (i % 4) == 0) {
out << ' ';
}
if ( m[i] ) {
out << AnsiColor::yellow << '1';
} else {
out << AnsiColor::blue << '0';
}
}
return out << AnsiColor::blue << "»" << AnsiColor::normal;
}
}
#endif // VC_IO_
// vim: ft=cpp foldmethod=marker

Vc/Memory (new file, 43 lines)
@@ -0,0 +1,43 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_MEMORY_
#define VC_MEMORY_
#include "vector.h"
#include "common/memory.h"
#include "common/interleavedmemory.h"
#include "common/make_unique.h"
namespace Vc_VERSIONED_NAMESPACE
{
using Common::make_unique;
}
#endif // VC_MEMORY_
// vim: ft=cpp foldmethod=marker

Vc/SimdArray (new file, 35 lines)
@@ -0,0 +1,35 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_SIMDARRAY_
#define VC_SIMDARRAY_
#include "common/simdarray.h"
#endif // VC_SIMDARRAY_
// vim: ft=cpp foldmethod=marker

44
Vc/Utils Normal file
View File

@ -0,0 +1,44 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_UTILS_
#define VC_UTILS_
#include "global.h"
#ifdef Vc_IMPL_Scalar
# define VECTOR_NAMESPACE Scalar
#else
# define VECTOR_NAMESPACE SSE
#endif
#include "common/deinterleave.h"
#include "common/makeContainer.h"
#endif // VC_UTILS_
// vim: ft=cpp foldmethod=marker

43
Vc/Vc Normal file
View File

@ -0,0 +1,43 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_VC_
#define VC_VC_
#include "vector.h"
#include "IO"
#include "Memory"
#include "Utils"
#include "Allocator"
#include "algorithm"
#include "iterators"
#include "simdize"
#include "array"
#include "span"
#include "vector"
#endif // VC_VC_
// vim: ft=cpp foldmethod=marker

1
Vc/algorithm Normal file
View File

@ -0,0 +1 @@
#include "common/algorithms.h"

315
Vc/array Normal file
View File

@ -0,0 +1,315 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
//===---------------------------- array -----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef VC_INCLUDE_VC_ARRAY_
#define VC_INCLUDE_VC_ARRAY_
#include <type_traits>
#include <utility>
#include <iterator>
#include <algorithm>
#include <stdexcept>
#include "common/subscript.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \ingroup Containers
* This is `std::array` with additional subscript operators supporting gather and scatter operations.
*
* The [std::array](https://en.cppreference.com/w/cpp/container/array) documentation applies.
*
* Gathers from structured data (AoS: arrays of struct) are possible via a special
* subscript operator.
* Example:
* \code
* Vc::array<float, 100> data;
* std::iota(data.begin(), data.end(), 0.f); // fill with values 0, 1, 2, ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v gathered = data[indexes]; // gathered == [0, 1, 2, ...]
* \endcode
*
* This also works for gathers into arrays of structures:
* \code
* struct Point { float x, y, z; };
* Vc::array<Point, 100> points;
* // fill points ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v xs = points[indexes][&Point::x]; // [points[0].x, points[1].x, points[2].x, ...]
* float_v ys = points[indexes][&Point::y]; // [points[0].y, points[1].y, points[2].y, ...]
* float_v zs = points[indexes][&Point::z]; // [points[0].z, points[1].z, points[2].z, ...]
* \endcode
*
* Arrays may also be nested:
* \code
* Vc::array<Vc::array<float, 3>, 100> points;
* // fill points ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v xs = points[indexes][0]; // [points[0][0], points[1][0], points[2][0], ...]
* float_v ys = points[indexes][1]; // [points[0][1], points[1][1], points[2][1], ...]
* float_v zs = points[indexes][2]; // [points[0][2], points[1][2], points[2][2], ...]
* \endcode
*/
template <class T, size_t Size> struct array {
// types:
typedef array self_;
typedef T value_type;
typedef value_type& reference;
typedef const value_type& const_reference;
typedef value_type* iterator;
typedef const value_type* const_iterator;
typedef value_type* pointer;
typedef const value_type* const_pointer;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
typedef std::reverse_iterator<iterator> reverse_iterator;
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
value_type elems_[Size > 0 ? Size : 1];
// No explicit construct/copy/destroy for aggregate type
void fill(const value_type& u_) { std::fill_n(elems_, Size, u_); }
void swap(array& a_) noexcept(noexcept(std::swap(std::declval<T &>(), std::declval<T &>())))
{
std::swap_ranges(elems_, elems_ + Size, a_.elems_);
}
// iterators:
iterator begin() noexcept { return iterator(elems_); }
const_iterator begin() const noexcept { return const_iterator(elems_); }
iterator end() noexcept { return iterator(elems_ + Size); }
const_iterator end() const noexcept { return const_iterator(elems_ + Size); }
reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
const_reverse_iterator rbegin() const noexcept
{
return const_reverse_iterator(end());
}
reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
const_reverse_iterator rend() const noexcept
{
return const_reverse_iterator(begin());
}
const_iterator cbegin() const noexcept { return begin(); }
const_iterator cend() const noexcept { return end(); }
const_reverse_iterator crbegin() const noexcept { return rbegin(); }
const_reverse_iterator crend() const noexcept { return rend(); }
// capacity:
constexpr size_type size() const noexcept { return Size; }
constexpr size_type max_size() const noexcept { return Size; }
constexpr bool empty() const noexcept { return Size == 0; }
// element access:
reference operator[](size_type n_) { return elems_[n_]; }
constexpr const_reference operator[](size_type n_) const { return elems_[n_]; }
/**
* \name Data-Parallel Subscripting for Gather & Scatter
*/
///@{
template <typename I>
Vc_ALWAYS_INLINE auto operator[](I&& arg_)
-> decltype(subscript_operator(*this, std::forward<I>(arg_)))
{
return subscript_operator(*this, std::forward<I>(arg_));
}
template <typename I>
Vc_ALWAYS_INLINE auto operator[](I&& arg_) const
-> decltype(subscript_operator(*this, std::forward<I>(arg_)))
{
return subscript_operator(*this, std::forward<I>(arg_));
}
///@}
reference at(size_type n_);
constexpr const_reference at(size_type n_) const;
reference front() { return elems_[0]; }
constexpr const_reference front() const { return elems_[0]; }
reference back() { return elems_[Size > 0 ? Size - 1 : 0]; }
constexpr const_reference back() const { return elems_[Size > 0 ? Size - 1 : 0]; }
value_type* data() noexcept { return elems_; }
const value_type* data() const noexcept { return elems_; }
};
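// Illustrative usage (not part of the original header): array has no user-provided
// constructors, so it is an aggregate and can be initialized like std::array, e.g.
//   Vc::array<float, 4> a = {{1.f, 2.f, 3.f, 4.f}};
//   a.fill(0.f);          // assign 0 to every element
//   float *p = a.data();  // contiguous storage, as with std::array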
template <class T, size_t Size>
typename array<T, Size>::reference array<T, Size>::at(size_type n_)
{
if (n_ >= Size) {
throw std::out_of_range("array::at");
}
return elems_[n_];
}
template <class T, size_t Size>
constexpr typename array<T, Size>::const_reference array<T, Size>::at(size_type n_) const
{
return n_ >= Size ? (throw std::out_of_range("array::at"), elems_[0]) : elems_[n_];
}
template <class T, size_t Size>
inline bool operator==(const array<T, Size>& x_, const array<T, Size>& y_)
{
return std::equal(x_.elems_, x_.elems_ + Size, y_.elems_);
}
template <class T, size_t Size>
inline bool operator!=(const array<T, Size>& x_, const array<T, Size>& y_)
{
return !(x_ == y_);
}
template <class T, size_t Size>
inline bool operator<(const array<T, Size>& x_, const array<T, Size>& y_)
{
return std::lexicographical_compare(x_.elems_, x_.elems_ + Size, y_.elems_,
y_.elems_ + Size);
}
template <class T, size_t Size>
inline bool operator>(const array<T, Size>& x_, const array<T, Size>& y_)
{
return y_ < x_;
}
template <class T, size_t Size>
inline bool operator<=(const array<T, Size>& x_, const array<T, Size>& y_)
{
return !(y_ < x_);
}
template <class T, size_t Size>
inline bool operator>=(const array<T, Size>& x_, const array<T, Size>& y_)
{
return !(x_ < y_);
}
/**\name non-member begin & end
* Implement the non-member begin & end functions in the %Vc namespace so that ADL works
* and `begin(some_vc_array)` always works.
*/
///@{
template <typename T, std::size_t N>
inline auto begin(array<T, N>& arr) -> decltype(arr.begin())
{
return arr.begin();
}
template <typename T, std::size_t N>
inline auto begin(const array<T, N>& arr) -> decltype(arr.begin())
{
return arr.begin();
}
template <typename T, std::size_t N>
inline auto end(array<T, N>& arr) -> decltype(arr.end())
{
return arr.end();
}
template <typename T, std::size_t N>
inline auto end(const array<T, N>& arr) -> decltype(arr.end())
{
return arr.end();
}
///@}
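// Illustrative usage (not part of the original header): since these overloads live in
// namespace Vc, argument-dependent lookup finds them for unqualified calls, e.g.
//   Vc::array<int, 8> a = {{7, 6, 5, 4, 3, 2, 1, 0}};
//   std::sort(begin(a), end(a));  // unqualified begin/end found via ADL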
namespace Traits
{
template <typename T, std::size_t N>
struct has_no_allocated_data_impl<Vc::array<T, N>> : public std::true_type
{
};
template <typename T, std::size_t N>
struct has_contiguous_storage_impl<Vc::array<T, N>> : public std::true_type
{
};
} // namespace Traits
} // namespace Vc
namespace std
{
template <class T, size_t Size>
inline
#ifdef Vc_MSVC
// MSVC fails to do SFINAE correctly and gets totally confused:
// error C2433: 'type': 'inline' not permitted on data declarations
// error C4430: missing type specifier - int assumed. Note: C++ does not support default-int
// error C2061: syntax error: identifier 'swap'
void
#else
typename enable_if<is_same<void, decltype(swap(declval<T&>(), declval<T&>()))>::value,
void>::type
#endif
swap(Vc::array<T, Size>& x_,
Vc::array<T, Size>& y_) noexcept(noexcept(x_.swap(y_)))
{
x_.swap(y_);
}
template <class T, size_t Size>
class tuple_size<Vc::array<T, Size>> : public integral_constant<size_t, Size>
{
};
template <size_t I, class T, size_t Size> class tuple_element<I, Vc::array<T, Size>>
{
public:
typedef T type;
};
template <size_t I, class T, size_t Size>
inline constexpr typename std::enable_if<(I < Size), T&>::type get(
Vc::array<T, Size>& a_) noexcept
{
return a_.elems_[I];
}
template <size_t I, class T, size_t Size>
inline constexpr typename std::enable_if<(I < Size), const T&>::type get(
const Vc::array<T, Size>& a_) noexcept
{
return a_.elems_[I];
}
template <size_t I, class T, size_t Size>
inline constexpr typename std::enable_if<(I < Size), T&&>::type get(
Vc::array<T, Size>&& a_) noexcept
{
return std::move(a_.elems_[I]);
}
} // namespace std
#endif // VC_INCLUDE_VC_ARRAY_
// vim: ft=cpp foldmethod=marker

58
Vc/avx/README Normal file
View File

@ -0,0 +1,58 @@
###########################################
################# AVX #################
###########################################
1. Floating Point
===========================================
Uses full 256bit vectors for all operations. 128bit vectors are never used.
2. Integer
===========================================
Integer support in AVX is minimal.
The 256bit integer vectors are only intended as a supporting type for float operations.
Any arithmetic, logical, or comparison operation must be implemented using 128bit operations.
int_v/uint_v could be implemented either as 128bit or as 256bit types, i.e. either int_v::Size == 4 or int_v::Size == 8.
2.1. 256bit int vectors
===========================================
2.1.1. Implementation Details:
This requires the SSE operations not to zero the high bits of the registers. Since the YMM registers
are aliased onto the XMM registers, you need to use SSE ops that do not use the VEX prefix (IIUC),
or you have to use two XMM registers most of the time.
Perfect would be the use of
union M256I {
__m256i ymm;
__m128i xmm[2];
};
But as far as I know, with GCC this will result in lots of unnecessary loads and stores. (It seems this is
due to GCC expecting aliasing and thus making sure the modified values are always up to date in memory
- as if the union were declared volatile.)
2.1.2. Upsides:
int_v::Size == float_v::Size
2.1.3. Downsides:
Register pressure is increased.
2.2. 128bit int vectors
===========================================
2.2.1. Implementation Details:
2.2.2. Upsides:
2.2.3. Downsides:
- Use of int_v for float_v operations involving __m256i arguments requires an extra type. This will
be hard to generalize.
2.3. Mixed approach
===========================================
int_v/uint_v are implemented as 256bit while short_v/ushort_v are implemented as 128bit. Thus
int_v::Size == short_v::Size (which is the case on LRBni, too).
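For illustration, a minimal sketch of the "two XMM registers" approach mentioned in 2.1.1,
assuming plain AVX intrinsics only (no AVX2); this snippet is not part of the Vc sources
and the function name is made up (requires <immintrin.h>):
static inline __m256i add_epi32_avx_only(__m256i a, __m256i b)
{
    // operate on the two 128bit halves separately and re-combine them
    const __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(a),
                                     _mm256_castsi256_si128(b));
    const __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1),
                                     _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}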

305
Vc/avx/casts.h Normal file
View File

@ -0,0 +1,305 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_CASTS_H_
#define VC_AVX_CASTS_H_
#include "intrinsics.h"
#include "types.h"
#include "../sse/casts.h"
#include "shuffle.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
namespace Casts
{
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128 v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256 v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R;
// 128 -> 128
template<> Vc_INTRINSIC __m128 avx_cast(__m128 v) { return v; }
template<> Vc_INTRINSIC __m128 avx_cast(__m128i v) { return _mm_castsi128_ps(v); }
template<> Vc_INTRINSIC __m128 avx_cast(__m128d v) { return _mm_castpd_ps(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m128 v) { return _mm_castps_si128(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m128i v) { return v; }
template<> Vc_INTRINSIC __m128i avx_cast(__m128d v) { return _mm_castpd_si128(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128 v) { return _mm_castps_pd(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128i v) { return _mm_castsi128_pd(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128d v) { return v; }
// 128 -> 256
// FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never
// seen the cast not do what I want though: after a VEX-coded SSE instruction the register's
// upper 128bits are zero. Thus using the same register as AVX register will have the upper
// 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory
// + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do
// what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck,
// do we really want to rely on specific compiler behavior here?
template<> Vc_INTRINSIC __m256 avx_cast(__m128 v) { return _mm256_castps128_ps256(v); }
template<> Vc_INTRINSIC __m256 avx_cast(__m128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); }
template<> Vc_INTRINSIC __m256 avx_cast(__m128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128i v) { return _mm256_castsi128_si256(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128d v) { return _mm256_castpd128_pd256(v); }
#if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG
static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); }
static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); }
static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); }
#else
static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); }
static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); }
static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); }
#endif
// 256 -> 128
template<> Vc_INTRINSIC __m128 avx_cast(__m256 v) { return _mm256_castps256_ps128(v); }
template<> Vc_INTRINSIC __m128 avx_cast(__m256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); }
template<> Vc_INTRINSIC __m128 avx_cast(__m256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256i v) { return _mm256_castsi256_si128(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256d v) { return _mm256_castpd256_pd128(v); }
// 256 -> 256
template<> Vc_INTRINSIC __m256 avx_cast(__m256 v) { return v; }
template<> Vc_INTRINSIC __m256 avx_cast(__m256i v) { return _mm256_castsi256_ps(v); }
template<> Vc_INTRINSIC __m256 avx_cast(__m256d v) { return _mm256_castpd_ps(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m256 v) { return _mm256_castps_si256(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m256i v) { return v; }
template<> Vc_INTRINSIC __m256i avx_cast(__m256d v) { return _mm256_castpd_si256(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256 v) { return _mm256_castps_pd(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256i v) { return _mm256_castsi256_pd(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256d v) { return v; }
// simplify splitting 256-bit registers in 128-bit registers
Vc_INTRINSIC Vc_CONST __m128 lo128(__m256 v) { return avx_cast<__m128>(v); }
Vc_INTRINSIC Vc_CONST __m128d lo128(__m256d v) { return avx_cast<__m128d>(v); }
Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v) { return avx_cast<__m128i>(v); }
Vc_INTRINSIC Vc_CONST __m128 hi128(__m256 v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v) { return extract128<1>(v); }
// simplify combining 128-bit registers in 256-bit registers
Vc_INTRINSIC Vc_CONST __m256 concat(__m128 a, __m128 b) { return insert128<1>(avx_cast<__m256 >(a), b); }
Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b) { return insert128<1>(avx_cast<__m256d>(a), b); }
Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b) { return insert128<1>(avx_cast<__m256i>(a), b); }
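// Illustrative round trip (not part of the original header): splitting a 256-bit value
// into its 128-bit halves and concatenating them again reproduces the original value:
//   __m256 v = _mm256_set1_ps(1.f);
//   __m256 w = concat(lo128(v), hi128(v));  // w holds the same lanes as v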
} // namespace Casts
using namespace Casts;
} // namespace AVX
namespace AVX2
{
using namespace AVX::Casts;
} // namespace AVX2
namespace AVX
{
template <typename From, typename To> struct ConvertTag {};
Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , int>) { return _mm256_cvttps_epi32(v); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, int>) { return _mm256_cvttpd_epi32(v); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , int>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepi16_epi32(v);
#else
return AVX::srai_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, int>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepu16_epi32(v);
#else
return AVX::srli_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , uint>) {
using namespace AVX;
return _mm256_castps_si256(_mm256_blendv_ps(
_mm256_castsi256_ps(_mm256_cvttps_epi32(v)),
_mm256_castsi256_ps(add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, set2power31_ps())),
set2power31_epu32())),
cmpge_ps(v, set2power31_ps())));
}
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, uint>) {
using namespace AVX;
return _mm_xor_si128(
_mm256_cvttpd_epi32(_mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u))),
_mm_set2power31_epu32());
}
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , uint>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , uint>) { return v; }
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , uint>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepi16_epi32(v);
#else
return AVX::srai_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, uint>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepu16_epi32(v);
#else
return AVX::srli_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256 convert(__m256 v, ConvertTag<float , float>) { return v; }
Vc_INTRINSIC __m128 convert(__m256d v, ConvertTag<double, float>) { return _mm256_cvtpd_ps(v); }
Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<int , float>) { return _mm256_cvtepi32_ps(v); }
Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<uint , float>) {
// this is complicated because cvtepi32_ps only supports signed input. Thus, all
// input values with the MSB set would produce a negative result. We can reuse the
// cvtepi32_ps instruction if we unset the MSB. But then the rounding results can be
// different. Since float uses 24 bits for the mantissa (effectively), the 9-bit LSB
// determines the rounding direction. (Consider the bits ...8'7654'3210. The bits [0:7]
// need to be dropped and if > 0x80 round up, if < 0x80 round down. If [0:7] == 0x80
// then the rounding direction is determined by bit [8] for round to even. That's why
// the 9th bit is relevant for the rounding decision.)
// If the MSB of the input is set to 0, the cvtepi32_ps instruction makes its rounding
// decision on the lowest 8 bits instead. A second rounding decision is made when
// float(0x8000'0000) is added. This will rarely fix the rounding issue.
//
// Here's what the standard rounding mode expects:
// 0xc0000080 should cvt to 0xc0000000
// 0xc0000081 should cvt to 0xc0000100
// -- should cvt to 0xc0000100
// 0xc000017f should cvt to 0xc0000100
// 0xc0000180 should cvt to 0xc0000200
//
// However: using float(input ^ 0x8000'0000) + float(0x8000'0000) we get:
// 0xc0000081 would cvt to 0xc0000000
// 0xc00000c0 would cvt to 0xc0000000
// 0xc00000c1 would cvt to 0xc0000100
// 0xc000013f would cvt to 0xc0000100
// 0xc0000140 would cvt to 0xc0000200
//
// Solution: float(input & 0x7fff'fe00) + (float(0x8000'0000) + float(input & 0x1ff))
// This ensures the rounding decision is made on the 9-bit LSB when 0x8000'0000 is
// added to the float value of the low 8 bits of the input.
using namespace AVX;
return _mm256_blendv_ps(
_mm256_cvtepi32_ps(v),
_mm256_add_ps(_mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00))),
_mm256_add_ps(set2power31_ps(), _mm256_cvtepi32_ps(and_si256(
v, set1_epi32(0x000001ff))))),
_mm256_castsi256_ps(cmplt_epi32(v, _mm256_setzero_si256())));
}
Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<short , float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag< short, int>())); }
Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<ushort, float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag<ushort, int>())); }
Vc_INTRINSIC __m256d convert(__m128 v, ConvertTag<float , double>) { return _mm256_cvtps_pd(v); }
Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag<double, double>) { return v; }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<int , double>) { return _mm256_cvtepi32_pd(v); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<uint , double>) {
using namespace AVX;
return _mm256_add_pd(
_mm256_cvtepi32_pd(_mm_xor_si128(v, _mm_setmin_epi32())),
set1_pd(1u << 31)); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, SSE::ConvertTag< short, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, SSE::ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , short>) {
#ifdef Vc_IMPL_AVX2
auto a = _mm256_shuffle_epi8(
v, _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
-0x80, -0x80, -0x80, 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
-0x80, -0x80, -0x80, -0x80, -0x80, -0x80));
return lo128(_mm256_permute4x64_epi64(a, 0xf8)); // a[0] a[2] | a[3] a[3]
#else
const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
#endif
}
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , short>) { return convert(v, ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , short>) { return convert(convert(v, ConvertTag<float, int>()), ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, short>) { return convert(convert(v, ConvertTag<double, int>()), SSE::ConvertTag<int, short>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , short>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, short>) { return v; }
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , ushort>) { return convert(convert(v, ConvertTag<float, uint>()), ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, uint>()), SSE::ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , ushort>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, ushort>) { return v; }
template <typename From, typename To>
Vc_INTRINSIC auto convert(
typename std::conditional<(sizeof(From) < sizeof(To)),
typename SSE::VectorTraits<From>::VectorType,
typename AVX::VectorTypeHelper<From>::Type>::type v)
-> decltype(convert(v, ConvertTag<From, To>()))
{
return convert(v, ConvertTag<From, To>());
}
template <typename From, typename To, typename = enable_if<(sizeof(From) < sizeof(To))>>
Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper<From>::Type v)
-> decltype(convert(lo128(v), ConvertTag<From, To>()))
{
return convert(lo128(v), ConvertTag<From, To>());
}
} // namespace AVX
} // namespace Vc
#endif // VC_AVX_CASTS_H_

155
Vc/avx/const.h Normal file
View File

@ -0,0 +1,155 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_CONST_H_
#define VC_AVX_CONST_H_
#include <cstddef>
#include "types.h"
#include "const_data.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template<typename T> struct IndexesFromZeroData;
template<> struct IndexesFromZeroData<int> {
static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast<const int *>(&_IndexesFromZero32[0]); }
};
template<> struct IndexesFromZeroData<unsigned int> {
static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; }
};
template<> struct IndexesFromZeroData<short> {
static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast<const short *>(&_IndexesFromZero16[0]); }
};
template<> struct IndexesFromZeroData<unsigned short> {
static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; }
};
template<> struct IndexesFromZeroData<signed char> {
static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast<const signed char *>(&_IndexesFromZero8[0]); }
};
template<> struct IndexesFromZeroData<char> {
static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast<const char *>(&_IndexesFromZero8[0]); }
};
template<> struct IndexesFromZeroData<unsigned char> {
static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; }
};
template<typename _T> struct Const
{
typedef Vector<_T> V;
typedef typename V::EntryType T;
typedef typename V::Mask M;
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return V(c_trig<T>::data[0]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return V(c_trig<T>::data[1]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return V(c_trig<T>::data[2]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return V(c_trig<T>::data[3]); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return V(c_trig<T>::data[4]); }
static Vc_ALWAYS_INLINE Vc_CONST V _16() { return V(c_trig<T>::data[5]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return V(c_trig<T>::data[(12 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return V(c_trig<T>::data[(17 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return V(c_trig<T>::data[22]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return V(c_trig<T>::data[23]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return V(c_trig<T>::data[24]); }
static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig<T>::data[8]); }
static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return V(c_trig<T>::data[9]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return V(c_trig<T>::data[10]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return V(c_trig<T>::data[11]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return V(c_trig<T>::data[(28 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig<T>::data[(33 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig<T>::data[(37 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig<T>::data[(43 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig<T>::data[25]); }
static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig<T>::data[26]); }
static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log<T>::d(1)).data()); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return V(c_log<T>::d(18)); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return V(c_log<T>::d(15)); }
static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return V(c_log<T>::d(2 + i)); }
static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return V(c_log<T>::d(8 + i)); }
static Vc_ALWAYS_INLINE Vc_CONST V min() { return V(c_log<T>::d(14)); }
static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return V(c_log<T>::d(17)); }
static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return V(c_log<T>::d(16)); }
static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return V(c_log<T>::d(13)); }
static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return V(c_log<T>::d(19)); }
static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return V(c_log<T>::d(20)); }
static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R;
static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R;
};
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask()
{
return _mm256_broadcast_ss(
reinterpret_cast<const float *>(&c_general::highMaskFloat));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask()
{
return _mm256_broadcast_sd(
reinterpret_cast<const double *>(&c_general::highMaskDouble));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask(int bits)
{
#ifdef Vc_IMPL_AVX2
#if defined Vc_ICC || defined Vc_MSVC
__m256i allone = _mm256_set1_epi64x(~0);
#else
auto allone = ~__m256i();
#endif
return _mm256_castsi256_ps(_mm256_slli_epi32(allone, bits));
#else
__m128 tmp = _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits));
return concat(tmp, tmp);
#endif
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask(int bits)
{
#ifdef Vc_IMPL_AVX2
#if defined Vc_ICC || defined Vc_MSVC
__m256i allone = _mm256_set1_epi64x(~0);
#else
auto allone = ~__m256i();
#endif
return _mm256_castsi256_pd(_mm256_slli_epi64(allone, bits));
#else
__m128d tmp = _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits));
return concat(tmp, tmp);
#endif
}
} // namespace AVX
namespace AVX2
{
using AVX::IndexesFromZeroData;
using AVX::Const;
} // namespace AVX2
} // namespace Vc
#endif // VC_AVX_CONST_H_

100
Vc/avx/const_data.h Normal file
View File

@ -0,0 +1,100 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_CONST_DATA_H_
#define VC_AVX_CONST_DATA_H_
#include "../common/data.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
alignas(64) extern const unsigned int _IndexesFromZero32[ 8];
alignas(16) extern const unsigned short _IndexesFromZero16[16];
alignas(16) extern const unsigned char _IndexesFromZero8 [32];
struct alignas(64) c_general
{
static const float oneFloat;
static const unsigned int absMaskFloat[2];
static const unsigned int signMaskFloat[2];
static const unsigned int highMaskFloat;
static const unsigned short minShort[2];
static const unsigned short one16[2];
static const float _2power31;
static const double oneDouble;
static const unsigned long long frexpMask;
static const unsigned long long highMaskDouble;
};
template<typename T> struct c_trig
{
alignas(64) static const T data[];
};
#ifndef Vc_MSVC
template <> alignas(64) const float c_trig<float>::data[];
template <> alignas(64) const double c_trig<double>::data[];
#endif
template<typename T> struct c_log
{
typedef float floatAlias Vc_MAY_ALIAS;
static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast<const floatAlias *>(&data[i]); }
alignas(64) static const unsigned int data[21];
};
#ifndef Vc_MSVC
template<> alignas(64) const unsigned int c_log<float>::data[21];
#endif
template<> struct c_log<double>
{
enum VectorSize { Size = 16 / sizeof(double) };
typedef double doubleAlias Vc_MAY_ALIAS;
static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast<const doubleAlias *>(&data[i]); }
alignas(64) static const unsigned long long data[21];
};
} // namespace AVX
} // namespace Vc
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX2
{
using AVX::_IndexesFromZero8;
using AVX::_IndexesFromZero16;
using AVX::_IndexesFromZero32;
using AVX::c_general;
using AVX::c_trig;
using AVX::c_log;
} // namespace AVX2
} // namespace Vc
#endif // VC_AVX_CONST_DATA_H_

124
Vc/avx/debug.h Normal file
View File

@ -0,0 +1,124 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_DEBUG_H_
#define VC_AVX_DEBUG_H_
#ifndef NDEBUG
#include "vector.h"
#include <iostream>
#include <iomanip>
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template <typename T, typename U> struct AddType {
const U &d;
};
template <typename T, typename U> AddType<T, U> addType(const U &x) { return {x}; }
#ifdef NDEBUG
class DebugStream
{
public:
DebugStream(const char *, const char *, int) {}
template<typename T> inline DebugStream &operator<<(const T &) { return *this; }
};
#else
class DebugStream
{
private:
template<typename T, typename V> static void printVector(V _x)
{
enum { Size = sizeof(V) / sizeof(T) };
union { V v; T m[Size]; } x = { _x };
std::cerr << '[' << std::setprecision(24) << x.m[0];
for (int i = 1; i < Size; ++i) {
std::cerr << ", " << std::setprecision(24) << x.m[i];
}
std::cerr << ']';
}
public:
DebugStream(const char *func, const char *file, int line)
{
std::cerr << "\033[1;40;33mDEBUG: " << file << ':' << line << ' ' << func << ' ';
}
template<typename T> DebugStream &operator<<(const T &x) { std::cerr << x; return *this; }
template <typename T, typename U> DebugStream &operator<<(AddType<T, U> &&x)
{
printVector<T, U>(x.d);
return *this;
}
DebugStream &operator<<(__m128 x) {
printVector<float, __m128>(x);
return *this;
}
DebugStream &operator<<(__m256 x) {
printVector<float, __m256>(x);
return *this;
}
DebugStream &operator<<(__m128d x) {
printVector<double, __m128d>(x);
return *this;
}
DebugStream &operator<<(__m256d x) {
printVector<double, __m256d>(x);
return *this;
}
DebugStream &operator<<(__m128i x) {
printVector<unsigned int, __m128i>(x);
return *this;
}
DebugStream &operator<<(__m256i x) {
printVector<unsigned int, __m256i>(x);
return *this;
}
~DebugStream()
{
std::cerr << "\033[0m" << std::endl;
}
};
#endif
#ifdef Vc_DEBUG
#undef Vc_DEBUG
#endif
#ifdef Vc_MSVC
#define Vc_DEBUG Vc::AVX::DebugStream(__FUNCSIG__, __FILE__, __LINE__)
#else
#define Vc_DEBUG Vc::AVX::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__)
#endif
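// Illustrative usage (not part of the original header, assuming NDEBUG is not defined):
// Vc_DEBUG streams its arguments to std::cerr and expands SIMD register arguments
// element-wise via printVector, e.g.
//   Vc_DEBUG << "v = " << v;          // v: any streamable value or a __m128/__m256 type
//   Vc_DEBUG << addType<short>(vi);   // print a __m256i interpreted as shorts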
} // namespace AVX
} // namespace Vc
#endif // VC_AVX_DEBUG_H_

290
Vc/avx/deinterleave.tcc Normal file
View File

@ -0,0 +1,290 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX2
{
inline void deinterleave(double_v &Vc_RESTRICT a, double_v &Vc_RESTRICT b, double_v &Vc_RESTRICT c)
{ // estimated latency (AVX): 4.5 cycles
const m256d tmp0 = Mem::shuffle128<X0, Y1>(a.data(), b.data());
const m256d tmp1 = Mem::shuffle128<X1, Y0>(a.data(), c.data());
const m256d tmp2 = Mem::shuffle128<X0, Y1>(b.data(), c.data());
a.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp0, tmp1);
b.data() = Mem::shuffle<X1, Y0, X3, Y2>(tmp0, tmp2);
c.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp1, tmp2);
}
inline void deinterleave(float_v &Vc_RESTRICT a, float_v &Vc_RESTRICT b, float_v &Vc_RESTRICT c)
{
// abc abc abc
// a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121
// b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211
// c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112
const m256 ac0 = Mem::shuffle128<X0, Y0>(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6
const m256 ac1 = Mem::shuffle128<X1, Y1>(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7
m256 tmp0 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>( ac0, b.data());
tmp0 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5
m256 tmp1 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>( ac0, b.data());
tmp1 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6
m256 tmp2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>( ac0, b.data());
tmp2 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7
a.data() = Mem::permute<X0, X3, X2, X1>(tmp0);
b.data() = Mem::permute<X1, X0, X3, X2>(tmp1);
c.data() = Mem::permute<X2, X1, X0, X3>(tmp2);
}
inline void deinterleave(int_v &Vc_RESTRICT a, int_v &Vc_RESTRICT b, int_v &Vc_RESTRICT c)
{
deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
reinterpret_cast<float_v &>(c));
}
inline void deinterleave(uint_v &Vc_RESTRICT a, uint_v &Vc_RESTRICT b, uint_v &Vc_RESTRICT c)
{
deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
reinterpret_cast<float_v &>(c));
}
inline void deinterleave(Vector<short> &Vc_RESTRICT , Vector<short> &Vc_RESTRICT ,
Vector<short> &Vc_RESTRICT )
{
return;
/* TODO:
// abc abc abc
// a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121
// b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211
// c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112
m128i ac0 = _mm_unpacklo_epi64(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6
m128i ac1 = _mm_unpackhi_epi64(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7
m128i tmp0 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>( ac0, b.data());
tmp0 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5
m128i tmp1 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>( ac0, b.data());
tmp1 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6
m128i tmp2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>( ac0, b.data());
tmp2 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7
a.data() = Mem::permuteHi<X4, X7, X6, X5>(Mem::permuteLo<X0, X3, X2, X1>(tmp0));
b.data() = Mem::permuteHi<X5, X4, X7, X6>(Mem::permuteLo<X1, X0, X3, X2>(tmp1));
c.data() = Mem::permuteHi<X6, X5, X4, X7>(Mem::permuteLo<X2, X1, X0, X3>(tmp2));
*/
}
inline void deinterleave(Vector<unsigned short> &Vc_RESTRICT a, Vector<unsigned short> &Vc_RESTRICT b,
Vector<unsigned short> &Vc_RESTRICT c)
{
deinterleave(reinterpret_cast<Vector<short> &>(a), reinterpret_cast<Vector<short> &>(b),
reinterpret_cast<Vector<short> &>(c));
}
inline void deinterleave(Vector<float> &a, Vector<float> &b)
{
// a7 a6 a5 a4 a3 a2 a1 a0
// b7 b6 b5 b4 b3 b2 b1 b0
const m256 tmp0 = Reg::permute128<Y0, X0>(a.data(), b.data()); // b3 b2 b1 b0 a3 a2 a1 a0
const m256 tmp1 = Reg::permute128<Y1, X1>(a.data(), b.data()); // b7 b6 b5 b4 a7 a6 a5 a4
const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0
const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2
a.data() = _mm256_unpacklo_ps(tmp2, tmp3); // b6 b4 b2 b0 a6 a4 a2 a0
b.data() = _mm256_unpackhi_ps(tmp2, tmp3); // b7 b5 b3 b1 a7 a5 a3 a1
}
inline void deinterleave(Vector<short> &a, // a0 b0 a1 b1 a2 b2 a3 b3 | a4 b4 a5 ...
Vector<short> &b) // a8 b8 a9 ...
{
auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ...
auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ...
v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ...
v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ...
a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ...
b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ...
}
inline void deinterleave(Vector<ushort> &a, Vector<ushort> &b)
{
auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ...
auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ...
v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ...
v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ...
a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ...
b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ...
}
} // namespace AVX2
namespace Detail
{
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const float *m, Flags align)
{
a.load(m, align);
b.load(m + AVX2::float_v::Size, align);
Vc::AVX2::deinterleave(a, b);
}
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const short *m, Flags f)
{
using namespace Vc::AVX2;
const auto tmp = Detail::load32(m, f);
a.data() =
_mm256_cvtepi32_ps(concat(_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
_mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)));
b.data() = _mm256_cvtepi32_ps(
concat(_mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)));
}
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const unsigned short *m, Flags f)
{
using namespace Vc::AVX2;
const auto tmp = Detail::load32(m, f);
a.data() = _mm256_cvtepi32_ps(
concat(_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa),
_mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa)));
b.data() = _mm256_cvtepi32_ps(
concat(_mm_srli_epi32(lo128(tmp), 16), _mm_srli_epi32(hi128(tmp), 16)));
}
template <typename Flags>
inline void deinterleave(AVX2::double_v &a, AVX2::double_v &b, const double *m, Flags align)
{
using namespace Vc::AVX2;
a.load(m, align);
b.load(m + AVX2::double_v::Size, align);
m256d tmp0 = Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()); // b1 b0 a1 a0
m256d tmp1 = Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()); // b3 b2 a3 a2
a.data() = _mm256_unpacklo_pd(tmp0, tmp1); // b2 b0 a2 a0
b.data() = _mm256_unpackhi_pd(tmp0, tmp1); // b3 b1 a3 a1
}
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const int *m, Flags align)
{
using namespace AVX;
a.load(m, align);
b.load(m + AVX2::int_v::Size, align);
const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0
const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2
a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0
b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1
}
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const short *m, Flags f)
{
using namespace Vc::AVX;
const AVX2::short_v tmp0(m, f);
const m256i tmp = tmp0.data();
a.data() = concat(
_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
_mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
b.data() = concat(
_mm_srai_epi32(lo128(tmp), 16),
_mm_srai_epi32(hi128(tmp), 16));
}
template <typename Flags>
inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned int *m, Flags align)
{
using namespace AVX;
a.load(m, align);
b.load(m + AVX2::uint_v::Size, align);
const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0
const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2
a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0
b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1
}
template <typename Flags>
inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned short *m, Flags f)
{
using namespace Vc::AVX;
const AVX2::ushort_v tmp0(m, f);
const m256i tmp = tmp0.data();
a.data() = concat(
_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa),  // even (a) elements, zero-extended
_mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa));
b.data() = concat(
_mm_srli_epi32(lo128(tmp), 16),                           // odd (b) elements, zero-extended via logical shift
_mm_srli_epi32(hi128(tmp), 16));
}
template <typename Flags>
inline void deinterleave(AVX2::short_v &a, AVX2::short_v &b, const short *m, Flags align)
{
a.load(m, align);
b.load(m + AVX2::short_v::Size, align);
Vc::AVX2::deinterleave(a, b);
}
template <typename Flags>
inline void deinterleave(AVX2::ushort_v &a, AVX2::ushort_v &b, const unsigned short *m, Flags align)
{
a.load(m, align);
b.load(m + AVX2::ushort_v::Size, align);
Vc::AVX2::deinterleave(a, b);
}
// only support M == V::EntryType -> no specialization
template <typename T, typename M, typename Flags>
Vc_ALWAYS_INLINE void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
const M *Vc_RESTRICT memory, Flags align)
{
using V = AVX2::Vector<T>;
a.load(&memory[0 * V::Size], align);
b.load(&memory[1 * V::Size], align);
c.load(&memory[2 * V::Size], align);
Vc::AVX2::deinterleave(a, b, c);
}
} // namespace Detail
} // namespace Vc
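As a standalone illustration of the float deinterleave pattern used above (a sketch, not part of this diff; the function name and the unaligned loads are assumptions), the same 128-bit lane shuffle followed by two unpack rounds can be written directly with AVX intrinsics:

#include <immintrin.h>

// Deinterleave 8 interleaved {a, b} float pairs (a0 b0 a1 b1 ...) into two
// __m256 registers, mirroring the shuffle128 + unpacklo/unpackhi steps above.
static void deinterleave_pairs_ps(const float *m, __m256 &a, __m256 &b)
{
    const __m256 lo = _mm256_loadu_ps(m);     // a0 b0 a1 b1 a2 b2 a3 b3
    const __m256 hi = _mm256_loadu_ps(m + 8); // a4 b4 a5 b5 a6 b6 a7 b7
    const __m256 t0 = _mm256_permute2f128_ps(lo, hi, 0x20); // a0 b0 a1 b1 | a4 b4 a5 b5
    const __m256 t1 = _mm256_permute2f128_ps(lo, hi, 0x31); // a2 b2 a3 b3 | a6 b6 a7 b7
    const __m256 t2 = _mm256_unpacklo_ps(t0, t1); // a0 a2 b0 b2 | a4 a6 b4 b6
    const __m256 t3 = _mm256_unpackhi_ps(t0, t1); // a1 a3 b1 b3 | a5 a7 b5 b7
    a = _mm256_unpacklo_ps(t2, t3); // a0 a1 a2 a3 | a4 a5 a6 a7
    b = _mm256_unpackhi_ps(t2, t3); // b0 b1 b2 b3 | b4 b5 b6 b7
}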

2303
Vc/avx/detail.h Normal file

File diff suppressed because it is too large.

119
Vc/avx/helperimpl.h Normal file

@ -0,0 +1,119 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_HELPERIMPL_H_
#define VC_AVX_HELPERIMPL_H_
#include "../sse/helperimpl.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const float *, A);
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const ushort *, A);
template <typename A>
inline void deinterleave(AVX2::double_v &, AVX2::double_v &, const double *, A);
template <typename A>
inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const int *, A);
template <typename A>
inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const uint *, A);
template <typename A>
inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const ushort *, A);
template <typename A>
inline void deinterleave(AVX2::short_v &, AVX2::short_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::ushort_v &, AVX2::ushort_v &, const ushort *, A);
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
const M *Vc_RESTRICT memory,
A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
AVX2::Vector<T> &Vc_RESTRICT d,
const M *Vc_RESTRICT memory,
A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
AVX2::Vector<T> &Vc_RESTRICT d,
AVX2::Vector<T> &Vc_RESTRICT e,
const M *Vc_RESTRICT memory,
A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(
AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(
AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
AVX2::Vector<T> &Vc_RESTRICT g, AVX2::Vector<T> &Vc_RESTRICT h,
const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Avx)
{
prefetchForOneRead(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Avx)
{
prefetchForModify(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Avx)
{
prefetchClose(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Avx)
{
prefetchMid(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Avx)
{
prefetchFar(addr, VectorAbi::Sse());
}
} // namespace Detail
} // namespace Vc
#include "deinterleave.tcc"
#endif // VC_AVX_HELPERIMPL_H_
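The prefetch overloads above simply forward the AVX ABI tag to the SSE implementation. As a rough standalone sketch of what such locality hints typically reduce to (the exact hint mapping used by Vc's SSE layer is not shown in this diff, so the choices below are assumptions):

#include <xmmintrin.h>

// Assumed mapping of the close/mid/far/one-read hints onto _mm_prefetch locality hints.
static inline void prefetch_close(const void *addr)    { _mm_prefetch(static_cast<const char *>(addr), _MM_HINT_T0); }
static inline void prefetch_mid(const void *addr)      { _mm_prefetch(static_cast<const char *>(addr), _MM_HINT_T1); }
static inline void prefetch_far(const void *addr)      { _mm_prefetch(static_cast<const char *>(addr), _MM_HINT_T2); }
static inline void prefetch_one_read(const void *addr) { _mm_prefetch(static_cast<const char *>(addr), _MM_HINT_NTA); }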

670
Vc/avx/intrinsics.h Normal file

@ -0,0 +1,670 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_INTRINSICS_H_
#define VC_AVX_INTRINSICS_H_
#include "../global.h"
#include "../traits/type_traits.h"
// see comment in sse/intrinsics.h
extern "C" {
// AVX
#include <immintrin.h>
#if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC)
#include <x86intrin.h>
#endif
}
#include "../common/fix_clang_emmintrin.h"
#include "const_data.h"
#include "../common/types.h"
#include "macros.h"
#include <cstdlib>
#if (defined Vc_CLANG && Vc_CLANG >= 0x30900 && Vc_CLANG < 0x70000)
#ifdef _mm256_permute2f128_si256
#undef _mm256_permute2f128_si256
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (char)(M)); })
#endif
#ifdef _mm256_permute2f128_ps
#undef _mm256_permute2f128_ps
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (char)(M)); })
#endif
#ifdef _mm256_permute2x128_si256
#undef _mm256_permute2x128_si256
#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); })
#endif
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace AvxIntrinsics
{
using AVX::c_general;
using AVX::_IndexesFromZero32;
using AVX::_IndexesFromZero16;
using AVX::_IndexesFromZero8;
typedef __m128 m128 ;
typedef __m128d m128d;
typedef __m128i m128i;
typedef __m256 m256 ;
typedef __m256d m256d;
typedef __m256i m256i;
#ifdef Vc_GCC
// Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
// functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) * static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) + static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) - static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); }
#endif
static Vc_INTRINSIC m256d Vc_CONST set1_pd (double a) { return _mm256_set1_pd (a); }
static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a) { return _mm256_set1_epi32(a); }
static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet))); }
static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m256 setallone_ps() { return _mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi8 () { return _mm256_set1_epi8(1); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu8 () { return setone_epi8(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::one16))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu16() { return setone_epi16(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu32() { return setone_epi32(); }
static Vc_INTRINSIC m256 Vc_CONST setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); }
static Vc_INTRINSIC m256d Vc_CONST setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); }
static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::absMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::absMaskFloat[1])); }
static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::signMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1])); }
static Vc_INTRINSIC m256 Vc_CONST set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m128 Vc_CONST _mm_set2power31_ps() { return _mm_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
template <int i>
static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x)
{
return _mm_extract_epi32(x, i);
}
template <int offset> Vc_INTRINSIC __m256 insert128(__m256 a, __m128 b) { return _mm256_insertf128_ps(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) {
#ifdef Vc_IMPL_AVX2
return _mm256_inserti128_si256(a, b, offset);
#else
return _mm256_insertf128_si256(a, b, offset);
#endif
}
template <int offset> Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); }
template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
#ifdef Vc_IMPL_AVX2
return _mm256_extracti128_si256(a, offset);
#else
return _mm256_extractf128_si256(a, offset);
#endif
}
/////////////////////// COMPARE OPS ///////////////////////
#ifdef Vc_GCC
// GCC needs builtin compare operators to enable constant folding
Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a == b); }
Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a != b); }
Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a < b); }
Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a >= b); }
Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a <= b); }
Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a > b); }
Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a == b); }
Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a != b); }
Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a < b); }
Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a >= b); }
Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a <= b); }
Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a > b); }
#else
Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
#endif
Vc_INTRINSIC __m256d cmpnlt_pd (__m256d a, __m256d b) { return cmpge_pd(a, b); }
Vc_INTRINSIC __m256d cmpnle_pd (__m256d a, __m256d b) { return cmpgt_pd(a, b); }
Vc_INTRINSIC __m256 cmpnlt_ps (__m256 a, __m256 b) { return cmpge_ps(a, b); }
Vc_INTRINSIC __m256 cmpnle_ps (__m256 a, __m256 b) { return cmpgt_ps(a, b); }
Vc_INTRINSIC __m256d cmpord_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); }
Vc_INTRINSIC __m256d cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); }
Vc_INTRINSIC __m256 cmpord_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); }
Vc_INTRINSIC __m256 cmpunord_ps(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); }
#if defined(Vc_IMPL_XOP)
static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
return _mm_comlt_epu16(a, b);
}
static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
return _mm_comgt_epu16(a, b);
}
#else
static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
}
static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
}
#endif
#ifdef Vc_IMPL_AVX2
template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
{
return _mm256_alignr_epi8(s1, s2, shift);
}
#else
template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
{
return insert128<1>(
_mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1),
_mm256_castsi256_si128(s2), shift)),
_mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
}
#endif
#ifdef Vc_IMPL_AVX2
#define Vc_AVX_TO_SSE_2_NEW(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
{ \
return _mm256_##name(a0, b0); \
}
#define Vc_AVX_TO_SSE_256_128(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
{ \
return _mm256_##name(a0, b0); \
}
#define Vc_AVX_TO_SSE_1i(name) \
template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
{ \
return _mm256_##name(a0, i); \
}
#define Vc_AVX_TO_SSE_1(name) \
Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); }
#define Vc_AVX_TO_SSE_1_128(name, shift__) \
Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); }
#else
/**\internal
 * Defines the function \p name, which takes two __m256i arguments and calls `_mm_##name` on the
 * low and high 128-bit halves of the arguments.
*
* In case the AVX2 intrinsics are enabled, the arguments are directly passed to a single
* `_mm256_##name` call.
*/
#define Vc_AVX_TO_SSE_1(name) \
Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \
{ \
__m128i a1 = extract128<1>(a0); \
__m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \
__m128i r1 = _mm_##name(a1); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_1_128(name, shift__) \
Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) \
{ \
__m128i r0 = _mm_##name(a0); \
__m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__)); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_2_NEW(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i b1 = extract128<1>(b0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
m128i r1 = _mm_##name(a1, b1); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_256_128(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \
m128i r1 = _mm_##name(a1, b0); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_1i(name) \
template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \
m128i r1 = _mm_##name(a1, i); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#endif
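// Illustration (not part of this header): what one expansion of
// Vc_AVX_TO_SSE_2_NEW produces on the AVX-only path, e.g. for add_epi16. The
// 256-bit operands are split into 128-bit halves, processed with the SSE
// intrinsic, and reassembled with insert128. The "_illustration" name is made
// up to avoid clashing with the real expansion below.
Vc_INTRINSIC Vc_CONST m256i add_epi16_illustration(__m256i a0, __m256i b0)
{
    m128i a1 = extract128<1>(a0);
    m128i b1 = extract128<1>(b0);
    m128i r0 = _mm_add_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0));
    m128i r1 = _mm_add_epi16(a1, b1);
    return insert128<1>(_mm256_castsi128_si256(r0), r1);
}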
Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); }
Vc_AVX_TO_SSE_1i(slli_epi16)
Vc_AVX_TO_SSE_1i(slli_epi32)
Vc_AVX_TO_SSE_1i(slli_epi64)
Vc_AVX_TO_SSE_1i(srai_epi16)
Vc_AVX_TO_SSE_1i(srai_epi32)
Vc_AVX_TO_SSE_1i(srli_epi16)
Vc_AVX_TO_SSE_1i(srli_epi32)
Vc_AVX_TO_SSE_1i(srli_epi64)
Vc_AVX_TO_SSE_256_128(sll_epi16)
Vc_AVX_TO_SSE_256_128(sll_epi32)
Vc_AVX_TO_SSE_256_128(sll_epi64)
Vc_AVX_TO_SSE_256_128(srl_epi16)
Vc_AVX_TO_SSE_256_128(srl_epi32)
Vc_AVX_TO_SSE_256_128(srl_epi64)
Vc_AVX_TO_SSE_256_128(sra_epi16)
Vc_AVX_TO_SSE_256_128(sra_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi32)
Vc_AVX_TO_SSE_2_NEW(add_epi64)
Vc_AVX_TO_SSE_2_NEW(sub_epi16)
Vc_AVX_TO_SSE_2_NEW(sub_epi32)
Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi32)
Vc_AVX_TO_SSE_2_NEW(min_epi8)
Vc_AVX_TO_SSE_2_NEW(max_epi8)
Vc_AVX_TO_SSE_2_NEW(min_epu16)
Vc_AVX_TO_SSE_2_NEW(max_epu16)
Vc_AVX_TO_SSE_2_NEW(min_epi32)
Vc_AVX_TO_SSE_2_NEW(max_epi32)
Vc_AVX_TO_SSE_2_NEW(min_epu32)
Vc_AVX_TO_SSE_2_NEW(max_epu32)
Vc_AVX_TO_SSE_2_NEW(mullo_epi32)
Vc_AVX_TO_SSE_1(abs_epi8)
Vc_AVX_TO_SSE_1(abs_epi16)
Vc_AVX_TO_SSE_1(abs_epi32)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2)
Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8)
Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4)
Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2)
Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8)
Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4)
Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8)
#ifndef Vc_IMPL_AVX2
/////////////////////////////////////////////////////////////////////////
// implementation of the intrinsics missing in AVX
/////////////////////////////////////////////////////////////////////////
static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
m128i a1 = extract128<1>(a0);
return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
}
template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0)
{
m128i a1 = extract128<1>(a0);
m128i b1 = extract128<1>(b0);
m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
m128i r1 = _mm_blend_epi16(a1, b1, m >> 8);
return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) {
m128i a1 = extract128<1>(a0);
m128i b1 = extract128<1>(b0);
m128i m1 = extract128<1>(m0);
m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
m128i r1 = _mm_blendv_epi8(a1, b1, m1);
return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
// mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
#else // Vc_IMPL_AVX2
static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); }
/////////////////////////////////////////////////////////////////////////
// implementation of the intrinsics missing in AVX2
/////////////////////////////////////////////////////////////////////////
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0)
{
return _mm256_blendv_epi8(a0, b0, m0);
}
Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
return _mm256_movemask_epi8(a0);
}
#endif // Vc_IMPL_AVX2
/////////////////////////////////////////////////////////////////////////
// implementation of intrinsics missing in AVX and AVX2
/////////////////////////////////////////////////////////////////////////
static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) {
return cmpgt_epi64(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) {
return cmpgt_epi32(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) {
return cmpgt_epi16(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) {
return cmpgt_epi8(b, a);
}
static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) {
return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8()));
}
#if defined(Vc_IMPL_XOP)
Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
Vc_AVX_TO_SSE_2_NEW(comlt_epu16)
Vc_AVX_TO_SSE_2_NEW(comgt_epu16)
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); }
#else
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
return cmplt_epi32(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
return cmpgt_epi32(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
return cmplt_epi16(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
return cmpgt_epi16(a, b);
}
#endif
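// Illustration (not part of this header): the unsigned comparisons above use
// the standard bias trick. XOR-ing both operands with the most negative value
// flips the sign bit, which maps unsigned order onto signed order, so the
// signed compare instruction can be reused: a <u b  <=>  (a ^ 0x8000) <s (b ^ 0x8000).
// Minimal 128-bit sketch of the same idea (name is illustrative):
static Vc_INTRINSIC m128i cmplt_epu16_sketch(__m128i a, __m128i b)
{
    const __m128i bias = _mm_set1_epi16(-0x8000); // 0x8000 in every 16-bit lane
    return _mm_cmplt_epi16(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
}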
static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) {
_mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) {
_mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) {
#ifdef Vc_IMPL_AVX2
_mm256_maskstore_epi32(mem, mask, v);
#else
_mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
#endif
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) {
_mm256_maskstore(reinterpret_cast<int *>(mem), mask, v);
}
static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) {
using namespace AVX;
_mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast<char *>(&mem[0]));
_mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast<char *>(&mem[8]));
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) {
_mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
}
#undef Vc_AVX_TO_SSE_1
#undef Vc_AVX_TO_SSE_1_128
#undef Vc_AVX_TO_SSE_2_NEW
#undef Vc_AVX_TO_SSE_256_128
#undef Vc_AVX_TO_SSE_1i
template<typename R> Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128 stream_load<m128>(const float *mem)
{
return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
}
template<> Vc_INTRINSIC m256 stream_load<m256>(const float *mem)
{
return insert128<1>(_mm256_castps128_ps256(stream_load<m128>(mem)),
stream_load<m128>(mem + 4));
}
template<typename R> Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128d stream_load<m128d>(const double *mem)
{
return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
}
template<> Vc_INTRINSIC m256d stream_load<m256d>(const double *mem)
{
return insert128<1>(_mm256_castpd128_pd256(stream_load<m128d>(mem)),
stream_load<m128d>(mem + 2));
}
template<typename R> Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128i stream_load<m128i>(const void *mem)
{
return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<void *>(mem)));
}
template<> Vc_INTRINSIC m256i stream_load<m256i>(const void *mem)
{
return insert128<1>(_mm256_castsi128_si256(stream_load<m128i>(mem)),
stream_load<m128i>(static_cast<const __m128i *>(mem) + 1));
}
Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
{
_mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
{
stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask));
stream_store(mem + 4, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
{
_mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
{
stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask));
stream_store(mem + 2, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
{
_mm_maskmoveu_si128(value, mask, reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
{
stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask));
stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask));
}
#ifndef __x86_64__
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
}
#endif
#ifdef Vc_IMPL_AVX2
template <int Scale> __m256 gather(const float *addr, __m256i idx)
{
return _mm256_i32gather_ps(addr, idx, Scale);
}
template <int Scale> __m256d gather(const double *addr, __m128i idx)
{
return _mm256_i32gather_pd(addr, idx, Scale);
}
template <int Scale> __m256i gather(const int *addr, __m256i idx)
{
return _mm256_i32gather_epi32(addr, idx, Scale);
}
template <int Scale> __m256i gather(const unsigned *addr, __m256i idx)
{
return _mm256_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
}
template <int Scale> __m256 gather(__m256 src, __m256 k, const float *addr, __m256i idx)
{
return _mm256_mask_i32gather_ps(src, addr, idx, k, Scale);
}
template <int Scale>
__m256d gather(__m256d src, __m256d k, const double *addr, __m128i idx)
{
return _mm256_mask_i32gather_pd(src, addr, idx, k, Scale);
}
template <int Scale> __m256i gather(__m256i src, __m256i k, const int *addr, __m256i idx)
{
return _mm256_mask_i32gather_epi32(src, addr, idx, k, Scale);
}
template <int Scale>
__m256i gather(__m256i src, __m256i k, const unsigned *addr, __m256i idx)
{
return _mm256_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
}
#endif
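#ifdef Vc_IMPL_AVX2
// Illustration (not part of this header): with AVX2 the gather wrappers above
// map directly onto the hardware gather instruction. The Scale template
// parameter is the byte stride applied to each 32-bit index (4 for float).
// The function name is assumed for this sketch.
static Vc_INTRINSIC __m256 gather8_floats(const float *base, __m256i idx)
{
    return gather<4>(base, idx); // equivalent to _mm256_i32gather_ps(base, idx, 4)
}
#endif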
} // namespace AvxIntrinsics
} // namespace Vc
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
using namespace AvxIntrinsics;
} // namespace AVX
namespace AVX2
{
using namespace AvxIntrinsics;
} // namespace AVX2
namespace AVX
{
template<typename T> struct VectorTypeHelper;
template<> struct VectorTypeHelper< char > { typedef __m256i Type; };
template<> struct VectorTypeHelper< signed char > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned char > { typedef __m256i Type; };
template<> struct VectorTypeHelper< short> { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned short> { typedef __m256i Type; };
template<> struct VectorTypeHelper< int > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned int > { typedef __m256i Type; };
template<> struct VectorTypeHelper< long > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned long > { typedef __m256i Type; };
template<> struct VectorTypeHelper< long long> { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned long long> { typedef __m256i Type; };
template<> struct VectorTypeHelper< float> { typedef __m256 Type; };
template<> struct VectorTypeHelper< double> { typedef __m256d Type; };
template <typename T>
using IntegerVectorType =
typename std::conditional<sizeof(T) == 16, __m128i, __m256i>::type;
template <typename T>
using DoubleVectorType =
typename std::conditional<sizeof(T) == 16, __m128d, __m256d>::type;
template <typename T>
using FloatVectorType =
typename std::conditional<sizeof(T) == 16, __m128, __m256>::type;
template<typename T> struct VectorHelper {};
template<typename T> struct VectorHelperSize;
} // namespace AVX
} // namespace Vc
#endif // VC_AVX_INTRINSICS_H_

87
Vc/avx/limits.h Normal file

@ -0,0 +1,87 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_LIMITS_H_
#define VC_AVX_LIMITS_H_
#include "intrinsics.h"
#include "types.h"
#include "macros.h"
namespace std
{
#define Vc_NUM_LIM(T, _max, _min) \
template <> struct numeric_limits<Vc::AVX2::Vector<T>> : public numeric_limits<T> { \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> max() Vc_NOEXCEPT \
{ \
return _max; \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> min() Vc_NOEXCEPT \
{ \
return _min; \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> lowest() Vc_NOEXCEPT \
{ \
return min(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> epsilon() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> round_error() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> infinity() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> quiet_NaN() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> signaling_NaN() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> denorm_min() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
}
#ifdef Vc_IMPL_AVX2
Vc_NUM_LIM(unsigned short, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM( short, _mm256_srli_epi16(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi16());
Vc_NUM_LIM( unsigned int, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM( int, _mm256_srli_epi32(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi32());
#endif
#undef Vc_NUM_LIM
} // namespace std
#endif // VC_AVX_LIMITS_H_
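A hypothetical usage sketch for the specializations above (assuming an AVX2 build in which Vc::int_v is the AVX2 int vector; the function name is made up for the example):

#include <limits>
#include <Vc/Vc>

// Each specialization broadcasts the scalar limit into every lane.
inline Vc::int_v int_limits_demo()
{
    const Vc::int_v hi = std::numeric_limits<Vc::int_v>::max(); // 0x7fffffff in every lane
    const Vc::int_v lo = std::numeric_limits<Vc::int_v>::min(); // 0x80000000 in every lane
    return hi + lo; // wraps to -1 in every lane
}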

33
Vc/avx/macros.h Normal file

@ -0,0 +1,33 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#include "../common/macros.h"
#ifndef VC_AVX_MACROS_H_
#define VC_AVX_MACROS_H_
#endif // VC_AVX_MACROS_H_

235
Vc/avx/mask.h Normal file

@ -0,0 +1,235 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_MASK_H_
#define VC_AVX_MASK_H_
#include <array>
#include "intrinsics.h"
#include "../common/storage.h"
#include "../common/bitscanintrinsics.h"
#include "../common/maskbool.h"
#include "detail.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
template <typename T> class Mask<T, VectorAbi::Avx>
{
public:
using abi = VectorAbi::Avx;
/**
* The \c EntryType of masks is always bool, independent of \c T.
*/
typedef bool EntryType;
using value_type = EntryType;
using MaskBool = Common::MaskBool<sizeof(T)>;
/**
* The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD
* implementation. This type is useful for the \c sizeof operator in generic functions.
*/
using VectorEntryType = MaskBool;
/**
* The associated Vector<T> type.
*/
using Vector = AVX2::Vector<T>;
///\internal
using VectorTypeF = AVX::FloatVectorType<typename AVX::VectorTypeHelper<T>::Type>;
///\internal
using VectorTypeD = AVX::DoubleVectorType<VectorTypeF>;
///\internal
using VectorTypeI = AVX::IntegerVectorType<VectorTypeF>;
private:
typedef const VectorTypeF VArg;
typedef const VectorTypeD VdArg;
typedef const VectorTypeI ViArg;
public:
static constexpr size_t Size = sizeof(VectorTypeF) / sizeof(T);
static constexpr size_t MemoryAlignment = Size;
static constexpr std::size_t size() { return Size; }
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
private:
typedef Common::Storage<T, Size> Storage;
public:
/**
* The \c VectorType reveals the implementation-specific internal type used for the
* SIMD type.
*/
using VectorType = typename Storage::VectorType;
using EntryReference = Vc::Detail::ElementReference<Mask>;
using reference = EntryReference;
// abstracts the way Masks are passed to functions; it can easily be changed to a const ref here
#if defined Vc_MSVC && defined _WIN32
typedef const Mask &AsArg;
#else
typedef const Mask AsArg;
#endif
Vc_INTRINSIC Mask() {}
Vc_INTRINSIC Mask(VArg x) : d(AVX::avx_cast<VectorType>(x)) {}
Vc_INTRINSIC Mask(VdArg x) : d(AVX::avx_cast<VectorType>(x)) {}
Vc_INTRINSIC Mask(ViArg x) : d(AVX::avx_cast<VectorType>(x)) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : d(Detail::zero<VectorType>()) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : d(Detail::allone<VectorType>()) {}
Vc_INTRINSIC explicit Mask(bool b)
: d(b ? Detail::allone<VectorType>() : Detail::zero<VectorType>())
{
}
Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }
// implicit cast
template <typename U>
Vc_INTRINSIC Mask(
U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
: d(AVX::avx_cast<VectorType>(
Detail::mask_cast<Traits::decay<U>::Size, Size, VectorTypeF>(
rhs.dataI())))
{
}
#if Vc_IS_VERSION_1
// explicit cast, implemented via simd_cast (in avx/simd_cast_caller.h)
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"mask types") Vc_INTRINSIC
explicit Mask(U &&rhs,
Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
#endif
template<typename Flags = DefaultLoadTag> Vc_INTRINSIC explicit Mask(const bool *mem, Flags f = Flags()) { load(mem, f); }
template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void load(const bool *mem, Flags = Flags());
template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void store(bool *mem, Flags = Flags()) const;
Vc_INTRINSIC Mask &operator=(const Mask &) = default;
Vc_INTRINSIC_L Mask &operator=(const std::array<bool, Size> &values) Vc_INTRINSIC_R;
Vc_INTRINSIC_L operator std::array<bool, Size>() const Vc_INTRINSIC_R;
// specializations in mask.tcc
Vc_INTRINSIC Vc_PURE bool operator==(const Mask &rhs) const
{ return Detail::movemask(d.v()) == Detail::movemask(rhs.d.v()); }
Vc_INTRINSIC Vc_PURE bool operator!=(const Mask &rhs) const
{ return !operator==(rhs); }
Vc_INTRINSIC Mask operator!() const
{
#ifdef Vc_GCC
return ~dataI();
#else
return Detail::andnot_(dataF(), Detail::allone<VectorTypeF>());
#endif
}
Vc_INTRINSIC Mask &operator&=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::and_(data(), rhs.data())); return *this; }
Vc_INTRINSIC Mask &operator|=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::or_ (data(), rhs.data())); return *this; }
Vc_INTRINSIC Mask &operator^=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::xor_(data(), rhs.data())); return *this; }
Vc_INTRINSIC Vc_PURE Mask operator&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator|(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator^(const Mask &rhs) const { return Detail::xor_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator&&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator||(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
// no need for expression template optimizations because cmp(n)eq for floats are not bitwise
// compares
Vc_INTRINSIC_L bool isNotEmpty() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L bool isEmpty() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L bool isFull() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L bool isMix() const Vc_INTRINSIC_R;
Vc_INTRINSIC Vc_PURE int shiftMask() const { return Detail::movemask(dataI()); }
Vc_INTRINSIC Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }
Vc_INTRINSIC VectorType data () const { return d.v(); }
Vc_INTRINSIC VectorTypeF dataF() const { return AVX::avx_cast<VectorTypeF>(d.v()); }
Vc_INTRINSIC VectorTypeI dataI() const { return AVX::avx_cast<VectorTypeI>(d.v()); }
Vc_INTRINSIC VectorTypeD dataD() const { return AVX::avx_cast<VectorTypeD>(d.v()); }
private:
friend reference;
static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
{
return m.toInt() & (1 << i);
}
template <typename U>
static Vc_INTRINSIC void set(Mask &m, int i,
U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
{
m.d.set(i, MaskBool(std::forward<U>(v)));
}
public:
/**
* \note the returned object models the concept of a reference and
* as such it can exist longer than the data it is referencing.
* \note to avoid lifetime issues, we strongly advise not to store
* any reference objects.
*/
Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
{
return {*this, int(index)};
}
Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
{
return get(*this, index);
}
Vc_INTRINSIC Vc_PURE int count() const { return Detail::popcnt16(toInt()); }
Vc_INTRINSIC Vc_PURE int firstOne() const { return _bit_scan_forward(toInt()); }
template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;
private:
#ifdef Vc_COMPILE_BENCHMARKS
public:
#endif
Storage d;
};
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::MemoryAlignment;
} // namespace Vc
#include "mask.tcc"
#endif // VC_AVX_MASK_H_
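A hypothetical usage sketch for the mask interface declared above (assuming an AVX build in which Vc::float_v and Vc::float_m map to these types; the function name is made up for the example):

#include <Vc/Vc>

// Count how many lanes of v are positive: the comparison yields a mask, and
// isEmpty()/count() from the class above query it.
inline int count_positive(const Vc::float_v &v)
{
    const Vc::float_m m = v > Vc::float_v::Zero();
    if (m.isEmpty()) {
        return 0;
    }
    return m.count();
}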

292
Vc/avx/mask.tcc Normal file

@ -0,0 +1,292 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
namespace Vc_VERSIONED_NAMESPACE
{
// store {{{1
template <typename T>
template <typename Flags>
Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::store(bool *mem, Flags f) const
{
Detail::mask_store<Size>(dataI(), mem, f);
}
// load {{{1
template <typename T>
template <typename Flags>
Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::load(const bool *mem, Flags f)
{
d.v() = AVX::avx_cast<VectorType>(Detail::mask_load<VectorTypeF, Size>(mem, f));
}
// operator[] {{{1
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE bool AVX2::Mask<int16_t>::get(const AVX2::Mask<int16_t> &m,
int index) noexcept
{
return m.shiftMask() & (1 << 2 * index);
}
template <>
Vc_INTRINSIC Vc_PURE bool AVX2::Mask<uint16_t>::get(const AVX2::Mask<uint16_t> &m,
int index) noexcept
{
return m.shiftMask() & (1 << 2 * index);
}
#endif
// operator== {{{1
template <> Vc_INTRINSIC Vc_PURE bool AVX2::double_m::operator==(const AVX2::double_m &rhs) const
{ return Detail::movemask(dataD()) == Detail::movemask(rhs.dataD()); }
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC Vc_PURE bool AVX2::short_m::operator==(const AVX2::short_m &rhs) const
{ return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
template <> Vc_INTRINSIC Vc_PURE bool AVX2::ushort_m::operator==(const AVX2::ushort_m &rhs) const
{ return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
#endif
// isFull, isNotEmpty, isEmpty, isMix specializations{{{1
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isFull() const {
if (sizeof(T) == 8) {
return 0 != Detail::testc(dataD(), Detail::allone<VectorTypeD>());
} else if (sizeof(T) == 4) {
return 0 != Detail::testc(dataF(), Detail::allone<VectorTypeF>());
} else {
return 0 != Detail::testc(dataI(), Detail::allone<VectorTypeI>());
}
}
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isNotEmpty() const {
if (sizeof(T) == 8) {
return 0 == Detail::testz(dataD(), dataD());
} else if (sizeof(T) == 4) {
return 0 == Detail::testz(dataF(), dataF());
} else {
return 0 == Detail::testz(dataI(), dataI());
}
}
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isEmpty() const {
if (sizeof(T) == 8) {
return 0 != Detail::testz(dataD(), dataD());
} else if (sizeof(T) == 4) {
return 0 != Detail::testz(dataF(), dataF());
} else {
return 0 != Detail::testz(dataI(), dataI());
}
}
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isMix() const {
if (sizeof(T) == 8) {
return 0 != Detail::testnzc(dataD(), Detail::allone<VectorTypeD>());
} else if (sizeof(T) == 4) {
return 0 != Detail::testnzc(dataF(), Detail::allone<VectorTypeF>());
} else {
return 0 != Detail::testnzc(dataI(), Detail::allone<VectorTypeI>());
}
}
// generate {{{1
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4 + 32>)
{
return _mm256_setr_epi64x(
gen(0) ? 0xffffffffffffffffull : 0, gen(1) ? 0xffffffffffffffffull : 0,
gen(2) ? 0xffffffffffffffffull : 0, gen(3) ? 0xffffffffffffffffull : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8 + 32>)
{
return _mm256_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0,
gen(4) ? 0xfffffffful : 0, gen(5) ? 0xfffffffful : 0,
gen(6) ? 0xfffffffful : 0, gen(7) ? 0xfffffffful : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 16 + 32>)
{
return _mm256_setr_epi16(gen(0) ? 0xfffful : 0, gen(1) ? 0xfffful : 0,
gen(2) ? 0xfffful : 0, gen(3) ? 0xfffful : 0,
gen(4) ? 0xfffful : 0, gen(5) ? 0xfffful : 0,
gen(6) ? 0xfffful : 0, gen(7) ? 0xfffful : 0,
gen(8) ? 0xfffful : 0, gen(9) ? 0xfffful : 0,
gen(10) ? 0xfffful : 0, gen(11) ? 0xfffful : 0,
gen(12) ? 0xfffful : 0, gen(13) ? 0xfffful : 0,
gen(14) ? 0xfffful : 0, gen(15) ? 0xfffful : 0);
}
template <typename T>
template <typename G>
Vc_INTRINSIC AVX2::Mask<T> Mask<T, VectorAbi::Avx>::generate(G &&gen)
{
return generate_impl<AVX2::Mask<T>>(std::forward<G>(gen),
std::integral_constant<int, Size + sizeof(Storage)>());
}
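// Illustration (not part of this file): generate() above evaluates the
// callable for each lane index and builds the corresponding all-ones/all-zero
// lane pattern. For example, a mask that is true for the even lanes only
// (the function name is assumed for the sketch):
inline AVX2::float_m make_even_lanes_mask()
{
    return AVX2::float_m::generate([](int i) { return (i & 1) == 0; });
}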
// shifted {{{1
template <typename T> Vc_INTRINSIC Vc_PURE AVX2::Mask<T> Mask<T, VectorAbi::Avx>::shifted(int amount) const
{
switch (amount * int(sizeof(VectorEntryType))) {
case 0: return *this;
case 1: return Detail::shifted< 1>(dataI());
case 2: return Detail::shifted< 2>(dataI());
case 3: return Detail::shifted< 3>(dataI());
case 4: return Detail::shifted< 4>(dataI());
case 5: return Detail::shifted< 5>(dataI());
case 6: return Detail::shifted< 6>(dataI());
case 7: return Detail::shifted< 7>(dataI());
case 8: return Detail::shifted< 8>(dataI());
case 9: return Detail::shifted< 9>(dataI());
case 10: return Detail::shifted< 10>(dataI());
case 11: return Detail::shifted< 11>(dataI());
case 12: return Detail::shifted< 12>(dataI());
case 13: return Detail::shifted< 13>(dataI());
case 14: return Detail::shifted< 14>(dataI());
case 15: return Detail::shifted< 15>(dataI());
case 16: return Detail::shifted< 16>(dataI());
case 17: return Detail::shifted< 17>(dataI());
case 18: return Detail::shifted< 18>(dataI());
case 19: return Detail::shifted< 19>(dataI());
case 20: return Detail::shifted< 20>(dataI());
case 21: return Detail::shifted< 21>(dataI());
case 22: return Detail::shifted< 22>(dataI());
case 23: return Detail::shifted< 23>(dataI());
case 24: return Detail::shifted< 24>(dataI());
case 25: return Detail::shifted< 25>(dataI());
case 26: return Detail::shifted< 26>(dataI());
case 27: return Detail::shifted< 27>(dataI());
case 28: return Detail::shifted< 28>(dataI());
case 29: return Detail::shifted< 29>(dataI());
case 30: return Detail::shifted< 30>(dataI());
case 31: return Detail::shifted< 31>(dataI());
case -1: return Detail::shifted< -1>(dataI());
case -2: return Detail::shifted< -2>(dataI());
case -3: return Detail::shifted< -3>(dataI());
case -4: return Detail::shifted< -4>(dataI());
case -5: return Detail::shifted< -5>(dataI());
case -6: return Detail::shifted< -6>(dataI());
case -7: return Detail::shifted< -7>(dataI());
case -8: return Detail::shifted< -8>(dataI());
case -9: return Detail::shifted< -9>(dataI());
case -10: return Detail::shifted<-10>(dataI());
case -11: return Detail::shifted<-11>(dataI());
case -12: return Detail::shifted<-12>(dataI());
case -13: return Detail::shifted<-13>(dataI());
case -14: return Detail::shifted<-14>(dataI());
case -15: return Detail::shifted<-15>(dataI());
case -16: return Detail::shifted<-16>(dataI());
case -17: return Detail::shifted<-17>(dataI());
case -18: return Detail::shifted<-18>(dataI());
case -19: return Detail::shifted<-19>(dataI());
case -20: return Detail::shifted<-20>(dataI());
case -21: return Detail::shifted<-21>(dataI());
case -22: return Detail::shifted<-22>(dataI());
case -23: return Detail::shifted<-23>(dataI());
case -24: return Detail::shifted<-24>(dataI());
case -25: return Detail::shifted<-25>(dataI());
case -26: return Detail::shifted<-26>(dataI());
case -27: return Detail::shifted<-27>(dataI());
case -28: return Detail::shifted<-28>(dataI());
case -29: return Detail::shifted<-29>(dataI());
case -30: return Detail::shifted<-30>(dataI());
case -31: return Detail::shifted<-31>(dataI());
}
return Zero();
}
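// Note (added for exposition): shifted(amount) moves the mask entries by amount
// positions, internally by amount * sizeof(VectorEntryType) bytes, shifting in
// false entries; any byte offset outside [-31, 31] falls through the switch and
// yields Zero().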
// }}}1
/*
template<> Vc_INTRINSIC AVX2::Mask< 4, 32> &AVX2::Mask< 4, 32>::operator=(const std::array<bool, 4> &values) {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
unsigned int x = *reinterpret_cast<const unsigned int *>(values.data());
x *= 0xffu;
__m128i y = _mm_cvtsi32_si128(x); // 4 Bytes
y = _mm_unpacklo_epi8(y, y); // 8 Bytes
y = _mm_unpacklo_epi16(y, y); // 16 Bytes
d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(y, y), _mm_unpackhi_epi32(y, y)));
return *this;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 32> &AVX2::Mask< 8, 32>::operator=(const std::array<bool, 8> &values) {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
unsigned long long x = *reinterpret_cast<const unsigned long long *>(values.data());
x *= 0xffull;
__m128i y = _mm_cvtsi64_si128(x); // 8 Bytes
y = _mm_unpacklo_epi8(y, y); // 16 Bytes
d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(y, y), _mm_unpackhi_epi16(y, y)));
return *this;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 16> &AVX2::Mask< 8, 16>::operator=(const std::array<bool, 8> &values) {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
unsigned long long x = *reinterpret_cast<const unsigned long long *>(values.data());
x *= 0xffull;
__m128i y = _mm_cvtsi64_si128(x); // 8 Bytes
d.v() = AVX::avx_cast<__m128>(_mm_unpacklo_epi8(y, y));
return *this;
}
template<> Vc_INTRINSIC AVX2::Mask<16, 16> &AVX2::Mask<16, 16>::operator=(const std::array<bool, 16> &values) {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128i x = _mm_loadu_si128(reinterpret_cast<const __m128i *>(values.data()));
d.v() = _mm_andnot_ps(AVX::_mm_setallone_ps(), AVX::avx_cast<__m128>(_mm_sub_epi8(x, _mm_set1_epi8(1))));
return *this;
}
template<> Vc_INTRINSIC AVX2::Mask< 4, 32>::operator std::array<bool, 4>() const {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 64bit -> 32bit
x = _mm_packs_epi32(x, x); // 32bit -> 16bit
x = _mm_srli_epi16(x, 15);
x = _mm_packs_epi16(x, x); // 16bit -> 8bit
std::array<bool, 4> r;
asm volatile("vmovd %1,%0" : "=m"(*r.data()) : "x"(x));
return r;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 32>::operator std::array<bool, 8>() const {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 32bit -> 16bit
x = _mm_srli_epi16(x, 15);
x = _mm_packs_epi16(x, x); // 16bit -> 8bit
std::array<bool, 8> r;
asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x));
return r;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 16>::operator std::array<bool, 8>() const {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128i x = _mm_srli_epi16(dataI(), 15);
x = _mm_packs_epi16(x, x); // 16bit -> 8bit
std::array<bool, 8> r;
asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x));
return r;
}
template<> Vc_INTRINSIC AVX2::Mask<16, 16>::operator std::array<bool, 16>() const {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128 x = _mm_and_ps(d.v(), AVX::avx_cast<__m128>(_mm_set1_epi32(0x01010101)));
std::array<bool, 16> r;
asm volatile("vmovups %1,%0" : "=m"(*r.data()) : "x"(x));
return r;
}
*/
}
// vim: foldmethod=marker

321
Vc/avx/math.h Normal file
View File

@ -0,0 +1,321 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_MATH_H_
#define VC_AVX_MATH_H_
#include "const.h"
#include "limits.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
// min & max {{{1
#ifdef Vc_IMPL_AVX2
Vc_ALWAYS_INLINE AVX2::int_v min(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_min_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v min(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v min(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_min_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::int_v max(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_max_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v max(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v max(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_max_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); }
#endif
Vc_ALWAYS_INLINE AVX2::float_v min(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_min_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::float_v max(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_max_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v max(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_max_pd(x.data(), y.data()); }
// sqrt {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> sqrt(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::sqrt(x.data());
}
// rsqrt {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> rsqrt(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::rsqrt(x.data());
}
// reciprocal {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> reciprocal(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::reciprocal(x.data());
}
// round {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> round(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::round(x.data());
}
// abs {{{1
Vc_INTRINSIC Vc_CONST AVX2::double_v abs(AVX2::double_v x)
{
return Detail::and_(x.data(), AVX::setabsmask_pd());
}
Vc_INTRINSIC Vc_CONST AVX2::float_v abs(AVX2::float_v x)
{
return Detail::and_(x.data(), AVX::setabsmask_ps());
}
#ifdef Vc_IMPL_AVX2
Vc_INTRINSIC Vc_CONST AVX2::int_v abs(AVX2::int_v x)
{
return _mm256_abs_epi32(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::short_v abs(AVX2::short_v x)
{
return _mm256_abs_epi16(x.data());
}
#endif
// isfinite {{{1
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isfinite(const AVX2::double_v &x)
{
return AVX::cmpord_pd(x.data(), _mm256_mul_pd(Detail::zero<__m256d>(), x.data()));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isfinite(const AVX2::float_v &x)
{
return AVX::cmpord_ps(x.data(), _mm256_mul_ps(Detail::zero<__m256>(), x.data()));
}
// isinf {{{1
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isinf(const AVX2::double_v &x)
{
return _mm256_castsi256_pd(AVX::cmpeq_epi64(
_mm256_castpd_si256(abs(x).data()),
_mm256_castpd_si256(Detail::avx_broadcast(AVX::c_log<double>::d(1)))));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isinf(const AVX2::float_v &x)
{
return _mm256_castsi256_ps(
AVX::cmpeq_epi32(_mm256_castps_si256(abs(x).data()),
_mm256_castps_si256(Detail::avx_broadcast(AVX::c_log<float>::d(1)))));
}
// isnan {{{1
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isnan(const AVX2::double_v &x)
{
return AVX::cmpunord_pd(x.data(), x.data());
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isnan(const AVX2::float_v &x)
{
return AVX::cmpunord_ps(x.data(), x.data());
}
// copysign {{{1
Vc_INTRINSIC Vc_CONST AVX2::float_v copysign(AVX2::float_v mag, AVX2::float_v sign)
{
return _mm256_or_ps(_mm256_and_ps(sign.data(), AVX::setsignmask_ps()),
_mm256_and_ps(mag.data(), AVX::setabsmask_ps()));
}
Vc_INTRINSIC Vc_CONST AVX2::double_v copysign(AVX2::double_v::AsArg mag,
AVX2::double_v::AsArg sign)
{
return _mm256_or_pd(_mm256_and_pd(sign.data(), AVX::setsignmask_pd()),
_mm256_and_pd(mag.data(), AVX::setabsmask_pd()));
}
//}}}1
// frexp {{{1
/**
* Splits \p v into exponent and mantissa; the sign is kept with the mantissa.
*
* The return value will be in the range [0.5, 1.0[.
* The \p e value will be an integer defining the power-of-two exponent.
*/
inline AVX2::double_v frexp(AVX2::double_v::AsArg v, SimdArray<int, 4> *e)
{
const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
const __m256d exponentPart = _mm256_and_pd(v.data(), exponentBits);
auto lo = AVX::avx_cast<__m128i>(AVX::lo128(exponentPart));
auto hi = AVX::avx_cast<__m128i>(AVX::hi128(exponentPart));
lo = _mm_sub_epi32(_mm_srli_epi64(lo, 52), _mm_set1_epi64x(0x3fe));
hi = _mm_sub_epi32(_mm_srli_epi64(hi, 52), _mm_set1_epi64x(0x3fe));
SSE::int_v exponent = Mem::shuffle<X0, X2, Y0, Y2>(lo, hi);
const __m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits);
AVX2::double_v ret =
_mm256_and_pd(exponentMaximized,
_mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask)));
const double_m zeroMask = v == AVX2::double_v::Zero();
ret(isnan(v) || !isfinite(v) || zeroMask) = v;
exponent.setZero(simd_cast<SSE::int_m>(zeroMask));
internal_data(*e) = exponent;
return ret;
}
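// Illustrative example (a sketch, not taken from the Vc sources) of the contract
// documented above:
//   Vc::SimdArray<int, 4> e;
//   AVX2::double_v m = frexp(AVX2::double_v(8.), &e);
//   // every entry of m is 0.5 and every entry of e is 4, since 0.5 * 2^4 == 8.0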
#ifdef Vc_IMPL_AVX2
inline SimdArray<double, 8> frexp(const SimdArray<double, 8> &v, SimdArray<int, 8> *e)
{
const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
const __m256d w[2] = {internal_data(internal_data0(v)).data(),
internal_data(internal_data1(v)).data()};
const __m256i exponentPart[2] = {
_mm256_castpd_si256(_mm256_and_pd(w[0], exponentBits)),
_mm256_castpd_si256(_mm256_and_pd(w[1], exponentBits))};
const __m256i lo = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[0], 52),
_mm256_set1_epi32(0x3fe)); // 0.1. 2.3.
const __m256i hi = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[1], 52),
_mm256_set1_epi32(0x3fe)); // 4.5. 6.7.
const __m256i a = _mm256_unpacklo_epi32(lo, hi); // 04.. 26..
const __m256i b = _mm256_unpackhi_epi32(lo, hi); // 15.. 37..
const __m256i tmp = _mm256_unpacklo_epi32(a, b); // 0145 2367
const __m256i exponent =
AVX::concat(_mm_unpacklo_epi64(AVX::lo128(tmp), AVX::hi128(tmp)),
_mm_unpackhi_epi64(AVX::lo128(tmp), AVX::hi128(tmp))); // 0123 4567
const __m256d exponentMaximized[2] = {_mm256_or_pd(w[0], exponentBits),
_mm256_or_pd(w[1], exponentBits)};
const auto frexpMask =
_mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask));
fixed_size_simd<double, 8> ret = {
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[0], frexpMask))),
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[1], frexpMask)))};
const auto zeroMask = v == v.Zero();
ret(isnan(v) || !isfinite(v) || zeroMask) = v;
internal_data(*e) =
Detail::andnot_(simd_cast<AVX2::int_m>(zeroMask).dataI(), exponent);
return ret;
}
#endif // Vc_IMPL_AVX2
namespace Detail
{
Vc_INTRINSIC AVX2::float_v::IndexType extractExponent(__m256 e)
{
SimdArray<uint, float_v::Size> exponentPart;
const auto ee = AVX::avx_cast<__m256i>(e);
#ifdef Vc_IMPL_AVX2
exponentPart = AVX2::uint_v(ee);
#else
internal_data(internal_data0(exponentPart)) = AVX::lo128(ee);
internal_data(internal_data1(exponentPart)) = AVX::hi128(ee);
#endif
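// The biased 8-bit exponent field occupies bits 23..30 of an IEEE 754 binary32
// value; subtracting 0x7e (126) rather than the bias 127 yields E + 1, matching
// frexp's convention of a mantissa in [0.5, 1.0[.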
return (exponentPart >> 23) - 0x7e;
}
} // namespace Detail
inline AVX2::float_v frexp(AVX2::float_v::AsArg v, SimdArray<int, 8> *e)
{
using namespace Detail;
using namespace AVX2;
const __m256 exponentBits = Const<float>::exponentMask().data();
*e = extractExponent(and_(v.data(), exponentBits));
const __m256 exponentMaximized = or_(v.data(), exponentBits);
AVX2::float_v ret = _mm256_and_ps(exponentMaximized, avx_cast<__m256>(set1_epi32(0xbf7fffffu)));
ret(isnan(v) || !isfinite(v) || v == AVX2::float_v::Zero()) = v;
e->setZero(simd_cast<decltype(*e == *e)>(v == AVX2::float_v::Zero()));
return ret;
}
// ldexp {{{1
/* -> x * 2^e
* x == NaN -> NaN
* x == (-)inf -> (-)inf
*/
inline AVX2::double_v ldexp(AVX2::double_v::AsArg v, const SimdArray<int, 4> &_e)
{
SSE::int_v e = internal_data(_e);
e.setZero(simd_cast<SSE::int_m>(v == AVX2::double_v::Zero()));
const __m256i exponentBits =
AVX::concat(_mm_slli_epi64(_mm_unpacklo_epi32(e.data(), e.data()), 52),
_mm_slli_epi64(_mm_unpackhi_epi32(e.data(), e.data()), 52));
return AVX::avx_cast<__m256d>(
AVX::add_epi64(AVX::avx_cast<__m256i>(v.data()), exponentBits));
}
inline AVX2::float_v ldexp(AVX2::float_v::AsArg v, SimdArray<int, 8> e)
{
e.setZero(simd_cast<decltype(e == e)>(v == AVX2::float_v::Zero()));
e <<= 23;
#ifdef Vc_IMPL_AVX2
return {AVX::avx_cast<__m256>(
AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
AVX::lo128(internal_data(e).data())),
_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
AVX::hi128(internal_data(e).data()))))};
#else
return {AVX::avx_cast<__m256>(
AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
internal_data(internal_data0(e)).data()),
_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
internal_data(internal_data1(e)).data())))};
#endif
}
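// Illustrative example (a sketch, not taken from the Vc sources) of ldexp's
// contract x * 2^e:
//   const Vc::SimdArray<int, 8> e(3);
//   AVX2::float_v r = ldexp(AVX2::float_v(1.5f), e);
//   // every entry of r is 12.f, i.e. 1.5 * 2^3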
// trunc {{{1
Vc_ALWAYS_INLINE AVX2::float_v trunc(AVX2::float_v::AsArg v)
{
return _mm256_round_ps(v.data(), 0x3);
}
Vc_ALWAYS_INLINE AVX2::double_v trunc(AVX2::double_v::AsArg v)
{
return _mm256_round_pd(v.data(), 0x3);
}
// floor {{{1
Vc_ALWAYS_INLINE AVX2::float_v floor(AVX2::float_v::AsArg v)
{
return _mm256_floor_ps(v.data());
}
Vc_ALWAYS_INLINE AVX2::double_v floor(AVX2::double_v::AsArg v)
{
return _mm256_floor_pd(v.data());
}
// ceil {{{1
Vc_ALWAYS_INLINE AVX2::float_v ceil(AVX2::float_v::AsArg v)
{
return _mm256_ceil_ps(v.data());
}
Vc_ALWAYS_INLINE AVX2::double_v ceil(AVX2::double_v::AsArg v)
{
return _mm256_ceil_pd(v.data());
}
// fma {{{1
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx> fma(Vector<T, VectorAbi::Avx> a,
Vector<T, VectorAbi::Avx> b,
Vector<T, VectorAbi::Avx> c)
{
return Detail::fma(a.data(), b.data(), c.data(), T());
}
// }}}1
} // namespace Vc
#endif // VC_AVX_MATH_H_
// vim: foldmethod=marker

308
Vc/avx/shuffle.h Normal file
View File

@ -0,0 +1,308 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_SHUFFLE_H_
#define VC_AVX_SHUFFLE_H_
#include "../sse/shuffle.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <int... Dst> struct Permutation {};
template <uint8_t... Sel> struct Mask {};
#ifdef Vc_IMPL_AVX2
template <uint8_t Sel0, uint8_t Sel1, uint8_t Sel2, uint8_t Sel3, uint8_t Sel4,
uint8_t Sel5, uint8_t Sel6, uint8_t Sel7, uint8_t Sel8, uint8_t Sel9,
uint8_t Sel10, uint8_t Sel11, uint8_t Sel12, uint8_t Sel13, uint8_t Sel14,
uint8_t Sel15>
Vc_INTRINSIC Vc_CONST __m256i
blend(__m256i a, __m256i b, Mask<Sel0, Sel1, Sel2, Sel3, Sel4, Sel5, Sel6, Sel7, Sel8,
Sel9, Sel10, Sel11, Sel12, Sel13, Sel14, Sel15>)
{
static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) &&
(Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) &&
(Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) &&
(Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) &&
(Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) &&
(Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) &&
(Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) &&
(Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1),
"Selectors must be 0 or 1 to select the value from a or b");
constexpr uint8_t mask = static_cast<uint8_t>(
(Sel0 << 0 ) | (Sel1 << 1 ) | (Sel2 << 2 ) | (Sel3 << 3 ) |
(Sel4 << 4 ) | (Sel5 << 5 ) | (Sel6 << 6 ) | (Sel7 << 7 ) |
(Sel8 << 8 ) | (Sel9 << 9 ) | (Sel10 << 10) | (Sel11 << 11) |
(Sel12 << 12) | (Sel13 << 13) | (Sel14 << 14) | (Sel15 << 15));
return _mm256_blend_epi16(a, b, mask);
}
#endif // Vc_IMPL_AVX2
} // namespace Detail
namespace Mem
{
#ifdef Vc_IMPL_AVX2
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) {
static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
}
#endif // Vc_IMPL_AVX2
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) {
static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
return _mm256_permute2f128_ps(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) {
static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
return _mm256_permute2f128_pd(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) {
static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
return _mm256_permute2x128_si256(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#else
return _mm256_permute2f128_si256(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#endif
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
return _mm256_permute2x128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
return _mm256_permute2f128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) {
return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
}
#ifdef Vc_IMPL_AVX2
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
#endif // Vc_IMPL_AVX2
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) {
static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
return _mm256_blend_ps(x, y,
(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
(Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
(Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
(Dst6 / Y6) * 64 + (Dst7 / Y7) *128
);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) {
return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range");
static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range");
static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range");
static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range");
static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range");
static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range");
static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range");
static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range");
if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
return permute<Dst0, Dst1, Dst2, Dst3>(x);
}
const __m128 loIn = _mm256_castps256_ps128(x);
const __m128 hiIn = _mm256_extractf128_ps(x, 1);
__m128 lo, hi;
if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
} else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
} else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
} else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
} else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
lo = _mm_unpacklo_ps(loIn, hiIn);
} else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
lo = _mm_unpacklo_ps(hiIn, loIn);
} else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
lo = _mm_unpackhi_ps(loIn, hiIn);
} else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
lo = _mm_unpackhi_ps(hiIn, loIn);
} else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
}
if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
} else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
} else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
} else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
} else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
hi = _mm_unpacklo_ps(loIn, hiIn);
} else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
hi = _mm_unpacklo_ps(hiIn, loIn);
} else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
hi = _mm_unpackhi_ps(loIn, hiIn);
} else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
hi = _mm_unpackhi_ps(hiIn, loIn);
} else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
}
return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
}
} // namespace Mem
} // namespace Vc
// Little endian has the low bits on the right and the high bits on the left.
// With vectors this becomes greatly confusing:
// Mem: abcd
// Reg: dcba
//
// The shuffles and permutes above use memory ordering. The ones below use register ordering:
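// Illustrative example (added for exposition): the identity permutation of a
// __m256 is spelled Mem::permute<X0, X1, X2, X3>(x) in memory ordering but
// Reg::permute<X3, X2, X1, X0>(x) in register ordering; both expand to the same
// immediate, the template arguments are merely listed from the highest lane down.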
namespace Vc_VERSIONED_NAMESPACE
{
namespace Reg
{
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
return _mm256_permute2x128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
return _mm256_permute2f128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
return _mm_permute_pd(x, Dst0 + Dst1 * 2);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
} // namespace Reg
} // namespace Vc
#endif // VC_AVX_SHUFFLE_H_

2724
Vc/avx/simd_cast.h Normal file

File diff suppressed because it is too large
Load Diff

55
Vc/avx/simd_cast_caller.tcc Normal file
View File

@ -0,0 +1,55 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef Vc_AVX_SIMD_CAST_CALLER_TCC_
#define Vc_AVX_SIMD_CAST_CALLER_TCC_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
#if Vc_IS_VERSION_1
template <typename T>
template <typename U, typename>
Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(U &&x)
: d(simd_cast<Vector>(std::forward<U>(x)).data())
{
}
template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Avx>::Mask(U &&rhs,
Common::enable_if_mask_converts_explicitly<T, U>)
: Mask(simd_cast<Mask>(std::forward<U>(rhs)))
{
}
#endif // Vc_IS_VERSION_1
}
#endif // Vc_AVX_SIMD_CAST_CALLER_TCC_
// vim: foldmethod=marker

120
Vc/avx/types.h Normal file
View File

@ -0,0 +1,120 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_TYPES_H_
#define VC_AVX_TYPES_H_
#include "../sse/types.h"
#include "../traits/type_traits.h"
#include "macros.h"
#ifdef Vc_DEFAULT_IMPL_AVX2
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 8
#define Vc_UINT_V_SIZE 8
#define Vc_SHORT_V_SIZE 16
#define Vc_USHORT_V_SIZE 16
#elif defined Vc_DEFAULT_IMPL_AVX
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 4
#define Vc_UINT_V_SIZE 4
#define Vc_SHORT_V_SIZE 8
#define Vc_USHORT_V_SIZE 8
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx1Abi<T>>;
typedef Vector<double> double_v;
typedef Vector<float> float_v;
typedef Vector<int> int_v;
typedef Vector<unsigned int> uint_v;
typedef Vector<short> short_v;
typedef Vector<unsigned short> ushort_v;
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx1Abi<T>>;
typedef Mask<double> double_m;
typedef Mask<float> float_m;
typedef Mask<int> int_m;
typedef Mask<unsigned int> uint_m;
typedef Mask<short> short_m;
typedef Mask<unsigned short> ushort_m;
template <typename T> struct Const;
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
} // namespace AVX
namespace AVX2
{
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx>;
using double_v = Vector<double>;
using float_v = Vector< float>;
using int_v = Vector< int>;
using uint_v = Vector< uint>;
using short_v = Vector< short>;
using ushort_v = Vector<ushort>;
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx>;
using double_m = Mask<double>;
using float_m = Mask< float>;
using llong_m = Mask< llong>;
using ullong_m = Mask<ullong>;
using long_m = Mask< long>;
using ulong_m = Mask< ulong>;
using int_m = Mask< int>;
using uint_m = Mask< uint>;
using short_m = Mask< short>;
using ushort_m = Mask<ushort>;
using schar_m = Mask< schar>;
using uchar_m = Mask< uchar>;
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
} // namespace AVX2
namespace Traits
{
template <class T>
struct is_simd_vector_internal<Vector<T, VectorAbi::Avx>>
: public is_valid_vector_argument<T> {};
template<typename T> struct is_simd_mask_internal<Mask<T, VectorAbi::Avx>>
: public std::true_type {};
} // namespace Traits
} // namespace Vc
#endif // VC_AVX_TYPES_H_

545
Vc/avx/vector.h Normal file
View File

@ -0,0 +1,545 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_VECTOR_H_
#define VC_AVX_VECTOR_H_
#include "intrinsics.h"
#include "casts.h"
#include "../sse/vector.h"
#include "shuffle.h"
#include "vectorhelper.h"
#include "mask.h"
#include <algorithm>
#include <cmath>
#include "../common/aliasingentryhelper.h"
#include "../common/memoryfwd.h"
#include "../common/where.h"
#include "macros.h"
#ifdef isfinite
#undef isfinite
#endif
#ifdef isnan
#undef isnan
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename T, typename Abi> struct VectorTraits
{
using mask_type = Vc::Mask<T, Abi>;
using vector_type = Vc::Vector<T, Abi>;
using writemasked_vector_type = Common::WriteMaskedVector<vector_type, mask_type>;
using intrinsic_type = typename AVX::VectorTypeHelper<T>::Type;
};
} // namespace Detail
#define Vc_CURRENT_CLASS_NAME Vector
template <typename T> class Vector<T, VectorAbi::Avx>
{
public:
using abi = VectorAbi::Avx;
private:
using traits_type = Detail::VectorTraits<T, abi>;
static_assert(
std::is_arithmetic<T>::value,
"Vector<T> only accepts arithmetic builtin types as template parameter T.");
using WriteMaskedVector = typename traits_type::writemasked_vector_type;
public:
using VectorType = typename traits_type::intrinsic_type;
using vector_type = VectorType;
using mask_type = typename traits_type::mask_type;
using Mask = mask_type;
using MaskType = mask_type;
using MaskArg Vc_DEPRECATED_ALIAS("Use MaskArgument instead.") = typename Mask::AsArg;
using MaskArgument = typename Mask::AsArg;
using reference = Detail::ElementReference<Vector>;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
using EntryType = T;
using value_type = EntryType;
typedef EntryType VectorEntryType;
static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
static constexpr size_t MemoryAlignment = alignof(VectorType);
using IndexType = fixed_size_simd<int, Size>;
using index_type = IndexType;
typedef Vector<T, abi> AsArg;
typedef VectorType VectorTypeArg;
protected:
template <typename U> using V = Vector<U, abi>;
// helper that specializes on VectorType
typedef AVX::VectorHelper<VectorType> HV;
// helper that specializes on T
typedef AVX::VectorHelper<T> HT;
// cast any m256/m128 to VectorType
template <typename V> static Vc_INTRINSIC VectorType _cast(V v)
{
return AVX::avx_cast<VectorType>(v);
}
typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
StorageType d;
using WidthT = Common::WidthT<VectorType>;
// ICC can't compile this:
// static constexpr WidthT Width = WidthT();
public:
#include "../common/generalinterface.h"
static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R;
///////////////////////////////////////////////////////////////////////////////////////////
// internal: required to enable returning objects of VectorType
Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {}
// implicit conversion from compatible Vector<U, abi>
template <typename U>
Vc_INTRINSIC Vector(
V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: d(AVX::convert<U, T>(x.data()))
{
}
#if Vc_IS_VERSION_1
// static_cast from the remaining Vector<U, abi>
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") Vc_INTRINSIC explicit Vector(
V<U> x,
typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: d(Detail::zeroExtendIfNeeded(AVX::convert<U, T>(x.data())))
{
}
// static_cast from other types, implemented via the non-member simd_cast function in
// simd_cast_caller.tcc
template <typename U,
typename = enable_if<Traits::is_simd_vector<U>::value &&
!std::is_same<Vector, Traits::decay<U>>::value>>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") Vc_INTRINSIC_L
explicit Vector(U &&x) Vc_INTRINSIC_R;
#endif
Vc_INTRINSIC explicit Vector(reference a) : Vector(static_cast<EntryType>(a)) {}
///////////////////////////////////////////////////////////////////////////////////////////
// broadcast
Vc_INTRINSIC Vector(EntryType a) : d(Detail::avx_broadcast(a)) {}
template <typename U>
Vc_INTRINSIC Vector(U a,
typename std::enable_if<std::is_same<U, int>::value &&
!std::is_same<U, EntryType>::value,
void *>::type = nullptr)
: Vector(static_cast<EntryType>(a))
{
}
//template<typename U>
explicit Vector(std::initializer_list<EntryType>)
{
static_assert(std::is_same<EntryType, void>::value,
"A SIMD vector object cannot be initialized from an initializer list "
"because the number of entries in the vector is target-dependent.");
}
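// Illustrative example (a sketch, not taken from the Vc sources): a
// target-independent per-entry initialization can use the static generate()
// factory declared below instead of an initializer list, e.g.
//   const Vector v = Vector::generate([](int i) { return i + 1; });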
#include "../common/loadinterface.h"
#include "../common/storeinterface.h"
///////////////////////////////////////////////////////////////////////////////////////////
// zeroing
Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setQnan(MaskArgument k) Vc_INTRINSIC_R;
#include "../common/gatherinterface.h"
#include "../common/scatterinterface.h"
#if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
// skip this code for MSVC because it fails to do overload resolution correctly
////////////////////////////////////////////////////////////////////////////////
// non-converting pd, ps, and epi32 gathers
template <class U, class A, int Scale, int N = Vector<U, A>::size(),
class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<T, Vector<U, A>, Scale> &args)
{
d.v() = AVX::gather<sizeof(T) * Scale>(
args.address,
simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
.data());
}
// masked overload
template <class U, class A, int Scale, int N = Vector<U, A>::size(),
class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<T, Vector<U, A>, Scale> &args, MaskArgument k)
{
d.v() = AVX::gather<sizeof(T) * Scale>(
d.v(), k.data(), args.address,
simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
.data());
}
////////////////////////////////////////////////////////////////////////////////
// converting (from 8-bit and 16-bit integers only) epi16 gather emulation via
// epi32 gathers
template <
class MT, class U, class A, int Scale,
class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
{
using AVX2::int_v;
const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
*this = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx0)),
int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx1)));
if (sizeof(MT) == 1) {
if (std::is_signed<MT>::value) {
using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
*this = (simd_cast<Signed>(*this) << 8) >> 8; // sign extend
} else {
*this &= 0xff;
}
}
}
// masked overload
template <
class MT, class U, class A, int Scale,
class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A>, Scale> &args, MaskArgument k)
{
using AVX2::int_v;
const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
const auto k0 = simd_cast<AVX2::int_m, 0>(k).data();
const auto k1 = simd_cast<AVX2::int_m, 1>(k).data();
auto v = simd_cast<Vector>(
int_v(AVX::gather<sizeof(MT) * Scale>(
_mm256_setzero_si256(), k0, aliasing_cast<int>(args.address), idx0)),
int_v(AVX::gather<sizeof(MT) * Scale>(
_mm256_setzero_si256(), k1, aliasing_cast<int>(args.address), idx1)));
if (sizeof(MT) == 1) {
if (std::is_signed<MT>::value) {
using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
v = (simd_cast<Signed>(v) << 8) >> 8; // sign extend
} else {
v &= 0xff;
}
}
assign(v, k);
}
////////////////////////////////////////////////////////////////////////////////
// all remaining converting gathers
template <class MT, class U, class A, int Scale>
Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
Traits::is_valid_vector_argument<MT>::value &&
!std::is_same<MT, T>::value &&
Vector<U, A>::size() >= size()),
void>
gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
{
*this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
}
// masked overload
template <class MT, class U, class A, int Scale>
Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
Traits::is_valid_vector_argument<MT>::value &&
!std::is_same<MT, T>::value &&
Vector<U, A>::size() >= size()),
void>
gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args,
MaskArgument k)
{
assign(simd_cast<Vector>(fixed_size_simd<MT, Size>(args, k)), k);
}
#endif // Vc_IMPL_AVX2 && !MSVC
///////////////////////////////////////////////////////////////////////////////////////////
//prefix
Vc_ALWAYS_INLINE Vector &operator++() { data() = Detail::add(data(), Detail::one(T()), T()); return *this; }
Vc_ALWAYS_INLINE Vector &operator--() { data() = Detail::sub(data(), Detail::one(T()), T()); return *this; }
//postfix
Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = Detail::add(data(), Detail::one(T()), T()); return r; }
Vc_ALWAYS_INLINE Vector operator--(int) { const Vector r = *this; data() = Detail::sub(data(), Detail::one(T()), T()); return r; }
private:
friend reference;
Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
{
return o.d.m(i);
}
template <typename U>
Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
noexcept(std::declval<value_type &>() = v))
{
return o.d.set(i, v);
}
public:
/**
* \note the returned object models the concept of a reference and
* as such it can exist longer than the data it is referencing.
* \note to avoid lifetime issues, we strongly advise not to store
* any reference objects.
*/
Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
{
static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
return {*this, int(index)};
}
Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
{
return d.m(index);
}
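// Illustrative example (a sketch, not taken from the Vc sources) of the advice
// given above:
//   AVX2::float_v v;
//   v[0] = 1.f;    // fine, the temporary reference object is used immediately
//   auto r = v[0]; // discouraged, r is a reference object that can outlive the data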
Vc_INTRINSIC_L Vc_PURE_L Vector operator[](Permutation::ReversedTag) const Vc_INTRINSIC_R Vc_PURE_R;
Vc_INTRINSIC_L Vc_PURE_L Vector operator[](const IndexType &perm) const Vc_INTRINSIC_R Vc_PURE_R;
Vc_INTRINSIC Vc_PURE Mask operator!() const
{
return *this == Zero();
}
Vc_ALWAYS_INLINE Vector operator~() const
{
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
static_assert(std::is_integral<T>::value,
"bit-complement can only be used with Vectors of integral type");
#endif
return Detail::andnot_(data(), Detail::allone<VectorType>());
}
Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }
// shifts
#define Vc_OP_VEC(op) \
Vc_INTRINSIC Vector &operator op##=(AsArg x); \
Vc_INTRINSIC Vc_PURE Vector operator op(AsArg x) const \
{ \
static_assert( \
std::is_integral<T>::value, \
"bitwise-operators can only be used with Vectors of integral type"); \
}
Vc_ALL_SHIFTS(Vc_OP_VEC);
#undef Vc_OP_VEC
Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R;
Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
isNegative() const
{
return Vc::isnegative(*this);
}
Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) {
data() = Detail::blend(data(), v.data(), mask.data());
}
template <typename V2>
Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
staticCast() const
{
return V2(*this);
}
template <typename V2>
Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2
reinterpretCast() const
{
return AVX::avx_cast<typename V2::VectorType>(data());
}
Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k)
{
return {*this, k};
}
Vc_ALWAYS_INLINE VectorType &data() { return d.v(); }
Vc_ALWAYS_INLINE const VectorType &data() const { return d.v(); }
template<int Index>
Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L std::pair<Vector, int> minIndex() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L std::pair<Vector, int> maxIndex() const Vc_INTRINSIC_R;
Vc_ALWAYS_INLINE EntryType min() const { return Detail::min(data(), T()); }
Vc_ALWAYS_INLINE EntryType max() const { return Detail::max(data(), T()); }
Vc_ALWAYS_INLINE EntryType product() const { return Detail::mul(data(), T()); }
Vc_ALWAYS_INLINE EntryType sum() const { return Detail::add(data(), T()); }
Vc_ALWAYS_INLINE_L Vector partialSum() const Vc_ALWAYS_INLINE_R;
//template<typename BinaryOperation> Vc_ALWAYS_INLINE_L Vector partialSum(BinaryOperation op) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType min(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType max(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType product(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType sum(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
template <typename F> void callWithValuesSorted(F &&f)
{
EntryType value = d.m(0);
f(value);
for (size_t i = 1; i < Size; ++i) {
if (d.m(i) != value) {
value = d.m(i);
f(value);
}
}
}
template <typename F> Vc_INTRINSIC void call(F &&f) const
{
Common::for_all_vector_entries<Size>([&](size_t i) { f(EntryType(d.m(i))); });
}
template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
{
for (size_t i : where(mask)) {
f(EntryType(d.m(i)));
}
}
template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
{
Vector r;
Common::for_all_vector_entries<Size>(
[&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); });
return r;
}
template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
{
Vector r(*this);
for (size_t i : where(mask)) {
r.d.set(i, f(EntryType(r.d.m(i))));
}
return r;
}
template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f(i)); });
}
Vc_INTRINSIC void fill(EntryType (&f)()) {
Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f()); });
}
template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;
Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
copySign(AsArg x) const
{
return Vc::copysign(*this, x);
}
Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
{
return Vc::exponent(*this);
}
Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
};
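// Usage sketch (illustrative; assumes an 8-lane float vector): operator()(mask)
// returns a WriteMaskedVector so that assignments touch only the selected lanes,
// and apply(f, mask) recomputes only those lanes:
//
//   AVX2::float_v v = AVX2::float_v::IndexesFromZero(); // 0 1 2 3 4 5 6 7
//   const auto m = v > AVX2::float_v(3.f);              // selects lanes 4..7
//   v(m) = 0.f;                                         // 0 1 2 3 0 0 0 0
//   v = v.apply([](float x) { return x + 1.f; }, !m);   // 1 2 3 4 0 0 0 0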
#undef Vc_CURRENT_CLASS_NAME
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::MemoryAlignment;
#define Vc_CONDITIONAL_ASSIGN(name_, op_) \
template <Operator O, typename T, typename M, typename U> \
Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
AVX2::Vector<T> &lhs, M &&mask, U &&rhs) \
{ \
lhs(mask) op_ rhs; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN( Assign, =);
Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN
#define Vc_CONDITIONAL_ASSIGN(name_, expr_) \
template <Operator O, typename T, typename M> \
Vc_INTRINSIC enable_if<O == Operator::name_, AVX2::Vector<T>> conditional_assign( \
AVX2::Vector<T> &lhs, M &&mask) \
{ \
return expr_; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN
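// Illustrative note: the conditional_assign overloads above are the hooks used
// by Vc's where() expressions, so that, roughly,
//
//   Vc::where(m) | x += 1;
//
// dispatches to conditional_assign<Operator::PlusAssign>(x, m, 1), which in
// turn is just x(m) += 1;.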
} // namespace Vc
#include "vector.tcc"
#include "simd_cast.h"
#endif // VC_AVX_VECTOR_H_

939
Vc/avx/vector.tcc Normal file
View File

@ -0,0 +1,939 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#include "../common/x86_prefetches.h"
#include "../common/gatherimplementation.h"
#include "../common/scatterimplementation.h"
#include "limits.h"
#include "const.h"
#include "../common/set.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// compare operators {{{1
Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); }
#ifdef Vc_IMPL_AVX2
Vc_INTRINSIC AVX2:: int_m operator==(AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: uint_m operator==(AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: int_m operator!=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: uint_m operator!=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: int_m operator>=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: uint_m operator>=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: int_m operator<=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: uint_m operator<=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: int_m operator> (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: uint_m operator> (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: int_m operator< (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: uint_m operator< (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }
#endif // Vc_IMPL_AVX2
// bitwise operators {{{1
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator^(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator&(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator|(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return or_(a.data(), b.data());
}
// }}}1
// arithmetic operators {{{1
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator+(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator-(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator*(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return mul(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator/(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return div(a.data(), b.data(), T());
}
Vc_INTRINSIC AVX2::Vector<ushort> operator/(AVX2::Vector<ushort> a,
AVX2::Vector<ushort> b)
{
using namespace AVX;
const __m256 lo = _mm256_div_ps(convert<ushort, float>(lo128(a.data())),
convert<ushort, float>(lo128(b.data())));
const __m256 hi = _mm256_div_ps(convert<ushort, float>(hi128(a.data())),
convert<ushort, float>(hi128(b.data())));
const float_v threshold = 32767.f;
using Detail::operator>;
const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty()))
? convert<float, ushort>(lo)
: convert<float, short>(lo);
const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty()))
? convert<float, ushort>(hi)
: convert<float, short>(hi);
return concat(loShort, hiShort);
}
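// Note on the ushort division above: the quotients are computed in float, and
// the cheap float->short conversion saturates at 32767; whenever any quotient
// exceeds that threshold, the slower float->ushort conversion path is taken so
// that large results are not clipped.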
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, AVX2::Vector<T>> operator%(
AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return a - a / b * b;
}
// }}}1
} // namespace Detail
///////////////////////////////////////////////////////////////////////////////////////////
// generate {{{1
template <> template <typename G> Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
#ifdef Vc_IMPL_AVX2
template <> template <typename G> Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
const auto tmp8 = gen(8);
const auto tmp9 = gen(9);
const auto tmp10 = gen(10);
const auto tmp11 = gen(11);
const auto tmp12 = gen(12);
const auto tmp13 = gen(13);
const auto tmp14 = gen(14);
const auto tmp15 = gen(15);
return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
template <> template <typename G> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
const auto tmp8 = gen(8);
const auto tmp9 = gen(9);
const auto tmp10 = gen(10);
const auto tmp11 = gen(11);
const auto tmp12 = gen(12);
const auto tmp13 = gen(13);
const auto tmp14 = gen(14);
const auto tmp15 = gen(15);
return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
#endif
// constants {{{1
template <typename T> Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(VectorSpecialInitializerZero) : d{} {}
template <> Vc_INTRINSIC Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {}
template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC Vector< int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {}
template <> Vc_INTRINSIC Vector< uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {}
template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {}
template <> Vc_INTRINSIC Vector<ushort, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {}
template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {}
template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {}
#endif
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx>::Vector(
VectorSpecialInitializerIndexesFromZero)
: Vector(AVX::IndexesFromZeroData<T>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<float, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
: Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
: Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
// general load, implemented via LoadHelper {{{2
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Avx>::
#ifndef Vc_MSVC
template
#endif
load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Avx>::load(const SrcT *mem, Flags flags)
{
Common::handleLoadPrefetches(mem, flags);
d.v() = Detail::load<VectorType, DstT>(mem, flags);
}
///////////////////////////////////////////////////////////////////////////////////////////
// zeroing {{{1
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero()
{
data() = Detail::zero<VectorType>();
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero(const Mask &k)
{
data() = Detail::andnot_(k.data(), data());
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZeroInverted(const Mask &k)
{
data() = Detail::and_(k.data(), data());
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan()
{
data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan(MaskArgument k)
{
data() = _mm256_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan()
{
data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan(MaskArgument k)
{
data() = _mm256_or_ps(data(), k.dataF());
}
///////////////////////////////////////////////////////////////////////////////////////////
// stores {{{1
template <typename T>
template <typename U,
typename Flags,
typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Flags flags) const
{
Common::handleStorePrefetches(mem, flags);
HV::template store<Flags>(mem, data());
}
template <typename T>
template <typename U,
typename Flags,
typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Mask mask, Flags flags) const
{
Common::handleStorePrefetches(mem, flags);
HV::template store<Flags>(mem, data(), mask.data());
}
///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
#ifdef Vc_IMPL_AVX2
template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(AsArg x)
{
static_assert(std::is_integral<T>::value,
"bitwise-operators can only be used with Vectors of integral type");
return *this = *this << x;
}
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(AsArg x)
{
static_assert(std::is_integral<T>::value,
"bitwise-operators can only be used with Vectors of integral type");
return *this = *this >> x;
}
#endif
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(int shift) {
d.v() = Detail::shiftRight(d.v(), shift, T());
return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator>>(int shift) const {
return Detail::shiftRight(d.v(), shift, T());
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(int shift) {
d.v() = Detail::shiftLeft(d.v(), shift, T());
return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator<<(int shift) const {
return Detail::shiftLeft(d.v(), shift, T());
}
// isnegative {{{1
Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x)
{
return AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data()))));
}
Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
{
return Mem::permute<X1, X1, X3, X3>(AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
}
// gathers {{{1
#define Vc_GATHER_IMPL(V_) \
template <> \
template <class MT, class IT, int Scale> \
inline void AVX2::V_::gatherImplementation( \
const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(float_v)
{
d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
Vc_M(7));
}
#ifdef Vc_IMPL_AVX2
Vc_GATHER_IMPL(int_v)
{
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(uint_v)
{
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(short_v)
{
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
Vc_GATHER_IMPL(ushort_v)
{
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif
#undef Vc_M
#undef Vc_GATHER_IMPL
template <class T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
const auto *mem = args.address;
const auto indexes = Scale * args.indexes;
using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
Common::GatherScatterImplementation::PopcntSwitch
#else
Common::GatherScatterImplementation::SimpleLoop
#endif
> ;
Common::executeGather(Selector(), *this, mem, indexes, mask);
}
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes) const
{
Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
Common::GatherScatterImplementation::PopcntSwitch
#else
Common::GatherScatterImplementation::SimpleLoop
#endif
> ;
Common::executeScatter(Selector(), *this, mem, std::forward<IT>(indexes), mask);
}
///////////////////////////////////////////////////////////////////////////////////////////
// operator- {{{1
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
return VectorType(-d.builtin());
}
#else
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::minIndex() const
{
AVX2::Vector<T> x = min();
return std::make_pair(x, (*this == x).firstOne());
}
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::maxIndex() const
{
AVX2::Vector<T> x = max();
return std::make_pair(x, (*this == x).firstOne());
}
template <> Vc_INTRINSIC std::pair<AVX2::float_v, int> AVX2::float_v::minIndex() const
{
/*
// 28 cycles latency:
__m256 x = _mm256_min_ps(Mem::permute128<X1, X0>(d.v()), d.v());
x = _mm256_min_ps(x, Reg::permute<X2, X3, X0, X1>(x));
AVX2::float_v xx = _mm256_min_ps(x, Reg::permute<X1, X0, X3, X2>(x));
AVX2::uint_v idx = AVX2::uint_v::IndexesFromZero();
idx = _mm256_castps_si256(
_mm256_or_ps((*this != xx).data(), _mm256_castsi256_ps(idx.data())));
return std::make_pair(xx, (*this == xx).firstOne());
__m128 loData = AVX::lo128(d.v());
__m128 hiData = AVX::hi128(d.v());
const __m128 less2 = _mm_cmplt_ps(hiData, loData);
loData = _mm_min_ps(loData, hiData);
hiData = Mem::permute<X2, X3, X0, X1>(loData);
const __m128 less1 = _mm_cmplt_ps(hiData, loData);
loData = _mm_min_ps(loData, hiData);
hiData = Mem::permute<X1, X0, X3, X2>(loData);
const __m128 less0 = _mm_cmplt_ps(hiData, loData);
unsigned bits = _mm_movemask_ps(less0) & 0x1;
bits |= ((_mm_movemask_ps(less1) << 1) - bits) & 0x2;
bits |= ((_mm_movemask_ps(less2) << 3) - bits) & 0x4;
loData = _mm_min_ps(loData, hiData);
return std::make_pair(AVX::concat(loData, loData), bits);
*/
// 28 cycles Latency:
__m256 x = d.v();
__m256 idx = Vector<float>::IndexesFromZero().data();
__m256 y = Mem::permute128<X1, X0>(x);
__m256 idy = Mem::permute128<X1, X0>(idx);
__m256 less = AVX::cmplt_ps(x, y);
x = _mm256_blendv_ps(y, x, less);
idx = _mm256_blendv_ps(idy, idx, less);
y = Reg::permute<X2, X3, X0, X1>(x);
idy = Reg::permute<X2, X3, X0, X1>(idx);
less = AVX::cmplt_ps(x, y);
x = _mm256_blendv_ps(y, x, less);
idx = _mm256_blendv_ps(idy, idx, less);
y = Reg::permute<X1, X0, X3, X2>(x);
idy = Reg::permute<X1, X0, X3, X2>(idx);
less = AVX::cmplt_ps(x, y);
idx = _mm256_blendv_ps(idy, idx, less);
const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx));
#ifdef Vc_GNU_ASM
__asm__ __volatile__(""); // help GCC to order the instructions better
#endif
x = _mm256_blendv_ps(y, x, less);
return std::make_pair(x, index);
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum() const
{
// a b c d e f g h
// + a b c d e f g -> a ab bc cd de ef fg gh
// + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh
// + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
AVX2::Vector<T> tmp = *this;
if (Size > 1) tmp += tmp.shifted(-1);
if (Size > 2) tmp += tmp.shifted(-2);
if (Size > 4) tmp += tmp.shifted(-4);
if (Size > 8) tmp += tmp.shifted(-8);
if (Size > 16) tmp += tmp.shifted(-16);
return tmp;
}
/* This function requires correct masking because the neutral element of \p op is not necessarily 0
*
template<typename T> template<typename BinaryOperation> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum(BinaryOperation op) const
{
// a b c d e f g h
// + a b c d e f g -> a ab bc cd de ef fg gh
// + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh
// + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
AVX2::Vector<T> tmp = *this;
Mask mask(true);
if (Size > 1) tmp(mask) = op(tmp, tmp.shifted(-1));
if (Size > 2) tmp(mask) = op(tmp, tmp.shifted(-2));
if (Size > 4) tmp(mask) = op(tmp, tmp.shifted(-4));
if (Size > 8) tmp(mask) = op(tmp, tmp.shifted(-8));
if (Size > 16) tmp(mask) = op(tmp, tmp.shifted(-16));
return tmp;
}
*/
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::min(MaskArgument m) const
{
AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::max();
tmp(m) = *this;
return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::max(MaskArgument m) const
{
AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::min();
tmp(m) = *this;
return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::product(MaskArgument m) const
{
AVX2::Vector<T> tmp(Vc::One);
tmp(m) = *this;
return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::sum(MaskArgument m) const
{
AVX2::Vector<T> tmp(Vc::Zero);
tmp(m) = *this;
return tmp.sum();
}//}}}
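// Illustrative example of the masked reductions above: with
// v = {1, 2, 3, 4, 5, 6, 7, 8} and a mask m selecting lanes 0, 2, 4 and 6,
// v.sum(m) == 1 + 3 + 5 + 7 == 16 and v.min(m) == 1. The unselected lanes are
// first replaced by the neutral element of the reduction (0 for sum, 1 for
// product, the largest representable value for min).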
// exponent {{{1
namespace Detail
{
Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v)
{
using namespace AVX;
__m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23);
__m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23);
tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
}
Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v)
{
using namespace AVX;
__m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52);
__m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52);
tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
}
} // namespace Detail
Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x)
{
using Detail::operator>=;
Vc_ASSERT((x >= x.Zero()).isFull());
return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x)
{
using Detail::operator>=;
Vc_ASSERT((x >= x.Zero()).isFull());
return Detail::exponent(x.data());
}
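// Worked example (illustrative): exponent(AVX2::float_v(8.f)) is 3.f in every
// lane, because 8 = 1.0 * 2^3; the helpers above extract the biased IEEE-754
// exponent field and subtract the bias (127 for float, 1023 for double).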
// }}}1
// Random {{{1
static Vc_ALWAYS_INLINE __m256i _doRandomStep()
{
using Detail::operator*;
using Detail::operator+;
#ifdef Vc_IMPL_AVX2
using AVX2::uint_v;
uint_v state0(&Common::RandomState[0]);
uint_v state1(&Common::RandomState[uint_v::Size]);
(state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm256_srli_epi32(state1.data(), 16)))
.store(&Common::RandomState[0]);
return state0.data();
#else
using SSE::uint_v;
uint_v state0(&Common::RandomState[0]);
uint_v state1(&Common::RandomState[uint_v::Size]);
uint_v state2(&Common::RandomState[2 * uint_v::Size]);
uint_v state3(&Common::RandomState[3 * uint_v::Size]);
(state2 * uint_v(0xdeece66du) + uint_v(11))
.store(&Common::RandomState[2 * uint_v::Size]);
(state3 * uint_v(0xdeece66du) + uint_v(11))
.store(&Common::RandomState[3 * uint_v::Size]);
uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm_srli_epi32(state2.data(), 16)))
.store(&Common::RandomState[0]);
uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm_srli_epi32(state3.data(), 16)))
.store(&Common::RandomState[uint_v::Size]);
return AVX::concat(state0.data(), state1.data());
#endif
}
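// Note: the multiplier 0xdeece66d and increment 11 appear to be the rand48
// linear congruential generator constants (0x5deece66d truncated to 32 bits);
// each call advances the state words in Common::RandomState and mixes two of
// them to produce the returned random bits.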
#ifdef Vc_IMPL_AVX2
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::Random()
{
return {_doRandomStep()};
}
#endif
template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random()
{
return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()),
HT::one());
}
template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random()
{
const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned,
Detail::LoadTag<__m256i, int>());
for (size_t k = 0; k < 8; k += 2) {
typedef unsigned long long uint64 Vc_MAY_ALIAS;
const uint64 stateX = *aliasing_cast<uint64>(&Common::RandomState[k]);
*aliasing_cast<uint64>(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11);
}
return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one());
}
// }}}1
// shifted / rotated {{{1
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount) const
{
return Detail::shifted<EntryType>(d.v(), amount);
}
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>)
{
return Mem::shuffle<X2, X3, Y0, Y1>(left, right);
}
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>)
{
return Mem::shuffle128<X1, Y0>(left, right);
}
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount, Vector shiftIn) const
{
#ifdef __GNUC__
if (__builtin_constant_p(amount)) {
const __m256i a = AVX::avx_cast<__m256i>(d.v());
const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v());
if (amount * 2 == int(Size)) {
return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT());
}
if (amount * 2 == -int(Size)) {
return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT());
}
switch (amount) {
case 1:
return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
_mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
sizeof(EntryType))
#else // Vc_IMPL_AVX2
AVX::concat(
_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)),
_mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType)))
#endif // Vc_IMPL_AVX2
);
case 2:
return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
_mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
2 * sizeof(EntryType))
#else // Vc_IMPL_AVX2
AVX::concat(
_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)),
_mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType)))
#endif // Vc_IMPL_AVX2
);
case 3:
if (6u < Size) {
return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
_mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
3 * sizeof(EntryType))
#else // Vc_IMPL_AVX2
AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a),
3 * sizeof(EntryType)),
_mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a),
3 * sizeof(EntryType)))
#endif // Vc_IMPL_AVX2
);
// TODO: } else {
}
}
}
#endif
using Detail::operator|;
return shifted(amount) | (amount > 0 ?
shiftIn.shifted(amount - Size) :
shiftIn.shifted(Size + amount));
}
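// Worked example (illustrative, 8-lane integer vectors): with
// a = {0, 1, 2, 3, 4, 5, 6, 7} and b = {8, 9, 10, 11, 12, 13, 14, 15},
// a.shifted(2, b) == {2, 3, 4, 5, 6, 7, 8, 9}: entries move towards index 0 and
// the vacated high lanes are filled from the low lanes of shiftIn.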
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::rotated(int amount) const
{
return Detail::rotated<EntryType, size()>(d.v(), amount);
}
// sorted {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::sorted()
const
{
return Detail::sorted(*this);
}
// interleaveLow/-High {{{1
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const
{
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_pd(data(), x.data()),
_mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const
{
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_pd(data(), x.data()),
_mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const
{
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_ps(data(), x.data()),
_mm256_unpackhi_ps(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const
{
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_ps(data(), x.data()),
_mm256_unpackhi_ps(data(), x.data()));
}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
#endif
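// Worked example (illustrative): for float_v a = {a0, ..., a7} and
// b = {b0, ..., b7}, a.interleaveLow(b) == {a0, b0, a1, b1, a2, b2, a3, b3} and
// a.interleaveHigh(b) == {a4, b4, a5, b5, a6, b6, a7, b7}.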
// permutation via operator[] {{{1
template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X1, X0, X3, X2>(d.v()));
}
template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[](
Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[](
Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
#endif
template <> Vc_INTRINSIC AVX2::float_v Vector<float, VectorAbi::Avx>::operator[](const IndexType &/*perm*/) const
{
// TODO
return *this;
#ifdef Vc_IMPL_AVX2
#else
/*
const int_m cross128 = AVX::concat(_mm_cmpgt_epi32(AVX::lo128(perm.data()), _mm_set1_epi32(3)),
_mm_cmplt_epi32(AVX::hi128(perm.data()), _mm_set1_epi32(4)));
if (cross128.isNotEmpty()) {
AVX2::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
return x;
} else {
*/
#endif
}
// reversed {{{1
template <typename T>
Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::reversed() const
{
return (*this)[Permutation::Reversed];
}
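// Illustrative example: AVX2::float_v::IndexesFromZero().reversed(), or
// equivalently v[Permutation::Reversed], yields {7, 6, 5, 4, 3, 2, 1, 0}.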
// broadcast from constexpr index {{{1
template <> template <int Index> Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const
{
constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
constexpr VecPos Outer = static_cast<VecPos>((Index & 0x4) / 4);
return Mem::permute<Inner, Inner, Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
template <> template <int Index> Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const
{
constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
constexpr VecPos Outer = static_cast<VecPos>((Index & 0x2) / 2);
return Mem::permute<Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
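// Illustrative example: for AVX2::float_v v = {0, 1, 2, 3, 4, 5, 6, 7},
// v.broadcast<5>() replicates lane 5 into every lane, i.e.
// {5, 5, 5, 5, 5, 5, 5, 5}.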
// }}}1
} // namespace Vc
// vim: foldmethod=marker

257
Vc/avx/vectorhelper.h Normal file
View File

@ -0,0 +1,257 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_VECTORHELPER_H_
#define VC_AVX_VECTORHELPER_H_
#include <limits>
#include "types.h"
#include "intrinsics.h"
#include "casts.h"
#include "../common/loadstoreflags.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template<> struct VectorHelper<__m256>
{
typedef __m256 VectorType;
typedef const VectorType VTArg;
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_ps()); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
template<> struct VectorHelper<__m256d>
{
typedef __m256d VectorType;
typedef const VectorType VTArg;
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_pd()); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
template<> struct VectorHelper<__m256i>
{
typedef __m256i VectorType;
typedef const VectorType VTArg;
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
#define Vc_OP1(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); }
#define Vc_OP_(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op , Vc_SUFFIX)(a, b); }
#define Vc_OPx(op, op2) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); }
template<> struct VectorHelper<double> {
typedef __m256d VectorType;
typedef const VectorType VTArg;
typedef double EntryType;
#define Vc_SUFFIX pd
static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); }
static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d);
}
static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.); }
static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
v1 = _mm256_macc_pd(v1, v2, v3);
#else
VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
// GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
asm("":"+x"(h1), "+x"(h2));
#endif
const VectorType l1 = _mm256_sub_pd(v1, h1);
const VectorType l2 = _mm256_sub_pd(v2, h2);
const VectorType ll = mul(l1, l2);
const VectorType lh = add(mul(l1, h2), mul(h1, l2));
const VectorType hh = mul(h1, h2);
// ll < lh < hh for all entries is certain
const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3)); // |lh| < |v3|
const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
v1 = add(add(ll, b), add(c, hh));
#endif
}
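    // Note on the fma fallback above (when FMA4 is unavailable): it appears to
    // split each operand into a high and a low half (Dekker-style) so the
    // partial products are exact, and to add the smaller-magnitude terms first;
    // this emulates a fused multiply-add with less double-rounding error than a
    // plain mul followed by add.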
static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); }
static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); }
static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); }
Vc_OP1(sqrt)
static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
return _mm256_div_pd(one(), sqrt(x));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
return _mm256_div_pd(one(), x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd());
}
static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
__m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
__m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
__m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
__m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
return _mm_cvtsd_f64(b);
}
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
return _mm256_round_pd(a, _MM_FROUND_NINT);
}
};
template<> struct VectorHelper<float> {
typedef float EntryType;
typedef __m256 VectorType;
typedef const VectorType VTArg;
#define Vc_SUFFIX ps
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
const float e, const float f, const float g, const float h) {
return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.f); }
static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }
static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
v1 = _mm256_macc_ps(v1, v2, v3);
#else
__m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
__m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
__m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
__m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
__m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
__m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
v1 = AVX::concat(
_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
}
static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); }
Vc_OP1(sqrt) Vc_OP1(rsqrt)
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
return _mm256_rcp_ps(x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps());
}
static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
__m128 b = _mm_min_ps(lo128(a), hi128(a));
b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
__m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
__m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
__m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(b);
}
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
return _mm256_round_ps(a, _MM_FROUND_NINT);
}
};
#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx
} // namespace AVX(2)
} // namespace Vc
#endif // VC_AVX_VECTORHELPER_H_

166
Vc/common/algorithms.h Normal file
View File

@ -0,0 +1,166 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ALGORITHMS_H_
#define VC_COMMON_ALGORITHMS_H_
#include "simdize.h"
namespace Vc_VERSIONED_NAMESPACE
{
#ifdef DOXYGEN
/**
* \ingroup Utilities
* \headerfile algorithms.h <Vc/Vc>
*
* Vc variant of the `std::for_each` algorithm.
*
* This algorithm calls \p f with one argument of type
* `Vc::Vector<` *iterator value type* `, ` *unspecified* `>` as often as is needed to
* iterate over the complete range from \p first to \p last.
* It will try to use the best vector size (VectorAbi) to work on the largest chunks
* possible.
* To support aligned loads (and stores) and to support arbitrary range distances, the
* algorithm may require the use of `Vc::VectorAbi` types that work on fewer elements in
* parallel.
*
* The following example requires C++14 for generic lambdas. If you don't have generic
* lambdas available you can use a "classic" functor type with a templated call operator
* instead.
*
* \code
* void scale(std::vector<double> &data, double factor) {
* Vc::simd_for_each(data.begin(), data.end(), [&](auto v) {
* v *= factor;
* });
* }
* \endcode
*/
template <class InputIt, class UnaryFunction>
UnaryFunction simd_for_each(InputIt first, InputIt last, UnaryFunction f);
#else
template <class InputIt, class UnaryFunction,
class ValueType = typename std::iterator_traits<InputIt>::value_type>
inline enable_if<
Traits::is_functor_argument_immutable<UnaryFunction, simdize<ValueType>>::value,
UnaryFunction>
simd_for_each(InputIt first, InputIt last, UnaryFunction f)
{
typedef simdize<ValueType> V;
typedef simdize<ValueType, 1> V1;
const auto lastV = last - V::Size + 1;
for (; first < lastV; first += V::Size) {
V tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
}
for (; first != last; ++first) {
V1 tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
}
return f;
}
template <typename InputIt, typename UnaryFunction,
class ValueType = typename std::iterator_traits<InputIt>::value_type>
inline enable_if<
!Traits::is_functor_argument_immutable<UnaryFunction, simdize<ValueType>>::value,
UnaryFunction>
simd_for_each(InputIt first, InputIt last, UnaryFunction f)
{
typedef simdize<ValueType> V;
typedef simdize<ValueType, 1> V1;
const auto lastV = last - V::size() + 1;
for (; first < lastV; first += V::size()) {
V tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
store_interleaved(tmp, std::addressof(*first));
}
for (; first != last; ++first) {
V1 tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
store_interleaved(tmp, std::addressof(*first));
}
return f;
}
#endif
///////////////////////////////////////////////////////////////////////////////
template <typename InputIt, typename UnaryFunction,
class ValueType = typename std::iterator_traits<InputIt>::value_type>
inline enable_if<
Traits::is_functor_argument_immutable<UnaryFunction, simdize<ValueType>>::value,
UnaryFunction>
simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f)
{
typename std::make_signed<size_t>::type len = count;
typedef simdize<ValueType> V;
typedef simdize<ValueType, 1> V1;
for (; len >= int(V::size()); len -= V::Size, first += V::Size) {
V tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
}
for (; len != 0; --len, ++first) {
V1 tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
}
return f;
}
template <typename InputIt, typename UnaryFunction,
class ValueType = typename std::iterator_traits<InputIt>::value_type>
inline enable_if<
!Traits::is_functor_argument_immutable<UnaryFunction, simdize<ValueType>>::value,
UnaryFunction>
simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f)
{
typename std::make_signed<size_t>::type len = count;
typedef simdize<ValueType> V;
typedef simdize<ValueType, 1> V1;
for (; len >= int(V::size()); len -= V::Size, first += V::Size) {
V tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
store_interleaved(tmp, std::addressof(*first));
}
for (; len != 0; --len, ++first) {
V1 tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
store_interleaved(tmp, std::addressof(*first));
}
return f;
}
} // namespace Vc
#endif // VC_COMMON_ALGORITHMS_H_
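The documentation above mentions a "classic" functor with a templated call operator as the C++11 alternative to a generic lambda. A minimal usage sketch (assuming <Vc/Vc> and <vector> are included; Scale is a hypothetical name):
// Taking the argument by non-const reference selects the simd_for_each overload
// that stores the modified vector back to memory after calling the functor.
struct Scale {
    double factor;
    template <typename V> void operator()(V &v) const { v *= factor; }
};
void scale(std::vector<double> &data, double factor)
{
    Vc::simd_for_each(data.begin(), data.end(), Scale{factor});
}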

121
Vc/common/aliasingentryhelper.h Normal file
View File

@ -0,0 +1,121 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ALIASINGENTRYHELPER_H_
#define VC_COMMON_ALIASINGENTRYHELPER_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template<class StorageType> class AliasingEntryHelper
{
private:
typedef typename StorageType::EntryType T;
#ifdef Vc_ICC
StorageType *const m_storage;
const int m_index;
public:
Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {}
Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default;
Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default;
Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
m_storage->assign(m_index, rhs);
return *this;
}
Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_storage->assign(m_index, x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; }
#define m_data m_storage->read(m_index)
#else
typedef T A Vc_MAY_ALIAS;
A &m_data;
public:
template<typename T2>
Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast<A &>(d)) {}
Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {}
Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
m_data = rhs.m_data;
return *this;
}
Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data = x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; }
#endif
Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; }
Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast<T>(m_data) == x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast<T>(m_data) != x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast<T>(m_data) <= x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast<T>(m_data) >= x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast<T>(m_data) < x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast<T>(m_data) > x; }
Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast<T>(m_data); }
Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast<T>(m_data); }
Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast<T>(m_data) + x; }
Vc_ALWAYS_INLINE Vc_PURE T operator-(T x) const { return static_cast<T>(m_data) - x; }
Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast<T>(m_data) / x; }
Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast<T>(m_data) * x; }
Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast<T>(m_data) | x; }
Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast<T>(m_data) & x; }
Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast<T>(m_data) ^ x; }
Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast<T>(m_data) % x; }
//T operator<<(T x) const { return static_cast<T>(m_data) << x; }
//T operator>>(T x) const { return static_cast<T>(m_data) >> x; }
#ifdef m_data
#undef m_data
#endif
};
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_ALIASINGENTRYHELPER_H_

137
Vc/common/alignedbase.h Normal file
View File

@ -0,0 +1,137 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ALIGNEDBASE_H_
#define VC_COMMON_ALIGNEDBASE_H_
#include "types.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
/**\internal
* Break the recursion of the function below.
*/
template <typename T> constexpr T max(T a) { return a; }
/**\internal
* \returns the maximum of all specified arguments.
*/
template <typename T, typename... Ts> constexpr T max(T a, T b, Ts... rest)
{
return a > b ? max(a, rest...) : max(b, rest...);
}
} // namespace Detail
namespace Common
{
template <std::size_t> Vc_INTRINSIC void *aligned_malloc(std::size_t);
Vc_ALWAYS_INLINE void free(void *);
} // namespace Common
/**
* \ingroup Utilities
*
* Helper class to ensure a given alignment.
*
* This class reimplements the \c new and \c delete operators to align objects allocated
* on the heap suitably with the specified alignment \c Alignment.
*
* \see Vc::VectorAlignedBase
* \see Vc::MemoryAlignedBase
*/
template <std::size_t Alignment> struct alignas(Alignment) AlignedBase
{
Vc_FREE_STORE_OPERATORS_ALIGNED(Alignment);
};
/**
* \ingroup Utilities
*
* Helper type to ensure suitable alignment for any Vc::Vector<T> type (using the default
* VectorAbi).
*
* This class reimplements the \c new and \c delete operators to align objects allocated
* on the heap suitably for objects of Vc::Vector<T> type. This is necessary since the
* standard \c new operator does not adhere to the alignment requirements of the type.
*
* \see Vc::VectorAlignedBaseT
* \see Vc::MemoryAlignedBase
* \see Vc::AlignedBase
*/
using VectorAlignedBase = AlignedBase<
Detail::max(alignof(Vector<float>), alignof(Vector<double>), alignof(Vector<ullong>),
alignof(Vector<llong>), alignof(Vector<ulong>), alignof(Vector<long>),
alignof(Vector<uint>), alignof(Vector<int>), alignof(Vector<ushort>),
alignof(Vector<short>), alignof(Vector<uchar>), alignof(Vector<schar>))>;
/**
* \ingroup Utilities
* Variant of the above type ensuring suitable alignment only for the specified vector
* type \p V.
*
* \see Vc::VectorAlignedBase
* \see Vc::MemoryAlignedBaseT
*/
template <typename V> using VectorAlignedBaseT = AlignedBase<alignof(V)>;
/**
* \ingroup Utilities
*
* Helper class to ensure suitable alignment for arrays of scalar objects for any
* Vc::Vector<T> type (using the default VectorAbi).
*
* This class reimplements the \c new and \c delete operators to align objects allocated
* on the heap suitably for arrays of type \p Vc::Vector<T>::EntryType. Subsequent load
* and store operations are safe to use the aligned variant.
*
* \see Vc::MemoryAlignedBaseT
* \see Vc::VectorAlignedBase
* \see Vc::AlignedBase
*/
using MemoryAlignedBase = AlignedBase<
Detail::max(Vector<float>::MemoryAlignment, Vector<double>::MemoryAlignment,
Vector<ullong>::MemoryAlignment, Vector<llong>::MemoryAlignment,
Vector<ulong>::MemoryAlignment, Vector<long>::MemoryAlignment,
Vector<uint>::MemoryAlignment, Vector<int>::MemoryAlignment,
Vector<ushort>::MemoryAlignment, Vector<short>::MemoryAlignment,
Vector<uchar>::MemoryAlignment, Vector<schar>::MemoryAlignment)>;
/**
* \ingroup Utilities
* Variant of the above type ensuring suitable alignment only for the specified vector
* type \p V.
*
* \see Vc::MemoryAlignedBase
* \see Vc::VectorAlignedBaseT
*/
template <typename V> using MemoryAlignedBaseT = AlignedBase<V::MemoryAlignment>;
}
#endif // VC_COMMON_ALIGNEDBASE_H_
// vim: foldmethod=marker
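A short usage sketch for the helpers above (assuming the Vc headers; Cell is a hypothetical type): deriving from Vc::VectorAlignedBase swaps in the aligned operator new/delete, so heap-allocated objects satisfy the alignment of their Vc::Vector members.
struct Cell : public Vc::VectorAlignedBase {
    Vc::float_v density;   // members require Vector alignment
    Vc::float_v pressure;
};
void example()
{
    Cell *c = new Cell;    // uses the aligned operator new from AlignedBase
    delete c;              // and the matching aligned operator delete
}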

62
Vc/common/bitscanintrinsics.h Normal file
View File

@ -0,0 +1,62 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_BITSCANINTRINSICS_H_
#define VC_COMMON_BITSCANINTRINSICS_H_
#if defined(Vc_GCC) || defined(Vc_CLANG) || defined(Vc_APPLECLANG)
#include <x86intrin.h>
# ifndef _bit_scan_forward
# define _bit_scan_forward(x) __builtin_ctz(x)
#include "macros.h"
static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) {
int r;
__asm__("bsr %1,%0" : "=r"(r) : "X"(x));
return r;
}
# define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x)
# endif
#elif defined(_WIN32)
#include <intrin.h>
static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) {
unsigned long index;
_BitScanForward(&index, x);
return index;
}
static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) {
unsigned long index;
_BitScanReverse(&index, x);
return index;
}
#elif defined(Vc_ICC)
// for all I know ICC supports the _bit_scan_* intrinsics
#else
// just assume the compiler can do it
#endif
#endif // VC_COMMON_BITSCANINTRINSICS_H_

92
Vc/common/const.h Normal file
View File

@ -0,0 +1,92 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_CONST_H_
#define VC_COMMON_CONST_H_
#include <type_traits>
#include "../global.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, true>);
template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, false>);
template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, true>)
{
return 1.;
}
template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, false>)
{
return 1.;
}
template <> constexpr double exponentToFloat<-32>(std::integral_constant<bool, true>)
{
return 1. / (65536. * 65536.);
}
template <> constexpr double exponentToFloat<32>(std::integral_constant<bool, false>)
{
return 65536. * 65536.;
}
template <> constexpr double exponentToFloat<-64>(std::integral_constant<bool, true>)
{
return 1. / (65536. * 65536. * 65536. * 65536.);
}
template <> constexpr double exponentToFloat<64>(std::integral_constant<bool, false>)
{
return 65536. * 65536. * 65536. * 65536.;
}
template <int exponent>
constexpr double exponentToFloat(std::integral_constant<bool, false> negative)
{
return exponentToFloat<exponent - 1>(negative) * 2.0;
}
template <int exponent>
constexpr double exponentToFloat(std::integral_constant<bool, true> negative)
{
return exponentToFloat<exponent + 1>(negative) * 0.5;
}
template <int sign, unsigned long long mantissa, int exponent> constexpr double doubleConstant()
{
return (static_cast<double>((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) /
0x0010000000000000ull) *
exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>()) * sign;
}
template <int sign, unsigned int mantissa, int exponent> constexpr float floatConstant()
{
return (static_cast<float>((mantissa & 0x007fffffu) | 0x00800000u) / 0x00800000u) *
static_cast<float>(
exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>())) *
sign;
}
} // namespace Detail
} // namespace Vc
#endif // VC_COMMON_CONST_H_
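The helpers above assemble IEEE-754 constants at compile time from a sign, the mantissa field, and a power-of-two exponent. A hand-checked sketch of how a constant like π would be expressed with them (for illustration; not part of this file):
// (1 + 0x921fb54442d18 / 2^52) * 2^1 * (+1) == 3.14159265358979...
constexpr double pi = Vc::Detail::doubleConstant<1, 0x921fb54442d18ull, 1>();
static_assert(pi > 3.14159 && pi < 3.1416, "sanity check");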

43
Vc/common/data.h Normal file
View File

@ -0,0 +1,43 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_CONST_DATA_H_
#define VC_COMMON_CONST_DATA_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
alignas(64) extern unsigned int RandomState[];
alignas(32) extern const unsigned int AllBitsSet[8];
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_CONST_DATA_H_

91
Vc/common/deinterleave.h Normal file
View File

@ -0,0 +1,91 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_DEINTERLEAVE_H_
#define VC_COMMON_DEINTERLEAVE_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \ingroup Vectors
*
* \deprecated Turn to InterleavedMemoryWrapper for a more flexible and complete solution.
*
* Loads two vectors of values from an interleaved array.
*
* \param a, b The vectors to load the values from memory into.
* \param memory The memory location where to read the next 2 * V::Size values from
* \param align Either pass Vc::Aligned or Vc::Unaligned. It defaults to Vc::Aligned if nothing is
* specified.
*
* If you store your data as
* \code
* struct { float x, y; } m[1000];
* \endcode
* then the deinterleave function allows you to read \p Size concurrent x and y values like this:
* \code
* Vc::float_v x, y;
* Vc::deinterleave(&x, &y, &m[10], Vc::Unaligned);
* \endcode
* This code will load m[10], m[12], m[14], ... into \p x and m[11], m[13], m[15], ... into \p y.
*
* The deinterleave function supports the following type combinations:
\verbatim
V \ M | float | double | ushort | short | uint | int
=========|=======|========|========|=======|======|=====
float_v | X | | X | X | |
---------|-------|--------|--------|-------|------|-----
double_v | | X | | | |
---------|-------|--------|--------|-------|------|-----
int_v | | | | X | | X
---------|-------|--------|--------|-------|------|-----
uint_v | | | X | | X |
---------|-------|--------|--------|-------|------|-----
short_v | | | | X | |
---------|-------|--------|--------|-------|------|-----
ushort_v | | | X | | |
\endverbatim
*/
template<typename V, typename M, typename A> Vc_ALWAYS_INLINE void deinterleave(V *a, V *b,
const M *memory, A align)
{
Detail::deinterleave(*a, *b, memory, align);
}
// documented as default for align above
template<typename V, typename M> Vc_ALWAYS_INLINE void deinterleave(V *a, V *b,
const M *memory)
{
Detail::deinterleave(*a, *b, memory, Aligned);
}
} // namespace Vc
#endif // VC_COMMON_DEINTERLEAVE_H_
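The deprecation note above points to InterleavedMemoryWrapper. A rough sketch of the equivalent access for the x/y struct from the example (based on Vc's interleaved-memory interface; treat the exact calls as an assumption):
struct XY { float x, y; };
XY m[1000];
void read_xy(Vc::float_v &x, Vc::float_v &y)
{
    Vc::InterleavedMemoryWrapper<XY, Vc::float_v> wrapper(&m[0]);
    Vc::tie(x, y) = wrapper[10];   // deinterleaves m[10].x... into x and m[10].y... into y
}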

137
Vc/common/detail.h Normal file
View File

@ -0,0 +1,137 @@
/* This file is part of the Vc library. {{{
Copyright © 2018 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_DETAIL_H_
#define VC_COMMON_DETAIL_H_
#include <vector>
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
// convertIndexVector {{{
// if the argument is a Vector<T> already we definitely want to keep it that way
template <typename IV>
Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
sizeof(typename IV::EntryType) >= sizeof(int)),
const IV &>
convertIndexVector(const IV &indexVector)
{
return indexVector;
}
// but if the scalar (integral) type is smaller than int we convert it up to int. Otherwise it's
// very likely that the calculations we have to perform will overflow.
template <typename IV>
Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
sizeof(typename IV::EntryType) < sizeof(int)),
fixed_size_simd<int, IV::Size>>
convertIndexVector(const IV &indexVector)
{
return static_cast<fixed_size_simd<int, IV::Size>>(indexVector);
}
// helper for promoting int types to int or higher
template <class T> using promoted_type = decltype(std::declval<T>() + 1);
// std::array, Vc::array, and C-array are fixed size and can therefore be converted to a
// fixed_size_simd of the same size
template <typename T, std::size_t N>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
convertIndexVector(const std::array<T, N> &indexVector)
{
return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
Vc::Unaligned};
}
template <typename T, std::size_t N>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
convertIndexVector(const Vc::array<T, N> &indexVector)
{
return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
Vc::Unaligned};
}
template <typename T, std::size_t N>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
convertIndexVector(const T (&indexVector)[N])
{
return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
Vc::Unaligned};
}
// a plain pointer won't work. Because we need some information on the number of values in
// the index argument
#ifndef Vc_MSVC
// MSVC treats the function as usable in SFINAE context if it is deleted. If it's not declared we
// seem to get what we wanted (except for bad diagnostics)
template <class T>
enable_if<std::is_pointer<T>::value, void> convertIndexVector(T indexVector) = delete;
#endif
// an initializer_list works, but is runtime-sized (before C++14, at least) so we have to
// fall back to std::vector
template <typename T>
Vc_INTRINSIC std::vector<promoted_type<T>> convertIndexVector(
const std::initializer_list<T> &indexVector)
{
return {begin(indexVector), end(indexVector)};
}
// a std::vector cannot be converted to anything better
template <typename T>
Vc_INTRINSIC
enable_if<(std::is_integral<T>::value && sizeof(T) >= sizeof(int)), std::vector<T>>
convertIndexVector(const std::vector<T> &indexVector)
{
return indexVector;
}
template <typename T>
Vc_INTRINSIC enable_if<(std::is_integral<T>::value && sizeof(T) < sizeof(int)),
std::vector<promoted_type<T>>>
convertIndexVector(const std::vector<T> &indexVector)
{
return {std::begin(indexVector), std::end(indexVector)};
}
template <class T,
class = enable_if<
(!std::is_pointer<T>::value && !Traits::is_simd_vector<T>::value &&
!std::is_lvalue_reference<decltype(std::declval<const T &>()[0])>::value)>>
Vc_INTRINSIC const T &convertIndexVector(const T &i)
{
return i;
}
// }}}
} // namespace Common
} // namespace Vc_VERSIONED_NAMESPACE
#endif // VC_COMMON_DETAIL_H_
// vim: foldmethod=marker

178
Vc/common/elementreference.h Normal file
View File

@ -0,0 +1,178 @@
/* This file is part of the Vc library. {{{
Copyright © 2016 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ELEMENTREFERENCE_H_
#define VC_COMMON_ELEMENTREFERENCE_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename U, typename Accessor = U> class ElementReference
{
friend U;
friend Accessor;
Vc_INTRINSIC ElementReference(U &o, int i) noexcept : index(i), obj(o) {}
static constexpr bool get_noexcept =
noexcept(Accessor::get(std::declval<U &>(), int()));
template <typename T> static constexpr bool set_noexcept()
{
return noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>()));
}
public:
using value_type = typename U::value_type;
Vc_INTRINSIC ElementReference(const ElementReference &) = delete;
/**
* Move Constructor
*
* This is the only way to construct an ElementReference in user code.
*
* \note
* Please be aware that this class models the concept of a reference
* and as such it can have the same lifetime issue as a standard C++
* reference.
*
* \note
* C++17 supports guaranteed copy elision, which allows returning the
* ElementReference obtained via operator[] from a function without
* copying. C++11 and C++14 do not offer this, so we add the move
* constructor to let them move the data and thus avoid copying (which
* the deleted copy constructor above would otherwise prohibit).
*/
Vc_INTRINSIC ElementReference(ElementReference &&) = default;
Vc_INTRINSIC operator value_type() const noexcept(get_noexcept)
{
return Accessor::get(obj, index);
}
template <typename T>
Vc_INTRINSIC ElementReference &operator=(T &&x) &&
noexcept(noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>())))
{
Accessor::set(obj, index, std::forward<T>(x));
return *this;
}
// TODO: improve with operator.()
#define Vc_OP_(op_) \
template <typename T, typename R = decltype(std::declval<const value_type &>() \
op_ std::declval<T>())> \
Vc_INTRINSIC ElementReference &operator op_##=(T &&x) && \
noexcept(get_noexcept && noexcept(Accessor::set(std::declval<U &>(), int(), \
std::declval<R &&>()))) \
{ \
const value_type &lhs = Accessor::get(obj, index); \
Accessor::set(obj, index, lhs op_ std::forward<T>(x)); \
return *this; \
}
Vc_ALL_ARITHMETICS(Vc_OP_);
Vc_ALL_SHIFTS(Vc_OP_);
Vc_ALL_BINARY(Vc_OP_);
#undef Vc_OP_
template <typename = void>
Vc_INTRINSIC ElementReference &operator++() &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(++std::declval<value_type &>())>())
{
value_type x = Accessor::get(obj, index);
Accessor::set(obj, index, ++x);
return *this;
}
template <typename = void>
Vc_INTRINSIC value_type operator++(int) &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(std::declval<value_type &>()++)>())
{
const value_type r = Accessor::get(obj, index);
value_type x = r;
Accessor::set(obj, index, ++x);
return r;
}
template <typename = void>
Vc_INTRINSIC ElementReference &operator--() &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(--std::declval<value_type &>())>())
{
value_type x = Accessor::get(obj, index);
Accessor::set(obj, index, --x);
return *this;
}
template <typename = void>
Vc_INTRINSIC value_type operator--(int) &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(std::declval<value_type &>()--)>())
{
const value_type r = Accessor::get(obj, index);
value_type x = r;
Accessor::set(obj, index, --x);
return r;
}
friend void swap(ElementReference &&a, ElementReference &&b) {
value_type tmp(a);
static_cast<ElementReference &&>(a) = static_cast<value_type>(b);
static_cast<ElementReference &&>(b) = tmp;
}
friend void swap(value_type &a, ElementReference &&b) {
value_type tmp(a);
a = static_cast<value_type>(b);
static_cast<ElementReference &&>(b) = tmp;
}
friend void swap(ElementReference &&a, value_type &b) {
value_type tmp(a);
static_cast<ElementReference &&>(a) = b;
b = tmp;
}
private:
int index;
U &obj;
};
} // namespace Detail
} // namespace Vc
#endif // VC_COMMON_ELEMENTREFERENCE_H_
// vim: foldmethod=marker
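What the proxy class above enables in user code (a sketch, assuming <Vc/Vc>): subscripting a Vc vector yields an ElementReference rather than a plain lvalue, so reads and compound assignments are routed through the owning vector's get/set accessors.
void bump(Vc::float_v &v)
{
    v[2] += 1.f;        // operator+= on the rvalue proxy: get, modify, set
    float x = v[2];     // implicit conversion back to value_type
    (void)x;
}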

91
Vc/common/exponential.h Normal file
View File

@ -0,0 +1,91 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-------------------------------------------------------------------
The exp implementation is derived from Cephes, which carries the
following Copyright notice:
Cephes Math Library Release 2.2: June, 1992
Copyright 1984, 1987, 1989 by Stephen L. Moshier
Direct inquiries to 30 Frost Street, Cambridge, MA 02140
}}}*/
#ifdef Vc_COMMON_MATH_H_INTERNAL
constexpr float log2_e = 1.44269504088896341f;
// These constants are adjusted to account for single-precision floating point.
// The originals are for double precision:
//
// constexpr float MAXLOGF = 88.72283905206835f;
// constexpr float MINLOGF = -103.278929903431851103f; /* log(2^-149) */
constexpr float MAXLOGF = 88.722831726074219f; /* log(2^127.99998474121094f) */
constexpr float MINLOGF = -88.029685974121094f; /* log(2^-126.99999237060547f) */
constexpr float MAXNUMF = 3.4028234663852885981170418348451692544e38f;
template <typename Abi, typename = enable_if<std::is_same<Abi, VectorAbi::Sse>::value ||
std::is_same<Abi, VectorAbi::Avx>::value>>
inline Vector<float, detail::not_fixed_size_abi<Abi>> exp(Vector<float, Abi> x)
{
using V = Vector<float, Abi>;
typedef typename V::Mask M;
typedef Detail::Const<float, Abi> C;
const M overflow = x > MAXLOGF;
const M underflow = x < MINLOGF;
// log₂(eˣ) = x * log₂(e) * log₂(2)
// = log₂(2^(x * log₂(e)))
// => eˣ = 2^(x * log₂(e))
// => n = ⌊x * log₂(e) + ½⌋
// => y = x - n * ln(2) | recall that: ln(2) * log₂(e) == 1
// <=> eˣ = 2ⁿ * eʸ
V z = floor(C::log2_e() * x + 0.5f);
const auto n = static_cast<Vc::SimdArray<int, V::Size>>(z);
x -= z * C::ln2_large();
x -= z * C::ln2_small();
/* Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9. */
z = ((((( 1.9875691500E-4f * x
+ 1.3981999507E-3f) * x
+ 8.3334519073E-3f) * x
+ 4.1665795894E-2f) * x
+ 1.6666665459E-1f) * x
+ 5.0000001201E-1f) * (x * x)
+ x
+ 1.0f;
x = ldexp(z, n); // == z * 2ⁿ
x(overflow) = std::numeric_limits<typename V::EntryType>::infinity();
x.setZero(underflow);
return x;
}
#endif // Vc_COMMON_MATH_H_INTERNAL
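A scalar sketch of the same Cephes-style range reduction (a restatement for illustration, assuming <cmath>; the vector code above performs the identical steps element-wise and additionally handles overflow and underflow):
float exp_sketch(float x)
{
    // e^x = 2^n * e^y with n = floor(x * log2(e) + 0.5) and y = x - n * ln(2);
    // ln(2) is split into a large and a small part for accuracy (Cephes constants).
    const float n = std::floor(1.44269504088896341f * x + 0.5f);
    float y = x - n * 0.693359375f;
    y -= n * -2.12194440e-4f;
    float z = (((((1.9875691500E-4f * y + 1.3981999507E-3f) * y + 8.3334519073E-3f) * y
                + 4.1665795894E-2f) * y + 1.6666665459E-1f) * y + 5.0000001201E-1f)
              * (y * y) + y + 1.0f;
    return std::ldexp(z, static_cast<int>(n));   // == z * 2^n
}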

79
Vc/common/fix_clang_emmintrin.h Normal file
View File

@ -0,0 +1,79 @@
/*{{{
Copyright (C) 2013-2015 Matthias Kretz <kretz@kde.org>
Permission to use, copy, modify, and distribute this software
and its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appear in all
copies and that both that the copyright notice and this
permission notice and warranty disclaimer appear in supporting
documentation, and that the name of the author not be used in
advertising or publicity pertaining to distribution of the
software without specific, written prior permission.
The author disclaim all warranties with regard to this
software, including all implied warranties of merchantability
and fitness. In no event shall the author be liable for any
special, indirect or consequential damages or any damages
whatsoever resulting from loss of use, data or profits, whether
in an action of contract, negligence or other tortious action,
arising out of or in connection with the use or performance of
this software.
}}}*/
#ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H_
#define VC_COMMON_FIX_CLANG_EMMINTRIN_H_
#include "../global.h"
#if (defined Vc_CLANG && Vc_CLANG < 0x30700) || (defined Vc_APPLECLANG && Vc_APPLECLANG < 0x70000)
#ifdef _mm_slli_si128
#undef _mm_slli_si128
#define _mm_slli_si128(a, count) __extension__ ({ \
(__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); })
#endif
#ifdef _mm_srli_si128
#undef _mm_srli_si128
#define _mm_srli_si128(a, count) __extension__ ({ \
(__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); })
#endif
#ifdef _mm_shuffle_epi32
#undef _mm_shuffle_epi32
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \
(imm) & 0x3, ((imm) & 0xc) >> 2, \
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
#endif
#ifdef _mm_shufflelo_epi16
#undef _mm_shufflelo_epi16
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
(imm) & 0x3, ((imm) & 0xc) >> 2, \
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
4, 5, 6, 7); })
#endif
#ifdef _mm_shufflehi_epi16
#undef _mm_shufflehi_epi16
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
0, 1, 2, 3, \
4 + (((imm) & 0x03) >> 0), \
4 + (((imm) & 0x0c) >> 2), \
4 + (((imm) & 0x30) >> 4), \
4 + (((imm) & 0xc0) >> 6)); })
#endif
#ifdef _mm_shuffle_pd
#undef _mm_shuffle_pd
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
__builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); })
#endif
#endif // Vc_CLANG || Vc_APPLECLANG
#endif // VC_COMMON_FIX_CLANG_EMMINTRIN_H_

318
Vc/common/gatherimplementation.h Normal file
View File

@ -0,0 +1,318 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_GATHERIMPLEMENTATION_H_
#define VC_COMMON_GATHERIMPLEMENTATION_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
enum class GatherScatterImplementation : int {
SimpleLoop,
SetIndexZero,
BitScanLoop,
PopcntSwitch
};
using SimpleLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SimpleLoop>;
using SetIndexZeroT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SetIndexZero>;
using BitScanLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::BitScanLoop>;
using PopcntSwitchT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::PopcntSwitch>;
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT,
V &v,
const MT *mem,
IT &&indexes_,
typename V::MaskArgument mask)
{
auto indexes = std::forward<IT>(indexes_);
indexes.setZeroInverted(static_cast<decltype(!indexes)>(mask));
const V tmp(mem, indexes);
where(mask) | v = tmp;
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, V &v, const MT *mem, const IT &indexes,
const typename V::MaskArgument mask)
{
if (Vc_IS_UNLIKELY(mask.isEmpty())) {
return;
}
#if defined Vc_GCC && Vc_GCC >= 0x40900
// GCC 4.8 doesn't support dependent type and constexpr vector_size argument
constexpr std::size_t Sizeof = sizeof(V);
using Builtin [[gnu::vector_size(Sizeof)]] = typename V::value_type;
Builtin tmp = reinterpret_cast<Builtin>(v.data());
Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
if (mask[i]) {
tmp[i] = mem[indexes[i]];
}
});
v.data() = reinterpret_cast<typename V::VectorType>(tmp);
#else
Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
if (mask[i])
v[i] = mem[indexes[i]];
});
#endif
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(BitScanLoopT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask)
{
#ifdef Vc_GNU_ASM
size_t bits = mask.toInt();
while (Vc_IS_LIKELY(bits > 0)) {
size_t i, j;
asm("bsf %[bits],%[i]\n\t"
"bsr %[bits],%[j]\n\t"
"btr %[i],%[bits]\n\t"
"btr %[j],%[bits]\n\t"
: [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
v[i] = mem[indexes[i]];
v[j] = mem[indexes[j]];
}
#else
// Alternative from Vc::SSE (0.7)
int bits = mask.toInt();
while (bits) {
const int i = _bit_scan_forward(bits);
bits &= bits - 1;
v[i] = mem[indexes[i]];
}
#endif // Vc_GNU_ASM
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 16> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt16(bits)) {
case 16:
v.gather(mem, indexes);
break;
case 15:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
v[low] = mem[indexes[low]];
// fallthrough
case 14:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 13:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 12:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 11:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 10:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 9:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 8:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 7:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 6:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 4:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 2:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
// fallthrough
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
// fallthrough
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 8> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt8(bits)) {
case 8:
v.gather(mem, indexes);
break;
case 7:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
v[low] = mem[indexes[low]];
// fallthrough
case 6:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 4:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 2:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
// fallthrough
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
// fallthrough
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 4> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt4(bits)) {
case 4:
v.gather(mem, indexes);
break;
case 3:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
v[low] = mem[indexes[low]];
// fallthrough
case 2:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
// fallthrough
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
// fallthrough
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 2> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low;
switch (Vc::Detail::popcnt4(bits)) {
case 2:
v.gather(mem, indexes);
break;
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
// fallthrough
case 0:
break;
}
}
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_GATHERIMPLEMENTATION_H_
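The BitScanLoop and PopcntSwitch strategies above touch only the active lanes by peeling set bits off the mask. The core idea in portable scalar form (a sketch, assuming GCC/Clang's __builtin_ctz and a mask of at most 32 bits):
void gather_active(float *v, const float *mem, const int *indexes, unsigned mask)
{
    while (mask != 0) {
        const int i = __builtin_ctz(mask);   // lowest set bit = next active lane
        mask &= mask - 1;                    // clear that bit
        v[i] = mem[indexes[i]];              // gather only where the mask was true
    }
}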

221
Vc/common/gatherinterface.h Normal file
View File

@ -0,0 +1,221 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif
///////////////////////////////////////////////////////////////////////////////////////////
// gathers
// A gather takes the following arguments:
// 1. A const pointer to memory of any type that can convert to EntryType
// 2. An indexes “vector”. The requirement is that the type implements the subscript operator,
// stores «Size» valid index values, and each offset to the pointer above yields a valid
// memory location for reading.
// 3. Optionally the third argument may be a mask. The mask disables several memory reads and
// thus removes the requirements in (2.) for the disabled entries.
private:
/**\internal
* This function implements a gather given a pointer to memory \p mem and some
* container object storing the gather \p indexes.
*
* \param mem This pointer must be aligned correctly for the type \p MT. This is the
* natural behavior of C++, so this is typically the case.
* \param indexes This object contains at least \VSize{T} indexes that denote the
* offset in \p mem where the components for the current vector should be copied from.
* The offset is not in Bytes, but in multiples of `sizeof(MT)`.
*/
// enable_if<std::can_convert<MT, EntryType>::value &&
// has_subscript_operator<IT>::value>
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
/**\internal
* This overload of the above function adds a \p mask argument to disable memory
* accesses at the \p indexes offsets where \p mask is \c false.
*/
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
MaskArgument mask);
public:
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<MT, EntryType>::value, \
"The memory pointer needs to point to a type that can be converted to the " \
"EntryType of this SIMD vector type."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
/**
* \name Gather constructors and member functions
*
* Constructs or loads a vector from the objects at `mem[indexes[0]]`,
* `mem[indexes[1]]`, `mem[indexes[2]]`, ...
*
* All gather functions optionally take a mask as last argument. In that case only the
* entries that are selected in the mask are accessed in memory and copied to the
* vector. This enables invalid indexes in the \p indexes vector if those are masked
* off in \p mask.
*
* Gathers from structured data (AoS: arrays of struct) are possible via a special
* subscript operator of the container (array). You can use \ref Vc::array and \ref
* Vc::vector as drop-in replacements for \c std::array and \c std::vector. These
* container classes contain the necessary subscript operator overload. Example:
* \code
* Vc::vector<float> data(100);
* std::iota(data.begin(), data.end(), 0.f); // fill with values 0, 1, 2, ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v gathered = data[indexes]; // gathered == [0, 1, 2, ...]
* \endcode
*
* This also works for gathers into arrays of structures:
* \code
* struct Point { float x, y, z; };
* Vc::array<Point, 100> points;
* // fill points ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v xs = points[indexes][&Point::x]; // [points[0].x, points[1].x, points[2].x, ...]
* float_v ys = points[indexes][&Point::y]; // [points[0].y, points[1].y, points[2].y, ...]
* float_v zs = points[indexes][&Point::z]; // [points[0].z, points[1].z, points[2].z, ...]
* \endcode
*
* Alternatively, you can use Vc::Common::AdaptSubscriptOperator to extend a given
* container class with the necessary subscript operator. Example:
* \code
* template <typename T, typename Allocator = std::allocator<T>>
* using my_vector = Vc::Common::AdaptSubscriptOperator<std::vector<T, Allocator>>;
* \endcode
*
* \param mem A pointer to memory which contains objects of type \p MT at the offsets
* given by \p indexes.
* \param indexes A container/vector of offsets into \p mem.
* The type of \p indexes (\p IT) may either be a pointer to integers
 * (C-array) or a vector of integers (preferably IndexType).
* \param mask If a mask is given, only the active entries will be copied from memory.
*
 * \note If you use a masked gather constructor, the masked-off entries of the vector
 * are zero-initialized.
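 *
 * A masked gather sketch, reusing \c data, \c indexes, and \c gathered from the examples
 * above (the mask condition is only illustrative):
 * \code
 * float_v::MaskType mask = gathered < 50.f;     // select a subset of lanes
 * float_v partial(data.data(), indexes, mask);  // masked-off lanes are zero-initialized
 * \endcode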
*/
///@{
/// Gather constructor
template <typename MT, typename IT,
typename = enable_if<Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
/// Masked gather constructor
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
/// Gather function
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
/// Masked gather function
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
///@}
#include "gatherinterface_deprecated.h"
/**\internal
* \name Gather function to use from Vc::Common::subscript_operator
*
 * \param args The prepared gather arguments (base pointer and scaled indexes).
 * \param mask Only the entries selected in \p mask are read from memory.
*/
///@{
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
///@}
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_

View File

@ -0,0 +1,300 @@
/// \name Deprecated Members
///@{
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
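 *
 * A hedged sketch of the modern replacement (the struct and names are illustrative):
 * \code
 * struct S { float x, y, z; };
 * Vc::array<S, 100> aos;            // keep the AoS data in a Vc::array
 * float_v v = aos[indexes][&S::x];  // instead of: float_v v(aos.data(), &S::x, indexes)
 * \endcode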
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const EntryType S1::*member1,
IT indexes)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const EntryType S1::*member1,
IT indexes, MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const S2 S1::*member1,
const EntryType S2::*member2,
IT indexes)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const S2 S1::*member1,
const EntryType S2::*member2,
IT indexes, MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
 * \param ptrMember1 If \p array points to a struct, \p ptrMember1 selects the pointer member of the
 *                   struct to be followed. Thus the offsets in \p outerIndexes are relative to
 *                   \p array and not to the size of the gathered type (i.e.
 *                   array[i].*ptrMember1 is read instead of (&(array->*ptrMember1))[i]).
 * \param outerIndexes Determines the offsets into \p array, i.e. from which structs the pointer
 *                     member \p ptrMember1 is read.
 * \param innerIndexes Determines the offsets into the arrays pointed to by \p ptrMember1.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED(
"use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const EntryType *const S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes)
{
gather(Common::SubscriptOperation<const S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
 * \param ptrMember1 If \p array points to a struct, \p ptrMember1 selects the pointer member of the
 *                   struct to be followed. Thus the offsets in \p outerIndexes are relative to
 *                   \p array and not to the size of the gathered type (i.e.
 *                   array[i].*ptrMember1 is read instead of (&(array->*ptrMember1))[i]).
 * \param outerIndexes Determines the offsets into \p array, i.e. from which structs the pointer
 *                     member \p ptrMember1 is read.
 * \param innerIndexes Determines the offsets into the arrays pointed to by \p ptrMember1.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED(
"use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const EntryType *const S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes,
MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array,
const EntryType S1::*member1, IT indexes)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array,
const EntryType S1::*member1,
IT indexes,
MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array, const S2 S1::*member1,
const EntryType S2::*member2, IT indexes)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array, const S2 S1::*member1,
const EntryType S2::*member2, IT indexes,
MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
 * \param ptrMember1 If \p array points to a struct, \p ptrMember1 selects the pointer member of the
 *                   struct to be followed. Thus the offsets in \p outerIndexes are relative to
 *                   \p array and not to the size of the gathered type (i.e.
 *                   array[i].*ptrMember1 is read instead of (&(array->*ptrMember1))[i]).
 * \param outerIndexes Determines the offsets into \p array, i.e. from which structs the pointer
 *                     member \p ptrMember1 is read.
 * \param innerIndexes Determines the offsets into the arrays pointed to by \p ptrMember1.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array,
const EntryType *const S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes)
{
gather(Common::SubscriptOperation<const S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
 * \param ptrMember1 If \p array points to a struct, \p ptrMember1 selects the pointer member of the
 *                   struct to be followed. Thus the offsets in \p outerIndexes are relative to
 *                   \p array and not to the size of the gathered type (i.e.
 *                   array[i].*ptrMember1 is read instead of (&(array->*ptrMember1))[i]).
 * \param outerIndexes Determines the offsets into \p array, i.e. from which structs the pointer
 *                     member \p ptrMember1 is read.
 * \param innerIndexes Determines the offsets into the arrays pointed to by \p ptrMember1.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array,
const EntryType *const S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes,
MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.gatherArguments(),
mask);
}
///@}

View File

@ -0,0 +1,61 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
public:
///////////////////////////////////////////////////////////////////////////
// init to zero
Vc_INTRINSIC Vector() = default;
///////////////////////////////////////////////////////////////////////////
// types
///////////////////////////////////////////////////////////////////////////
// constants
static constexpr std::size_t size() { return Size; }
///////////////////////////////////////////////////////////////////////////
// constant Vectors
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
{
return Vector(Vc::IndexesFromZero);
}
///////////////////////////////////////////////////////////////////////////
// generator ctor
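// The generator g is invoked with each lane index i in [0, size()). A minimal usage
// sketch (the lambda is illustrative): Vector([](size_t i) { return 2 * i; }) yields
// the entries [0, 2, 4, ...].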
template <class G, int = 0,
class = typename std::enable_if<std::is_convertible<
decltype(std::declval<G>()(size_t())), value_type>::value>::type>
explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
{
}
// vim: foldmethod=marker

97
Vc/common/iif.h Normal file
View File

@ -0,0 +1,97 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_IIF_H_
#define VC_COMMON_IIF_H_
#include "../type_traits"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \ingroup Utilities
*
* Function to mimic the ternary operator '?:' (inline-if).
*
 * \param condition Determines which values are returned. This is analogous to the first argument to
* the ternary operator.
* \param trueValue The values to return where \p condition is \c true.
* \param falseValue The values to return where \p condition is \c false.
* \return A combination of entries from \p trueValue and \p falseValue, according to \p condition.
*
* So instead of the scalar variant
* \code
* float x = a > 1.f ? b : b + c;
* \endcode
* you'd write
* \code
* float_v x = Vc::iif (a > 1.f, b, b + c);
* \endcode
*
 * Assuming \c a has the values [0, 3, 5, 1], \c b is [1, 1, 1, 1], and \c c is [1, 2, 3, 4], then x
 * will be [2, 1, 1, 5]: where \c a is greater than 1 the entry is taken from \c b, otherwise from \c b + \c c.
*/
template <typename Mask, typename T>
Vc_ALWAYS_INLINE enable_if<is_simd_mask<Mask>::value && is_simd_vector<T>::value, T> iif(
const Mask &condition, const T &trueValue, const T &falseValue)
{
T result(falseValue);
Vc::where(condition) | result = trueValue;
return result;
}
/**\internal
* The following declaration makes it explicit that `iif (Mask, non-vector, non-vector)`
* is not supposed to work. Doing the same thing with \c static_assert would break SFINAE.
*/
template <typename Mask, typename T>
enable_if<is_simd_mask<Mask>::value && !is_simd_vector<T>::value, T> iif(
const Mask &, const T &, const T &) = delete;
/**
* \ingroup Utilities
*
* Overload of the above for boolean conditions.
*
* This typically results in direct use of the ternary operator. This function makes it easier to
* switch from a Vc type to a builtin type.
*
 * \param condition Determines which value is returned. This is analogous to the first argument to
* the ternary operator.
* \param trueValue The value to return if \p condition is \c true.
* \param falseValue The value to return if \p condition is \c false.
* \return Either \p trueValue or \p falseValue, depending on \p condition.
*/
template<typename T> constexpr T iif (bool condition, const T &trueValue, const T &falseValue)
{
return condition ? trueValue : falseValue;
}
} // namespace Vc
#endif // VC_COMMON_IIF_H_

79
Vc/common/indexsequence.h Normal file
View File

@ -0,0 +1,79 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_INDEXSEQUENCE_H_
#define VC_COMMON_INDEXSEQUENCE_H_
#include "../global.h"
namespace Vc_VERSIONED_NAMESPACE
{
/** \internal
* Helper class for a sequence of size_t values from 0 to N. This type will be included in
* C++14.
*/
template <std::size_t... I> struct index_sequence
{
static constexpr std::size_t size() noexcept { return sizeof...(I); }
};
/** \internal
* This struct builds an index_sequence type from a given upper bound \p N.
 * It does so recursively via concatenation of two index sequences of length N/2.
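 *
 * A worked trace (illustration only): make_index_sequence<5> has is_odd = true and
 * half = index_sequence<0, 1>, so join<3>(std::true_type, index_sequence<0, 1>) yields
 * index_sequence<0, 1, 2, 3, 4>.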
*/
template <std::size_t N> struct make_index_sequence_impl {
template <std::size_t Offset, std::size_t... Ns>
static index_sequence<Ns..., (Ns + Offset)...> join(std::false_type,
index_sequence<Ns...>);
template <std::size_t Offset, std::size_t... Ns>
static index_sequence<Ns..., Offset - 1, (Ns + Offset)...> join(
std::true_type, index_sequence<Ns...>);
using is_odd = std::integral_constant<bool, N & 1>;
using half = typename make_index_sequence_impl<N / 2>::type;
using type = decltype(join<(N + 1) / 2>(is_odd(), half()));
};
template <> struct make_index_sequence_impl<0> {
using type = index_sequence<>;
};
template <> struct make_index_sequence_impl<1> {
using type = index_sequence<0>;
};
template <> struct make_index_sequence_impl<2> {
using type = index_sequence<0, 1>;
};
/** \internal
* Creates an index_sequence type for the upper bound \p N.
*/
template <std::size_t N>
using make_index_sequence = typename make_index_sequence_impl<N>::type;
}
#endif // VC_COMMON_INDEXSEQUENCE_H_
// vim: foldmethod=marker

63
Vc/common/interleave.h Normal file
View File

@ -0,0 +1,63 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_INTERLEAVE_H_
#define VC_COMMON_INTERLEAVE_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/** \ingroup Utilities
 Interleaves the entries from \p a and \p b into two vectors of the same type. The returned
 vectors contain the elements in the order `a[0], b[0], a[1], b[1], a[2], b[2], a[3],
 b[3], ...`.
Example:
\code
Vc::SimdArray<int, 4> a = { 1, 2, 3, 4 };
Vc::SimdArray<int, 4> b = { 9, 8, 7, 6 };
std::tie(a, b) = Vc::interleave(a, b);
std::cout << a << b;
// prints:
// <1 9 2 8><3 7 4 6>
\endcode
\param a input vector whose data will appear at even indexes in the output
\param b input vector whose data will appear at odd indexes in the output
\return two vectors with data from \p a and \p b interleaved
*/
template <typename V, typename = enable_if<Traits::is_simd_vector<V>::value>>
std::pair<V, V> interleave(const V &a, const V &b)
{
return {a.interleaveLow(b), a.interleaveHigh(b)};
}
} // namespace Vc
#endif // VC_COMMON_INTERLEAVE_H_
// vim: foldmethod=marker

View File

@ -0,0 +1,351 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_INTERLEAVEDMEMORY_H_
#define VC_COMMON_INTERLEAVEDMEMORY_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
/**
* \internal
*/
template<typename V, typename I, bool Readonly> struct InterleavedMemoryAccessBase
{
// Partial specialization doesn't work for functions without partial specialization of the whole
// class. Therefore we capture the contents of InterleavedMemoryAccessBase in a macro to easily
// copy it into its specializations.
typedef typename std::conditional<
Readonly, typename std::add_const<typename V::EntryType>::type,
typename V::EntryType>::type T;
typedef typename V::AsArg VArg;
typedef T Ta Vc_MAY_ALIAS;
const I m_indexes;
Ta *const m_data;
Vc_ALWAYS_INLINE InterleavedMemoryAccessBase(typename I::AsArg indexes, Ta *data)
: m_indexes(indexes), m_data(data)
{
}
// implementations of the following are in {scalar,sse,avx}/detail.h
template <typename... Vs> Vc_INTRINSIC void deinterleave(Vs &&... vs) const
{
Impl::deinterleave(m_data, m_indexes, std::forward<Vs>(vs)...);
}
protected:
using Impl = Vc::Detail::InterleaveImpl<V, V::Size, sizeof(V)>;
template <typename T, std::size_t... Indexes>
Vc_INTRINSIC void callInterleave(T &&a, index_sequence<Indexes...>)
{
Impl::interleave(m_data, m_indexes, a[Indexes]...);
}
};
/**
* \internal
*/
// delay execution of the deinterleaving gather until operator=
template <size_t StructSize, typename V, typename I = typename V::IndexType,
bool Readonly>
struct InterleavedMemoryReadAccess : public InterleavedMemoryAccessBase<V, I, Readonly>
{
typedef InterleavedMemoryAccessBase<V, I, Readonly> Base;
typedef typename Base::Ta Ta;
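    // The index vector is scaled by StructSize here (using shifts for the common
    // power-of-two struct sizes) so that each index addresses the first element of its
    // struct in the flat Ta array.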
Vc_ALWAYS_INLINE InterleavedMemoryReadAccess(Ta *data, typename I::AsArg indexes)
: Base(StructSize == 1u
? indexes
: StructSize == 2u
? indexes << 1
: StructSize == 4u
? indexes << 2
: StructSize == 8u
? indexes << 3
: StructSize == 16u ? indexes << 4
: indexes * I(int(StructSize)),
data)
{
}
template <typename T, std::size_t... Indexes>
Vc_ALWAYS_INLINE T deinterleave_unpack(index_sequence<Indexes...>) const
{
T r;
Base::Impl::deinterleave(this->m_data, this->m_indexes, std::get<Indexes>(r)...);
return r;
}
template <typename T,
typename = enable_if<(std::is_default_constructible<T>::value &&
std::is_same<V, Traits::decay<decltype(std::get<0>(
std::declval<T &>()))>>::value)>>
Vc_ALWAYS_INLINE operator T() const
{
return deinterleave_unpack<T>(make_index_sequence<std::tuple_size<T>::value>());
}
};
///\internal Runtime check (skipped if NDEBUG is defined) for asserting unique indexes.
template<typename I> struct CheckIndexesUnique
{
#ifdef NDEBUG
static Vc_INTRINSIC void test(const I &) {}
#else
static void test(const I &indexes)
{
const I test = indexes.sorted();
Vc_ASSERT(I::Size == 1 || (test == test.rotated(1)).isEmpty())
}
#endif
};
///\internal For SuccessiveEntries there can never be a problem.
template<size_t S> struct CheckIndexesUnique<SuccessiveEntries<S> >
{
static Vc_INTRINSIC void test(const SuccessiveEntries<S> &) {}
};
/**
* \internal
*/
template <size_t StructSize, typename V, typename I = typename V::IndexType>
struct InterleavedMemoryAccess : public InterleavedMemoryReadAccess<StructSize, V, I, false>
{
typedef InterleavedMemoryAccessBase<V, I, false> Base;
typedef typename Base::Ta Ta;
Vc_ALWAYS_INLINE InterleavedMemoryAccess(Ta *data, typename I::AsArg indexes)
: InterleavedMemoryReadAccess<StructSize, V, I, false>(data, indexes)
{
CheckIndexesUnique<I>::test(indexes);
}
template <int N> Vc_ALWAYS_INLINE void operator=(VectorReferenceArray<N, V> &&rhs)
{
static_assert(N <= StructSize,
"You_are_trying_to_scatter_more_data_into_the_struct_than_it_has");
this->callInterleave(std::move(rhs), make_index_sequence<N>());
}
template <int N> Vc_ALWAYS_INLINE void operator=(VectorReferenceArray<N, const V> &&rhs)
{
static_assert(N <= StructSize,
"You_are_trying_to_scatter_more_data_into_the_struct_than_it_has");
this->callInterleave(std::move(rhs), make_index_sequence<N>());
}
};
/**
* Wraps a pointer to memory with convenience functions to access it via vectors.
*
* \param S The type of the struct.
* \param V The type of the vector to be returned when read. This should reflect the type of the
* members inside the struct.
*
* \see operator[]
* \ingroup Containers
* \headerfile interleavedmemory.h <Vc/Memory>
*/
template<typename S, typename V> class InterleavedMemoryWrapper
{
typedef typename std::conditional<std::is_const<S>::value,
const typename V::EntryType,
typename V::EntryType>::type T;
typedef typename V::IndexType I;
typedef typename V::AsArg VArg;
typedef const I &IndexType;
static constexpr std::size_t StructSize = sizeof(S) / sizeof(T);
using ReadAccess = InterleavedMemoryReadAccess<StructSize, V>;
using Access =
typename std::conditional<std::is_const<T>::value, ReadAccess,
InterleavedMemoryAccess<StructSize, V>>::type;
using ReadSuccessiveEntries =
InterleavedMemoryReadAccess<StructSize, V, SuccessiveEntries<StructSize>>;
using AccessSuccessiveEntries = typename std::conditional<
std::is_const<T>::value, ReadSuccessiveEntries,
InterleavedMemoryAccess<StructSize, V, SuccessiveEntries<StructSize>>>::type;
typedef T Ta Vc_MAY_ALIAS;
Ta *const m_data;
static_assert(StructSize * sizeof(T) == sizeof(S),
"InterleavedMemoryAccess_does_not_support_packed_structs");
public:
/**
* Constructs the wrapper object.
*
* \param s A pointer to a C-array.
*/
Vc_ALWAYS_INLINE InterleavedMemoryWrapper(S *s)
: m_data(reinterpret_cast<Ta *>(s))
{
}
/**
* Interleaved scatter/gather access.
*
* Assuming you have a struct of floats and a vector of \p indexes into the array, this function
* can be used to access the struct entries as vectors using the minimal number of store or load
* instructions.
*
* \param indexes Vector of indexes that determine the gather locations.
*
* \return A special (magic) object that executes the loads and deinterleave on assignment to a
* vector tuple.
*
* Example:
* \code
* struct Foo {
* float x, y, z;
* };
*
* void fillWithBar(Foo *_data, uint_v indexes)
* {
* Vc::InterleavedMemoryWrapper<Foo, float_v> data(_data);
* const float_v x = bar(1);
* const float_v y = bar(2);
* const float_v z = bar(3);
* data[indexes] = (x, y, z);
* // it's also possible to just store a subset at the front of the struct:
* data[indexes] = (x, y);
* // if you want to store a single entry, use scatter:
* z.scatter(_data, &Foo::x, indexes);
* }
*
* float_v normalizeStuff(Foo *_data, uint_v indexes)
* {
* Vc::InterleavedMemoryWrapper<Foo, float_v> data(_data);
* float_v x, y, z;
* (x, y, z) = data[indexes];
* // it is also possible to just load a subset from the front of the struct:
* // (x, y) = data[indexes];
* return Vc::sqrt(x * x + y * y + z * z);
* }
* \endcode
*
* You may think of the gather operation (or scatter as the inverse) like this:
\verbatim
Memory: {x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 x5 y5 z5 x6 y6 z6 x7 y7 z7 x8 y8 z8}
indexes: [5, 0, 1, 7]
Result in (x, y, z): ({x5 x0 x1 x7}, {y5 y0 y1 y7}, {z5 z0 z1 z7})
\endverbatim
*
* \warning If \p indexes contains non-unique entries on scatter, the result is undefined. If
* \c NDEBUG is not defined the implementation will assert that the \p indexes entries are unique.
*/
template <typename IT>
Vc_ALWAYS_INLINE enable_if<!std::is_convertible<IT, size_t>::value &&
std::is_convertible<IT, IndexType>::value &&
!std::is_const<S>::value,
Access>
operator[](IT indexes)
{
return Access(m_data, indexes);
}
/// const overload (gathers only) of the above function
Vc_ALWAYS_INLINE ReadAccess operator[](IndexType indexes) const
{
return ReadAccess(m_data, indexes);
}
/// alias of the above function
Vc_ALWAYS_INLINE ReadAccess gather(IndexType indexes) const { return operator[](indexes); }
/**
* Interleaved access.
*
* This function is an optimization of the function above, for cases where the index vector
* contains consecutive values. It will load \p V::Size consecutive entries from memory and
* deinterleave them into Vc vectors.
*
 * \param first The first of \p V::Size indices to be accessed.
*
* \return A special (magic) object that executes the loads and deinterleave on assignment to a
* vector tuple.
*
* Example:
* \code
* struct Foo {
* float x, y, z;
* };
*
* void foo(Foo *_data)
* {
* Vc::InterleavedMemoryWrapper<Foo, float_v> data(_data);
* for (size_t i = 0; i < 32U; i += float_v::Size) {
* float_v x, y, z;
* (x, y, z) = data[i];
* // now:
* // x = { _data[i].x, _data[i + 1].x, _data[i + 2].x, ... }
* // y = { _data[i].y, _data[i + 1].y, _data[i + 2].y, ... }
* // z = { _data[i].z, _data[i + 1].z, _data[i + 2].z, ... }
* ...
* }
* }
* \endcode
*/
Vc_ALWAYS_INLINE ReadSuccessiveEntries operator[](size_t first) const
{
return ReadSuccessiveEntries(m_data, first);
}
Vc_ALWAYS_INLINE AccessSuccessiveEntries operator[](size_t first)
{
return AccessSuccessiveEntries(m_data, first);
}
//Vc_ALWAYS_INLINE Access scatter(I indexes, VArg v0, VArg v1);
};
} // namespace Common
using Common::InterleavedMemoryWrapper;
/**
 * Creates an adapter around a given array of structures (AoS) that enables optimized loads
* + deinterleaving operations / interleaving operations + stores for vector access (using
* \p V).
*
* \tparam V The `Vc::Vector<T>` type to use per element of the structure.
* \param s A pointer to an array of structures containing data members of type `T`.
*
* \see Vc::Common::InterleavedMemoryWrapper
*
* \todo Support destructuring via structured bindings.
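 *
 * A minimal sketch (struct and variable names are illustrative):
 * \code
 * struct Particle { float x, y, z; };
 * Particle particles[256];
 * auto wrapper = Vc::make_interleave_wrapper<Vc::float_v>(particles);
 * Vc::float_v x, y, z;
 * (x, y, z) = wrapper[0];  // deinterleaves the first float_v::Size Particle objects
 * \endcode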
*/
template <typename V, typename S>
inline Common::InterleavedMemoryWrapper<S, V> make_interleave_wrapper(S *s)
{
return Common::InterleavedMemoryWrapper<S, V>(s);
}
} // namespace Vc
#endif // VC_COMMON_INTERLEAVEDMEMORY_H_

282
Vc/common/iterators.h Normal file
View File

@ -0,0 +1,282 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ITERATORS_H_
#define VC_COMMON_ITERATORS_H_
#include <array>
#include <iterator>
#ifdef Vc_MSVC
#include <intrin.h> // for _BitScanForward
#endif // Vc_MSVC
#include "where.h"
#include "elementreference.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template<typename _V, typename Flags> class MemoryVector;
template<typename _V, typename Flags> class MemoryVectorIterator;
template <typename V> class Iterator;
template <typename V, bool> class IteratorBase;
template <typename V> class IteratorBase<V, true>
{
public:
using iterator_category = std::input_iterator_tag;
using value_type = typename V::value_type;
using difference_type = int;
using reference = value_type;
Vc_ALWAYS_INLINE reference operator*() const { return v()[i()]; }
Vc_ALWAYS_INLINE reference operator[](difference_type i2) const { return v()[i2]; }
private:
Vc_INTRINSIC V &v() const { return *static_cast<const Iterator<V> *>(this)->v; }
Vc_INTRINSIC difference_type i() const
{
return static_cast<const Iterator<V> *>(this)->i;
}
};
template <typename V> class IteratorBase<V, false>
{
public:
using iterator_category = std::input_iterator_tag;
using value_type = typename V::value_type;
using difference_type = int;
using reference = Vc::Detail::ElementReference<V, IteratorBase>;
Vc_ALWAYS_INLINE reference operator*() const { return {*v(), i()}; }
Vc_ALWAYS_INLINE reference operator[](difference_type i2) const { return {*v(), i2}; }
private:
Vc_INTRINSIC V *v() const { return static_cast<const Iterator<V> *>(this)->v; }
Vc_INTRINSIC difference_type i() const
{
return static_cast<const Iterator<V> *>(this)->i;
}
friend reference;
static Vc_INTRINSIC value_type get(const V &o, int i)
{
return o[i];
}
template <typename T> static Vc_INTRINSIC void set(V &o, int i, T &&v)
{
o[i] = std::forward<T>(v);
}
};
// class Iterator {{{
template <typename V> class Iterator : public IteratorBase<V, std::is_const<V>::value>
{
using Base = IteratorBase<V, std::is_const<V>::value>;
friend Base;
public:
using typename Base::iterator_category;
using typename Base::value_type;
using typename Base::difference_type;
using pointer = const Iterator *;
using typename Base::reference;
constexpr Iterator() = default;
constexpr Iterator(V &_v, difference_type _i) : v(&_v), i(_i) {}
// rely on implicit copy constructor/assignment
Vc_ALWAYS_INLINE pointer operator->() const { return this; }
using Base::operator*;
Vc_ALWAYS_INLINE Iterator &operator++() { ++i; return *this; }
Vc_ALWAYS_INLINE Iterator operator++(int) { Iterator tmp = *this; ++i; return tmp; }
// bidirectional iteration is supported
Vc_ALWAYS_INLINE Iterator &operator--() { --i; return *this; }
Vc_ALWAYS_INLINE Iterator operator--(int) { Iterator tmp = *this; --i; return tmp; }
// RandomAccessIterator:
using Base::operator[];
Vc_ALWAYS_INLINE Iterator &operator+=(difference_type d) { i += d; return *this; }
Vc_ALWAYS_INLINE Iterator &operator-=(difference_type d) { i -= d; return *this; }
Vc_ALWAYS_INLINE Iterator operator+(difference_type d) const { return {*v, i + d}; }
Vc_ALWAYS_INLINE Iterator operator-(difference_type d) const { return {*v, i - d}; }
Vc_ALWAYS_INLINE difference_type operator-(const Iterator &rhs) const { return i - rhs.i; }
friend Vc_ALWAYS_INLINE Iterator operator+(difference_type d, const Iterator &rhs)
{
return {*rhs.v, rhs.i + d};
}
// InputIterator would not need to test v == rhs.v, but except for `reference` this
// class implements a complete RandomAccessIterator
Vc_ALWAYS_INLINE bool operator==(const Iterator<V> &rhs) const { return v == rhs.v && i == rhs.i; }
Vc_ALWAYS_INLINE bool operator!=(const Iterator<V> &rhs) const { return v == rhs.v && i != rhs.i; }
Vc_ALWAYS_INLINE bool operator< (const Iterator<V> &rhs) const { return v == rhs.v && i < rhs.i; }
Vc_ALWAYS_INLINE bool operator<=(const Iterator<V> &rhs) const { return v == rhs.v && i <= rhs.i; }
Vc_ALWAYS_INLINE bool operator> (const Iterator<V> &rhs) const { return v == rhs.v && i > rhs.i; }
Vc_ALWAYS_INLINE bool operator>=(const Iterator<V> &rhs) const { return v == rhs.v && i >= rhs.i; }
private:
V *v = nullptr;
difference_type i = 0;
};/*}}}*/
template <typename V> using ConstIterator = Iterator<const V>;
class BitmaskIterator/*{{{*/
{
#ifdef Vc_MSVC
unsigned long mask;
unsigned long bit;
#else
size_t mask;
size_t bit;
#endif
void nextBit()
{
#ifdef Vc_GNU_ASM
bit = __builtin_ctzl(mask);
#elif defined(Vc_MSVC)
_BitScanForward(&bit, mask);
#else
#error "Not implemented yet. Please contact vc-devel@compeng.uni-frankfurt.de"
#endif
}
void resetLsb()
{
// 01100100 - 1 = 01100011
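        // ANDing the two clears exactly the lowest set bit: 01100100 & 01100011 = 01100000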
mask &= (mask - 1);
/*
#ifdef Vc_GNU_ASM
__asm__("btr %1,%0" : "+r"(mask) : "r"(bit));
#elif defined(_WIN64)
_bittestandreset64(&mask, bit);
#elif defined(_WIN32)
_bittestandreset(&mask, bit);
#else
#error "Not implemented yet. Please contact vc-devel@compeng.uni-frankfurt.de"
#endif
*/
}
public:
BitmaskIterator(decltype(mask) m) : mask(m) { nextBit(); }
BitmaskIterator(const BitmaskIterator &) = default;
BitmaskIterator(BitmaskIterator &&) = default;
Vc_ALWAYS_INLINE size_t operator->() const { return bit; }
Vc_ALWAYS_INLINE size_t operator*() const { return bit; }
Vc_ALWAYS_INLINE BitmaskIterator &operator++() { resetLsb(); nextBit(); return *this; }
Vc_ALWAYS_INLINE BitmaskIterator operator++(int) { BitmaskIterator tmp = *this; resetLsb(); nextBit(); return tmp; }
Vc_ALWAYS_INLINE bool operator==(const BitmaskIterator &rhs) const { return mask == rhs.mask; }
Vc_ALWAYS_INLINE bool operator!=(const BitmaskIterator &rhs) const { return mask != rhs.mask; }
};/*}}}*/
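// The begin/end/cbegin/cend overloads below are found via ADL (through the using
// declarations at the end of this file), so Vc vectors and masks can be used in
// range-based for loops. A minimal sketch (names are illustrative):
//   float_v v = float_v::IndexesFromZero();
//   for (float entry : v) { /* visits v[0], v[1], ... */ }
// Iterating a where-expression instead visits the indexes of the set mask bits.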
template <typename T>
Vc_ALWAYS_INLINE
enable_if<Traits::is_simd_vector<T>::value || Traits::is_simd_mask<T>::value,
Iterator<typename std::remove_reference<T>::type>>
begin(T &&x)
{
return {std::forward<T>(x), 0};
}
template <typename T>
Vc_ALWAYS_INLINE
enable_if<Traits::is_simd_vector<T>::value || Traits::is_simd_mask<T>::value,
Iterator<typename std::remove_reference<T>::type>>
end(T &&x)
{
using TT = typename std::decay<T>::type;
return {std::forward<T>(x), int(TT::size())};
}
template <typename T>
Vc_ALWAYS_INLINE enable_if<
Traits::is_simd_mask<T>::value || Traits::is_simd_vector<T>::value, ConstIterator<T>>
cbegin(const T &v)
{
return {v, 0};
}
template <typename T>
Vc_ALWAYS_INLINE enable_if<
Traits::is_simd_mask<T>::value || Traits::is_simd_vector<T>::value, ConstIterator<T>>
cend(const T &v)
{
return {v, int(T::size())};
}
template<typename M> Vc_ALWAYS_INLINE BitmaskIterator begin(const WhereImpl::WhereMask<M> &w)
{
return w.mask.toInt();
}
template<typename M> Vc_ALWAYS_INLINE BitmaskIterator end(const WhereImpl::WhereMask<M> &)
{
return 0;
}
template<typename V, typename Flags, typename T> Vc_ALWAYS_INLINE MemoryVectorIterator<V, Flags>
makeIterator(T *mem, Flags)
{
return new(mem) MemoryVector<V, Flags>;
}
template<typename V, typename Flags, typename T> Vc_ALWAYS_INLINE MemoryVectorIterator<const V, Flags>
makeIterator(const T *mem, Flags)
{
return new(const_cast<T *>(mem)) MemoryVector<const V, Flags>;
}
template<typename V, typename Flags, typename FlagsX> Vc_ALWAYS_INLINE MemoryVectorIterator<V, Flags>
makeIterator(MemoryVector<V, FlagsX> &mv, Flags)
{
return new(&mv) MemoryVector<V, Flags>;
}
template<typename V, typename Flags, typename FlagsX> Vc_ALWAYS_INLINE MemoryVectorIterator<const V, Flags>
makeIterator(MemoryVector<const V, FlagsX> &mv, Flags)
{
return new(&mv) MemoryVector<const V, Flags>;
}
} // namespace Common
using Common::begin;
using Common::end;
using Common::cbegin;
using Common::cend;
using Common::makeIterator;
} // namespace Vc
#endif // VC_COMMON_ITERATORS_H_
// vim: foldmethod=marker

105
Vc/common/loadinterface.h Normal file
View File

@ -0,0 +1,105 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
// load ctors{{{1
/**
* Construct a vector from loading its entries from the array at \p mem.
*
 * \param mem A pointer to data. The pointer does not have to be aligned on a
 * MemoryAlignment boundary unless you add the Vc::Aligned flag as a second
 * argument.
*/
explicit Vc_INTRINSIC Vector(const EntryType *mem)
{
load(mem);
}
/**
* Construct a vector from loading its entries from the array at \p mem.
*
* \param mem A pointer to data. If \p flags contains the Vc::Aligned flag, the pointer
* must be aligned on a MemoryAlignment boundary.
* \param flags A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming,
* Vc::Unaligned, and/or Vc::PrefetchDefault.
*/
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
{
load(mem, flags);
}
template <typename U, typename Flags = DefaultLoadTag,
typename = enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
{
load<U, Flags>(x, flags);
}
// load member functions{{{1
/**
* Load the vector entries from \p mem, overwriting the previous values.
*
* \param mem
 * A pointer to data. The pointer does not have to be aligned on a MemoryAlignment boundary unless
* you add the Vc::Aligned flag as a second argument.
*/
Vc_INTRINSIC void load(const EntryType *mem)
{
load(mem, DefaultLoadTag());
}
/**
* Load the vector entries from \p mem, overwriting the previous values.
*
* \param mem
* A pointer to data. If \p flags contains the Vc::Aligned flag, the pointer must be
* aligned on a MemoryAlignment boundary.
* \param flags
* A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming, Vc::Unaligned,
* and/or Vc::PrefetchDefault.
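 *
 * A short sketch (the buffer is illustrative; it assumes the \c MemoryAlignment and
 * \c size() members provided by this vector type):
 * \code
 * alignas(float_v::MemoryAlignment) float buf[float_v::size()] = {};
 * float_v v;
 * v.load(buf, Vc::Aligned);  // aligned load, overwriting all entries of v
 * \endcode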
*/
template <typename Flags>
Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
load(const EntryType *mem, Flags flags)
{
load<EntryType, Flags>(mem, flags);
}
private:
template <typename U, typename Flags>
struct load_concept : public std::enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
{};
public:
template <typename U, typename Flags = DefaultLoadTag>
Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
//}}}1
// vim: foldmethod=marker

243
Vc/common/loadstoreflags.h Normal file
View File

@ -0,0 +1,243 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_LOADSTOREFLAGS_H_
#define VC_COMMON_LOADSTOREFLAGS_H_
#include "../traits/type_traits.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* Hint for \ref Prefetch to select prefetches that mark the memory as exclusive.
*
* This hint may optimize the prefetch if the memory will subsequently be written to.
*/
struct Exclusive {};
/**
* Hint for \ref Prefetch to select prefetches that mark the memory as shared.
*/
struct Shared {};
namespace LoadStoreFlags
{
struct StreamingFlag {};
struct UnalignedFlag {};
struct PrefetchFlagBase {};
// TODO: determine a good default for typical CPU use
template <size_t L1 = 16 * 64, size_t L2 = 128 * 64, typename ExclusiveOrShared_ = void>
struct PrefetchFlag : public PrefetchFlagBase {
typedef ExclusiveOrShared_ ExclusiveOrShared;
static constexpr size_t L1Stride = L1;
static constexpr size_t L2Stride = L2;
static constexpr bool IsExclusive = std::is_same<ExclusiveOrShared, Exclusive>::value;
static constexpr bool IsShared = std::is_same<ExclusiveOrShared, Shared>::value;
};
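// ExtractType walks the flag parameter pack and yields the first type derived from Base;
// if no flag matches it falls back to Default.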
template<typename Base, typename Default, typename... LoadStoreFlags> struct ExtractType
{
typedef Default type;
};
template<typename Base, typename Default, typename T, typename... LoadStoreFlags> struct ExtractType<Base, Default, T, LoadStoreFlags...>
{
typedef typename std::conditional<std::is_base_of<Base, T>::value, T, typename ExtractType<Base, Default, LoadStoreFlags...>::type>::type type;
};
// ICC warns about the constexpr members in LoadStoreFlags: member "LoadStoreFlags<Flags...>::IsAligned" was declared but never referenced
// who needs that warning, especially if it was referenced...
// The warning cannot be reenabled because it gets emitted whenever the LoadStoreFlags is instantiated
// somewhere, so it could be anywhere.
#ifdef Vc_ICC
#pragma warning(disable: 177)
#endif
/**\internal
* Implementation of the load/store flags mechanism. This is internal API. Only some
* concrete aliases are API-relevant types.
*/
template<typename... Flags> struct LoadStoreFlags
{
private:
// ICC doesn't grok this line:
//template<typename Test> using TestFlag = std::is_same<typename ExtractType<StreamingFlag, void, Flags...>::type, void>;
typedef typename ExtractType<PrefetchFlagBase, PrefetchFlag<0, 0>, Flags...>::type Prefetch;
public:
constexpr LoadStoreFlags() {}
static constexpr bool IsStreaming = !std::is_same<typename ExtractType<StreamingFlag, void, Flags...>::type, void>::value;
static constexpr bool IsUnaligned = !std::is_same<typename ExtractType<UnalignedFlag, void, Flags...>::type, void>::value;
static constexpr bool IsAligned = !IsUnaligned;
static constexpr bool IsPrefetch = !std::is_same<typename ExtractType<PrefetchFlagBase, void, Flags...>::type, void>::value;
static constexpr bool IsExclusivePrefetch = Prefetch::IsExclusive;
static constexpr bool IsSharedPrefetch = Prefetch::IsShared;
static constexpr size_t L1Stride = Prefetch::L1Stride;
static constexpr size_t L2Stride = Prefetch::L2Stride;
typedef LoadStoreFlags<typename std::conditional<std::is_same<Flags, UnalignedFlag>::value, void, Flags>::type...> UnalignedRemoved;
// The following EnableIf* convenience types cannot use enable_if, because then no
// LoadStoreFlags type could ever be instantiated. Instead, these types are defined as
// either void* or void. A function that wants SFINAE then declares a defaulted
// parameter "= nullptr" of this type; the ones that are plain void thus result in a
// substitution failure (see the sketch after this class).
typedef typename std::conditional<IsAligned && !IsStreaming, void *, void>::type EnableIfAligned;
typedef typename std::conditional<IsAligned && IsStreaming, void *, void>::type EnableIfStreaming;
typedef typename std::conditional<IsUnaligned && !IsStreaming, void *, void>::type EnableIfUnalignedNotStreaming;
typedef typename std::conditional<IsUnaligned && IsStreaming, void *, void>::type EnableIfUnalignedAndStreaming;
typedef typename std::conditional<IsUnaligned , void *, void>::type EnableIfUnaligned;
typedef typename std::conditional<!IsUnaligned , void *, void>::type EnableIfNotUnaligned;
typedef typename std::conditional<IsPrefetch , void *, void>::type EnableIfPrefetch;
typedef typename std::conditional<!IsPrefetch , void *, void>::type EnableIfNotPrefetch;
};
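// Illustrative sketch of how the EnableIf* members above are meant to be used. The two
// overloads below are hypothetical, not declarations from this header: a defaulted
// pointer parameter of the respective member type selects the overload, because
// "= nullptr" is only well-formed when the member is void* (it is a substitution
// failure when the member is plain void).
//
//   template <typename Flags>
//   void load(const float *mem, Flags, typename Flags::EnableIfAligned = nullptr);
//   template <typename Flags>
//   void load(const float *mem, Flags, typename Flags::EnableIfUnaligned = nullptr);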
/**\internal
 * Specialization for no flags (i.e. aligned, non-streaming, no prefetching).
*/
template<> struct LoadStoreFlags<>
{
constexpr LoadStoreFlags() {}
static constexpr bool IsStreaming = false;
static constexpr bool IsUnaligned = false;
static constexpr bool IsAligned = !IsUnaligned;
static constexpr bool IsPrefetch = false;
static constexpr bool IsExclusivePrefetch = false;
static constexpr bool IsSharedPrefetch = false;
static constexpr size_t L1Stride = 0;
static constexpr size_t L2Stride = 0;
typedef void* EnableIfAligned;
typedef void* EnableIfNotUnaligned;
typedef void* EnableIfNotPrefetch;
};
/**
* Operator for concatenation of LoadStoreFlags.
*
* Example:
* \code
* float_v x(mem, Vc::Aligned | Vc::Streaming);
* \endcode
*/
template<typename... LFlags, typename... RFlags>
constexpr LoadStoreFlags<LFlags..., RFlags...> operator|(LoadStoreFlags<LFlags...>, LoadStoreFlags<RFlags...>)
{
return LoadStoreFlags<LFlags..., RFlags...>();
}
} // LoadStoreFlags namespace
using LoadStoreFlags::PrefetchFlag;
typedef LoadStoreFlags::LoadStoreFlags<> AlignedTag;
typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::StreamingFlag> StreamingTag;
typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::UnalignedFlag> UnalignedTag;
/// The default load tag type uses unaligned (non-streaming) loads.
typedef UnalignedTag DefaultLoadTag;
/// The default store tag type uses unaligned (non-streaming) stores.
typedef UnalignedTag DefaultStoreTag;
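// Illustrative note (assumption, not stated in this header): loads and stores that omit
// the flags argument typically behave as if DefaultLoadTag / DefaultStoreTag had been
// passed, e.g.
//   float_v a(mem);            // hypothetical call, same as float_v a(mem, Vc::Unaligned)
//   a.store(mem, Vc::Aligned); // opt in to the aligned variant explicitly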
/**\addtogroup Utilities
* @{
*/
/**
* Use this object for a \p flags parameter to request aligned loads and stores.
*
 * It specifies that a load/store can expect a memory address that is aligned on
 * the correct boundary (i.e. \p MemoryAlignment).
 *
 * \warning
 * If you specify Aligned, but the memory address is not aligned, the program
 * will most likely crash.
*/
constexpr AlignedTag Aligned;
/**
* Use this object for a \p flags parameter to request unaligned loads and stores.
*
 * It specifies that a load/store can \em not expect a memory address that is
 * aligned on the correct boundary (i.e. the alignment may be less than
 * \p MemoryAlignment).
 *
 * \note
 * If you specify Unaligned, but the memory address is aligned, the load/store
 * will execute slightly slower than necessary.
*/
constexpr UnalignedTag Unaligned;
/**
* Use this object for a \p flags parameter to request streaming loads and stores.
*
* It specifies that the cache should be bypassed for the given load/store.
* Whether this will actually be done depends on the target system's capabilities.
*
* Streaming stores can be interesting when the code calculates values that, after being
* written to memory, will not be used for a long time or used by a different thread.
*
* \note
* Expect that most target systems do not support unaligned streaming loads or stores.
* Therefore, make sure that you also specify Aligned.
*/
constexpr StreamingTag Streaming;
/**
* Use this object for a \p flags parameter to request default software prefetches to be
* emitted.
*/
constexpr LoadStoreFlags::LoadStoreFlags<PrefetchFlag<>> PrefetchDefault;
///@}
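// Minimal usage sketch (assumed API shape, mirroring the operator| example above):
//   float_v x(mem, Vc::Aligned | Vc::PrefetchDefault);
// requests an aligned load plus software prefetches with the default L1/L2 strides
// from PrefetchFlag<>.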
/**
 * Load/store flag type that requests software prefetches with user-defined strides.
 *
 * \tparam L1 The prefetch stride for the L1 cache (defaults to PrefetchFlag<>::L1Stride).
 * \tparam L2 The prefetch stride for the L2 cache (defaults to PrefetchFlag<>::L2Stride).
 * \tparam ExclusiveOrShared Either Vc::Exclusive, Vc::Shared, or void for no hint
 * (defaults to PrefetchFlag<>::ExclusiveOrShared).
 */
template <size_t L1 = PrefetchFlag<>::L1Stride,
size_t L2 = PrefetchFlag<>::L2Stride,
typename ExclusiveOrShared = PrefetchFlag<>::ExclusiveOrShared>
struct Prefetch : public LoadStoreFlags::LoadStoreFlags<PrefetchFlag<L1, L2, ExclusiveOrShared>>
{
};
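// Illustrative sketch: Prefetch combines with the other tag objects just like
// PrefetchDefault; the stride values below are chosen for illustration only and are
// assumed to be given in Bytes:
//   float_v x(mem, Vc::Aligned | Vc::Prefetch<256, 1024>());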
namespace Traits
{
///\internal partial specialization for detecting LoadStoreFlags types
template <typename... Ts>
struct is_loadstoreflag_internal<LoadStoreFlags::LoadStoreFlags<Ts...>> : public std::true_type
{
};
///\internal partial specialization for detecting the derived Prefetch type as a
/// load/store flag.
template <size_t L1, size_t L2, typename ExclusiveOrShared>
struct is_loadstoreflag_internal<Prefetch<L1, L2, ExclusiveOrShared>> : public std::true_type
{
};
} // namespace Traits
} // namespace Vc
#endif // VC_COMMON_LOADSTOREFLAGS_H_

276
Vc/common/logarithm.h Normal file
View File

@ -0,0 +1,276 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
/* The log implementations are based on code from Julien Pommier which carries the following
copyright information:
*/
/*
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2007 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#ifdef Vc_COMMON_MATH_H_INTERNAL
enum LogarithmBase {
BaseE, Base10, Base2
};
namespace Detail
{
template <typename T, typename Abi>
using Const = typename std::conditional<std::is_same<Abi, VectorAbi::Avx>::value,
AVX::Const<T>, SSE::Const<T>>::type;
template<LogarithmBase Base>
struct LogImpl
{
template<typename T, typename Abi> static Vc_ALWAYS_INLINE void log_series(Vector<T, Abi> &Vc_RESTRICT x, typename Vector<T, Abi>::AsArg exponent) {
typedef Vector<T, Abi> V;
typedef Detail::Const<T, Abi> C;
// Taylor series around x = 2^exponent
// f(x) = ln(x) → exponent * ln(2) → C::ln2_small + C::ln2_large
// f'(x) = x⁻¹ → x → 1
// f''(x) = - x⁻² → -x² / 2 → C::_1_2()
// = 2!x⁻³ → x³ / 3 → C::P(8)
// = -3!x⁻⁴ → -x⁴ / 4 → C::P(7)
// = 4!x⁻⁵ → x⁵ / 5 → C::P(6)
// ...
// The high order coefficients are adjusted to reduce the error that occurs from omission
// of higher order terms.
// P(0) is the smallest term and |x| < 1 ⇒ |xⁿ| > |xⁿ⁺¹|
// The order of additions must go from smallest to largest terms
const V x2 = x * x; // 0 → 4
#ifdef Vc_LOG_ILP
V y2 = (C::P(6) * /*4 → 8*/ x2 + /* 8 → 11*/ C::P(7) * /*1 → 5*/ x) + /*11 → 14*/ C::P(8);
V y0 = (C::P(0) * /*5 → 9*/ x2 + /* 9 → 12*/ C::P(1) * /*2 → 6*/ x) + /*12 → 15*/ C::P(2);
V y1 = (C::P(3) * /*6 → 10*/ x2 + /*10 → 13*/ C::P(4) * /*3 → 7*/ x) + /*13 → 16*/ C::P(5);
const V x3 = x2 * x; // 7 → 11
const V x6 = x3 * x3; // 11 → 15
const V x9 = x6 * x3; // 15 → 19
V y = (y0 * /*19 → 23*/ x9 + /*23 → 26*/ y1 * /*16 → 20*/ x6) + /*26 → 29*/ y2 * /*14 → 18*/ x3;
#elif defined Vc_LOG_ILP2
/*
* name start done
* movaps %xmm0, %xmm1 ; x 0 1
* movaps %xmm0, %xmm2 ; x 0 1
* mulps %xmm1, %xmm1 ; x2 1 5 *xmm1
* movaps <P8>, %xmm15 ; y8 1 2
* mulps %xmm1, %xmm2 ; x3 5 9 *xmm2
* movaps %xmm1, %xmm3 ; x2 5 6
* movaps %xmm1, %xmm4 ; x2 5 6
* mulps %xmm3, %xmm3 ; x4 6 10 *xmm3
* movaps %xmm2, %xmm5 ; x3 9 10
* movaps %xmm2, %xmm6 ; x3 9 10
* mulps %xmm2, %xmm4 ; x5 9 13 *xmm4
* movaps %xmm3, %xmm7 ; x4 10 11
* movaps %xmm3, %xmm8 ; x4 10 11
* movaps %xmm3, %xmm9 ; x4 10 11
* mulps %xmm5, %xmm5 ; x6 10 14 *xmm5
* mulps %xmm3, %xmm6 ; x7 11 15 *xmm6
* mulps %xmm7, %xmm7 ; x8 12 16 *xmm7
* movaps %xmm4, %xmm10 ; x5 13 14
* mulps %xmm4, %xmm8 ; x9 13 17 *xmm8
* mulps %xmm5, %xmm10 ; x11 14 18 *xmm10
* mulps %xmm5, %xmm9 ; x10 15 19 *xmm9
* mulps <P0>, %xmm10 ; y0 18 22
* mulps <P1>, %xmm9 ; y1 19 23
* mulps <P2>, %xmm8 ; y2 20 24
* mulps <P3>, %xmm7 ; y3 21 25
* addps %xmm10, %xmm9 ; y 23 26
* addps %xmm9, %xmm8 ; y 26 29
* addps %xmm8, %xmm7 ; y 29 32
*/
const V x3 = x2 * x; // 4 → 8
const V x4 = x2 * x2; // 5 → 9
const V x5 = x2 * x3; // 8 → 12
const V x6 = x3 * x3; // 9 → 13
const V x7 = x4 * x3; //
const V x8 = x4 * x4;
const V x9 = x5 * x4;
const V x10 = x5 * x5;
const V x11 = x5 * x6; // 13 → 17
V y = C::P(0) * x11 + C::P(1) * x10 + C::P(2) * x9 + C::P(3) * x8 + C::P(4) * x7
+ C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3;
#else
V y = C::P(0);
Vc::Common::unrolled_loop<int, 1, 9>([&](int i) { y = y * x + C::P(i); });
y *= x * x2;
#endif
switch (Base) {
case BaseE:
// ln(2) is split into two parts to increase precision (i.e. ln2_small + ln2_large = ln(2))
y += exponent * C::ln2_small();
y -= x2 * C::_1_2(); // [0, 0.25[
x += y;
x += exponent * C::ln2_large();
break;
case Base10:
y += exponent * C::ln2_small();
y -= x2 * C::_1_2(); // [0, 0.25[
x += y;
x += exponent * C::ln2_large();
x *= C::log10_e();
break;
case Base2:
{
const V x_ = x;
x *= C::log2_e();
y *= C::log2_e();
y -= x_ * x * C::_1_2(); // [0, 0.25[
x += y;
x += exponent;
break;
}
}
}
template <typename Abi>
static Vc_ALWAYS_INLINE void log_series(Vector<double, Abi> &Vc_RESTRICT x,
typename Vector<double, Abi>::AsArg exponent)
{
typedef Vector<double, Abi> V;
typedef Detail::Const<double, Abi> C;
const V x2 = x * x;
V y = C::P(0);
V y2 = C::Q(0) + x;
Vc::Common::unrolled_loop<int, 1, 5>([&](int i) {
y = y * x + C::P(i);
y2 = y2 * x + C::Q(i);
});
y2 = x / y2;
y = y * x + C::P(5);
y = x2 * y * y2;
// TODO: refactor the following with the float implementation:
switch (Base) {
case BaseE:
// ln(2) is split into two parts to increase precision (i.e. ln2_small + ln2_large = ln(2))
y += exponent * C::ln2_small();
y -= x2 * C::_1_2(); // [0, 0.25[
x += y;
x += exponent * C::ln2_large();
break;
case Base10:
y += exponent * C::ln2_small();
y -= x2 * C::_1_2(); // [0, 0.25[
x += y;
x += exponent * C::ln2_large();
x *= C::log10_e();
break;
case Base2:
{
const V x_ = x;
x *= C::log2_e();
y *= C::log2_e();
y -= x_ * x * C::_1_2(); // [0, 0.25[
x += y;
x += exponent;
break;
}
}
}
template <typename T, typename Abi, typename V = Vector<T, Abi>>
static inline Vector<T, Abi> calc(V _x)
{
typedef typename V::Mask M;
typedef Detail::Const<T, Abi> C;
V x(_x);
const M invalidMask = x < V::Zero();
const M infinityMask = x == V::Zero();
const M denormal = x <= C::min();
x(denormal) *= V(Vc::Detail::doubleConstant<1, 0, 54>()); // 2⁵⁴
V exponent = Detail::exponent(x.data()); // = ⎣log₂(x)⎦
exponent(denormal) -= 54;
x.setZero(C::exponentMask()); // keep only the fractional part ⇒ x ∈ [1, 2[
x = Detail::operator|(x,
C::_1_2()); // and set the exponent to 2⁻¹ ⇒ x ∈ [½, 1[
// split calculation in two cases:
// A: x ∈ [½, √½[
// B: x ∈ [√½, 1[
// √½ defines the point where Δe(x) := log₂(x) - ⎣log₂(x)⎦ = ½, i.e.
// log₂(√½) - ⎣log₂(√½)⎦ = ½ * -1 - ⎣½ * -1⎦ = -½ + 1 = ½
const M smallX = x < C::_1_sqrt2();
x(smallX) += x; // => x ∈ [√½, 1[ [1.5, 1 + √½[
x -= V::One(); // => x ∈ [√½ - 1, 0[ [0.5, √½[
exponent(!smallX) += V::One();
log_series(x, exponent); // A: (ˣ⁄₂ᵉ - 1, e) B: (ˣ⁄₂ᵉ⁺¹ - 1, e + 1)
x.setQnan(invalidMask); // x < 0 → NaN
x(infinityMask) = C::neginf(); // x = 0 → -∞
return x;
}
};
} // namespace Detail
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log(
const Vector<T, Abi> &x)
{
return Detail::LogImpl<BaseE>::calc<T, Abi>(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log10(
const Vector<T, Abi> &x)
{
return Detail::LogImpl<Base10>::calc<T, Abi>(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log2(
const Vector<T, Abi> &x)
{
return Detail::LogImpl<Base2>::calc<T, Abi>(x);
}
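// Usage sketch (illustrative, not part of this header): the three overloads apply the
// logarithm component-wise, e.g.
//   Vc::float_v y = Vc::log(x);    // natural logarithm of every component of x
//   Vc::float_v z = Vc::log10(x);  // base-10 logarithm, component-wise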
#endif // Vc_COMMON_MATH_H_INTERNAL

318
Vc/common/macros.h Normal file
View File

@ -0,0 +1,318 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MACROS_H_
#define VC_COMMON_MACROS_H_
#include "../global.h"
#ifdef Vc_MSVC
#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \
typedef __declspec(align(n_)) type_ new_type_
#elif __GNUC__
#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \
typedef type_ new_type_[[gnu::aligned(n_)]]
#else // the following is actually ill-formed according to C++1[14]
#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \
using new_type_ alignas(sizeof(n_)) = type_
#endif
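// Usage sketch (hypothetical type name): every branch above is meant to make
//   Vc_ALIGNED_TYPEDEF(16, float, aligned_float);
// declare a type aligned_float that is a float with a 16-Byte alignment requirement.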
// On Windows (WIN32) we might see macros called min and max. Just undefine them and hope
// no one (re)defines them (NOMINMAX should help).
#ifdef WIN32
#define NOMINMAX 1
#if defined min
#undef min
#endif
#if defined max
#undef max
#endif
#endif // WIN32
#if defined Vc_GCC && Vc_GCC >= 0x60000
// GCC 6 drops all attributes on types passed as template arguments. This is important
// if a may_alias attribute gets lost and therefore needs to be re-added in the
// implementation of the class template.
#define Vc_TEMPLATES_DROP_ATTRIBUTES 1
#endif
#if defined Vc_CLANG || defined Vc_APPLECLANG
# define Vc_UNREACHABLE __builtin_unreachable
# define Vc_NEVER_INLINE [[gnu::noinline]]
# define Vc_INTRINSIC_L inline
# define Vc_INTRINSIC_R __attribute__((always_inline))
# define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
# define Vc_FLATTEN
# define Vc_CONST __attribute__((const))
# define Vc_CONST_L
# define Vc_CONST_R Vc_CONST
# define Vc_PURE __attribute__((pure))
# define Vc_PURE_L
# define Vc_PURE_R Vc_PURE
# define Vc_MAY_ALIAS __attribute__((may_alias))
# define Vc_ALWAYS_INLINE_L inline
# define Vc_ALWAYS_INLINE_R __attribute__((always_inline))
# define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
# define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
# define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
# define Vc_RESTRICT __restrict__
# define Vc_DEPRECATED(msg)
# define Vc_DEPRECATED_ALIAS(msg)
# define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#elif defined(__GNUC__)
# define Vc_UNREACHABLE __builtin_unreachable
# if defined Vc_GCC && !defined __OPTIMIZE__
# define Vc_MAY_ALIAS
# else
# define Vc_MAY_ALIAS __attribute__((__may_alias__))
# endif
# define Vc_INTRINSIC_R __attribute__((__always_inline__, __artificial__))
# define Vc_INTRINSIC_L inline
# define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
# define Vc_FLATTEN __attribute__((__flatten__))
# define Vc_ALWAYS_INLINE_L inline
# define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__))
# define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
# ifdef Vc_ICC
// ICC miscompiles if there are functions marked as pure or const
# define Vc_PURE
# define Vc_CONST
# define Vc_NEVER_INLINE
# else
# define Vc_NEVER_INLINE [[gnu::noinline]]
# define Vc_PURE __attribute__((__pure__))
# define Vc_CONST __attribute__((__const__))
# endif
# define Vc_CONST_L
# define Vc_CONST_R Vc_CONST
# define Vc_PURE_L
# define Vc_PURE_R Vc_PURE
# define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
# define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
# define Vc_RESTRICT __restrict__
# ifdef Vc_ICC
# define Vc_DEPRECATED(msg)
# define Vc_DEPRECATED_ALIAS(msg)
# else
# define Vc_DEPRECATED(msg) __attribute__((__deprecated__(msg)))
# define Vc_DEPRECATED_ALIAS(msg) __attribute__((__deprecated__(msg)))
# endif
# define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#else
# define Vc_NEVER_INLINE
# define Vc_FLATTEN
# ifdef Vc_PURE
# undef Vc_PURE
# endif
# define Vc_MAY_ALIAS
# ifdef Vc_MSVC
# define Vc_ALWAYS_INLINE inline __forceinline
# define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE
# define Vc_ALWAYS_INLINE_R
# define Vc_CONST __declspec(noalias)
# define Vc_CONST_L Vc_CONST
# define Vc_CONST_R
# define Vc_PURE /*Vc_CONST*/
# define Vc_PURE_L Vc_PURE
# define Vc_PURE_R
# define Vc_INTRINSIC inline __forceinline
# define Vc_INTRINSIC_L Vc_INTRINSIC
# define Vc_INTRINSIC_R
namespace Vc_VERSIONED_NAMESPACE {
namespace detail
{
static Vc_INTRINSIC void unreachable() { __assume(0); }
} // namespace detail
}
# define Vc_UNREACHABLE Vc::detail::unreachable
# else
# define Vc_ALWAYS_INLINE
# define Vc_ALWAYS_INLINE_L
# define Vc_ALWAYS_INLINE_R
# define Vc_CONST
# define Vc_CONST_L
# define Vc_CONST_R
# define Vc_PURE
# define Vc_PURE_L
# define Vc_PURE_R
# define Vc_INTRINSIC
# define Vc_INTRINSIC_L
# define Vc_INTRINSIC_R
# define Vc_UNREACHABLE std::abort
# endif
# define Vc_IS_UNLIKELY(x) x
# define Vc_IS_LIKELY(x) x
# define Vc_RESTRICT __restrict
# define Vc_DEPRECATED(msg) __declspec(deprecated(msg))
# define Vc_DEPRECATED_ALIAS(msg)
# define Vc_WARN_UNUSED_RESULT
#endif
#ifdef Vc_CXX14
#undef Vc_DEPRECATED
#define Vc_DEPRECATED(msg_) [[deprecated(msg_)]]
#endif
#define Vc_NOTHING_EXPECTING_SEMICOLON static_assert(true, "")
#define Vc_FREE_STORE_OPERATORS_ALIGNED(align_) \
/**\name new/delete overloads for correct alignment */ \
/**@{*/ \
/*!\brief Allocates correctly aligned memory */ \
Vc_ALWAYS_INLINE void *operator new(size_t size) \
{ \
return Vc::Common::aligned_malloc<align_>(size); \
} \
/*!\brief Returns \p p. */ \
Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \
/*!\brief Allocates correctly aligned memory */ \
Vc_ALWAYS_INLINE void *operator new[](size_t size) \
{ \
return Vc::Common::aligned_malloc<align_>(size); \
} \
/*!\brief Returns \p p. */ \
Vc_ALWAYS_INLINE void *operator new[](size_t, void *p) { return p; } \
/*!\brief Frees aligned memory. */ \
Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { Vc::Common::free(ptr); } \
/*!\brief Does nothing. */ \
Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \
/*!\brief Frees aligned memory. */ \
Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) \
{ \
Vc::Common::free(ptr); \
} \
/*!\brief Does nothing. */ \
Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} \
/**@}*/ \
Vc_NOTHING_EXPECTING_SEMICOLON
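// Illustrative use (hypothetical class): placing the macro inside a class body gives the
// class new/delete overloads that allocate via Vc::Common::aligned_malloc and release via
// Vc::Common::free:
//   class alignas(32) MyData {
//       // ... members that require 32-Byte alignment ...
//       Vc_FREE_STORE_OPERATORS_ALIGNED(32);
//   };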
#ifdef Vc_ASSERT
#define Vc_EXTERNAL_ASSERT 1
#else
#ifdef NDEBUG
#define Vc_ASSERT(x)
#else
#include <assert.h>
#define Vc_ASSERT(x) assert(x);
#endif
#endif
#if defined Vc_CLANG || defined Vc_APPLECLANG
#define Vc_HAS_BUILTIN(x) __has_builtin(x)
#else
#define Vc_HAS_BUILTIN(x) 0
#endif
#define Vc_CAT_HELPER_(a, b, c, d) a##b##c##d
#define Vc_CAT(a, b, c, d) Vc_CAT_HELPER_(a, b, c, d)
#define Vc_CAT_IMPL(a, b) a##b
#define Vc_CAT2(a, b) Vc_CAT_IMPL(a, b)
#define Vc_APPLY_IMPL_1_(macro, a, b, c, d, e) macro(a)
#define Vc_APPLY_IMPL_2_(macro, a, b, c, d, e) macro(a, b)
#define Vc_APPLY_IMPL_3_(macro, a, b, c, d, e) macro(a, b, c)
#define Vc_APPLY_IMPL_4_(macro, a, b, c, d, e) macro(a, b, c, d)
#define Vc_APPLY_IMPL_5_(macro, a, b, c, d, e) macro(a, b, c, d, e)
#define Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \
size(macro, double_v, a, b, c, d) \
size(macro, float_v, a, b, c, d)
#define Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) \
size(macro, int_v, a, b, c, d) \
size(macro, uint_v, a, b, c, d) \
size(macro, short_v, a, b, c, d) \
size(macro, ushort_v, a, b, c, d)
#define Vc_LIST_VECTOR_TYPES(size, macro, a, b, c, d) \
Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \
Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d)
#define Vc_LIST_COMPARES(size, macro, a, b, c, d) \
size(macro, ==, a, b, c, d) \
size(macro, !=, a, b, c, d) \
size(macro, <=, a, b, c, d) \
size(macro, >=, a, b, c, d) \
size(macro, < , a, b, c, d) \
size(macro, > , a, b, c, d)
#define Vc_LIST_LOGICAL(size, macro, a, b, c, d) \
size(macro, &&, a, b, c, d) \
size(macro, ||, a, b, c, d)
#define Vc_LIST_BINARY(size, macro, a, b, c, d) \
size(macro, |, a, b, c, d) \
size(macro, &, a, b, c, d) \
size(macro, ^, a, b, c, d)
#define Vc_LIST_SHIFTS(size, macro, a, b, c, d) \
size(macro, <<, a, b, c, d) \
size(macro, >>, a, b, c, d)
#define Vc_LIST_ARITHMETICS(size, macro, a, b, c, d) \
size(macro, +, a, b, c, d) \
size(macro, -, a, b, c, d) \
size(macro, *, a, b, c, d) \
size(macro, /, a, b, c, d) \
size(macro, %, a, b, c, d)
#define Vc_APPLY_0(_list, macro) _list(Vc_APPLY_IMPL_1_, macro, 0, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_1(_list, macro, a) _list(Vc_APPLY_IMPL_2_, macro, a, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_2(_list, macro, a, b) _list(Vc_APPLY_IMPL_3_, macro, a, b, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_3(_list, macro, a, b, c) _list(Vc_APPLY_IMPL_4_, macro, a, b, c, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_4(_list, macro, a, b, c, d) _list(Vc_APPLY_IMPL_5_, macro, a, b, c, d) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_ALL_COMPARES(macro) Vc_APPLY_0(Vc_LIST_COMPARES, macro)
#define Vc_ALL_LOGICAL(macro) Vc_APPLY_0(Vc_LIST_LOGICAL, macro)
#define Vc_ALL_BINARY(macro) Vc_APPLY_0(Vc_LIST_BINARY, macro)
#define Vc_ALL_SHIFTS(macro) Vc_APPLY_0(Vc_LIST_SHIFTS, macro)
#define Vc_ALL_ARITHMETICS(macro) Vc_APPLY_0(Vc_LIST_ARITHMETICS, macro)
#define Vc_ALL_FLOAT_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_FLOAT_VECTOR_TYPES, macro)
#define Vc_ALL_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_VECTOR_TYPES, macro)
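// Expansion sketch (illustrative): the Vc_ALL_* helpers invoke the given macro once per
// list entry. With a hypothetical macro Vc_OP,
//   Vc_ALL_ARITHMETICS(Vc_OP);
// expands (apart from the trailing static_assert) to
//   Vc_OP(+) Vc_OP(-) Vc_OP(*) Vc_OP(/) Vc_OP(%)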
#define Vc_EXACT_TYPE(_test, _reference, _type) \
typename std::enable_if<std::is_same<_test, _reference>::value, _type>::type
#define Vc_make_unique(name) Vc_CAT(Vc_,name,_,__LINE__)
#if defined(Vc_NO_NOEXCEPT)
#define Vc_NOEXCEPT throw()
#else
#define Vc_NOEXCEPT noexcept
#endif
#ifdef Vc_NO_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE_L
#undef Vc_ALWAYS_INLINE_R
#define Vc_ALWAYS_INLINE inline
#define Vc_ALWAYS_INLINE_L inline
#define Vc_ALWAYS_INLINE_R
#undef Vc_INTRINSIC
#undef Vc_INTRINSIC_L
#undef Vc_INTRINSIC_R
#define Vc_INTRINSIC inline
#define Vc_INTRINSIC_L inline
#define Vc_INTRINSIC_R
#endif
#endif // VC_COMMON_MACROS_H_

150
Vc/common/makeContainer.h Normal file
View File

@ -0,0 +1,150 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MAKECONTAINER_H_
#define VC_COMMON_MAKECONTAINER_H_
#include "../vector.h"
#include <initializer_list>
namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
template<typename Container, typename T> struct make_container_helper
{
static constexpr Container help(std::initializer_list<T> list) { return { list }; }
};
template <typename T_, typename Abi, typename Alloc,
template <class, class> class Container>
struct make_container_helper<Container<Vector<T_, Abi>, Alloc>,
typename Vector<T_, Abi>::EntryType> {
typedef Vector<T_, Abi> V;
typedef typename V::EntryType T;
typedef Container<V, Alloc> C;
static inline C help(std::initializer_list<T> list) {
const std::size_t size = (list.size() + (V::Size - 1)) / V::Size;
C v(size);
auto containerIt = v.begin();
auto init = std::begin(list);
const auto initEnd = std::end(list);
for (std::size_t i = 0; i < size - 1; ++i) {
*containerIt++ = V(init, Vc::Unaligned);
init += V::Size;
}
Vc_ASSERT(all_of(*containerIt == V::Zero()));
int j = 0;
while (init != initEnd) {
(*containerIt)[j++] = *init++;
}
return v;
}
};
template <typename T_, typename Abi, std::size_t N,
template <class, std::size_t> class Container>
struct make_container_helper<Container<Vector<T_, Abi>, N>,
typename Vector<T_, Abi>::EntryType> {
typedef Vector<T_, Abi> V;
typedef typename V::EntryType T;
static constexpr std::size_t size = (N + (V::Size - 1)) / V::Size;
typedef Container<
V,
#if defined Vc_CLANG && Vc_CLANG < 0x30700 // TODO: when did Vc_APPLECLANG fix it?
// clang before 3.7.0 has a bug when returning std::array<__m256x, 1>. So
// increase it to std::array<__m256x, 2> and fill it with zeros. Better
// than returning garbage.
(size == 1 && std::is_same<Abi, VectorAbi::Avx>::value) ? 2 :
#endif
size> C;
static inline C help(std::initializer_list<T> list) {
Vc_ASSERT(N == list.size())
Vc_ASSERT(size == (list.size() + (V::Size - 1)) / V::Size)
C v;
auto containerIt = v.begin();
auto init = std::begin(list);
const auto initEnd = std::end(list);
for (std::size_t i = 0; i < size - 1; ++i) {
*containerIt++ = V(init, Vc::Unaligned);
init += V::Size;
}
Vc_ASSERT(all_of(*containerIt == V::Zero()));
int j = 0;
while (init != initEnd) {
(*containerIt)[j++] = *init++;
}
return v;
}
};
} // anonymous namespace
/**
* \ingroup Containers
* \headerfile makeContainer.h <Vc/Utils>
*
* Construct a container of Vc vectors from a std::initializer_list of scalar entries.
*
* \tparam Container The container type to construct.
* \tparam T The scalar type to use for the initializer_list.
*
* \param list An initializer list of arbitrary size. The type of the entries is important!
* If you pass a list of integers you will get a container filled with Vc::int_v objects.
 * If, instead, you want a container of Vc::float_v objects, be sure to include a
 * period (.) and the 'f' suffix in the literals. Alternatively, you can pass the
 * type as the second template argument to makeContainer.
*
* \return Returns a container of the requested class filled with the minimum number of SIMD
* vectors to hold the values in the initializer list.
* If the number of values in \p list does not match the number of values in the
* returned container object, the remaining values in the returned object will be
* zero-initialized.
*
* Example:
* \code
* auto data = Vc::makeContainer<std::vector<float_v>>({ 1.f, 2.f, 3.f, 4.f, 5.f });
* // data.size() == 5 if float_v::Size == 1 (i.e. Vc_IMPL=Scalar)
* // data.size() == 2 if float_v::Size == 4 (i.e. Vc_IMPL=SSE)
* // data.size() == 1 if float_v::Size == 8 (i.e. Vc_IMPL=AVX)
* \endcode
*/
template<typename Container, typename T>
constexpr auto makeContainer(std::initializer_list<T> list) -> decltype(make_container_helper<Container, T>::help(list))
{
return make_container_helper<Container, T>::help(list);
}
template<typename Container, typename T>
constexpr auto make_container(std::initializer_list<T> list) -> decltype(makeContainer<Container, T>(list))
{
return makeContainer<Container, T>(list);
}
} // namespace Vc
#endif // VC_COMMON_MAKECONTAINER_H_

56
Vc/common/make_unique.h Normal file
View File

@ -0,0 +1,56 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MAKE_UNIQUE_H_
#define VC_COMMON_MAKE_UNIQUE_H_
#include <memory>
#include "malloc.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template<typename T> struct Deleter
{
Vc_ALWAYS_INLINE void operator()(T *ptr) {
ptr->~T();
Vc::free(ptr);
}
};
template<class T, MallocAlignment A = Vc::AlignOnVector, class... Args>
inline std::unique_ptr<T, Deleter<T>> make_unique(Args&&... args)
{
return std::unique_ptr<T, Deleter<T>>(new(Vc::malloc<T, A>(1)) T(std::forward<Args>(args)...));
}
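// Usage sketch (illustrative): make_unique allocates suitably aligned storage through
// Vc::malloc, constructs the object in place, and the Deleter destroys it and calls
// Vc::free when the unique_ptr goes out of scope:
//   auto p = Vc::Common::make_unique<Vc::float_v>(Vc::float_v::One());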
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_MAKE_UNIQUE_H_

169
Vc/common/malloc.h Normal file
View File

@ -0,0 +1,169 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MALLOC_H_
#define VC_COMMON_MALLOC_H_
#ifndef Vc_VECTOR_DECLARED_
#error "Incorrect inclusion order. This header must be included from Vc/vector.h only."
#endif
#if defined _WIN32 || defined _WIN64
#include <malloc.h>
#else
#include <cstdlib>
#endif
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <size_t X> static constexpr size_t nextMultipleOf(size_t value)
{
return (value % X) > 0 ? value + X - (value % X) : value;
}
template <std::size_t alignment> Vc_INTRINSIC void *aligned_malloc(std::size_t n)
{
#ifdef __MIC__
return _mm_malloc(nextMultipleOf<alignment>(n), alignment);
#elif defined(_WIN32)
# ifdef __GNUC__
return __mingw_aligned_malloc(nextMultipleOf<alignment>(n), alignment);
# else
return _aligned_malloc(nextMultipleOf<alignment>(n), alignment);
# endif
#else
void *ptr = nullptr;
if (0 == posix_memalign(&ptr, alignment < sizeof(void *) ? sizeof(void *) : alignment,
nextMultipleOf<alignment>(n))) {
return ptr;
}
return ptr;
#endif
}
template <Vc::MallocAlignment A> Vc_ALWAYS_INLINE void *malloc(size_t n)
{
switch (A) {
case Vc::AlignOnVector:
return aligned_malloc<Vc::VectorAlignment>(n);
case Vc::AlignOnCacheline:
// TODO: hardcoding 64 is not such a great idea
return aligned_malloc<64>(n);
case Vc::AlignOnPage:
// TODO: hardcoding 4096 is not such a great idea
return aligned_malloc<4096>(n);
}
return nullptr;
}
Vc_ALWAYS_INLINE void free(void *p)
{
#ifdef __MIC__
_mm_free(p);
#elif defined(_WIN32)
# ifdef __GNUC__
return __mingw_aligned_free(p);
# else
return _aligned_free(p);
# endif
#else
std::free(p);
#endif
}
} // namespace Common
/**
* Allocates memory on the Heap with alignment and padding suitable for vectorized access.
*
* Memory that was allocated with this function must be released with Vc::free! Other methods might
* work but are not portable.
*
* \param n Specifies the number of objects the allocated memory must be able to store.
* \tparam T The type of the allocated memory. Note, that the constructor is not called.
* \tparam A Determines the alignment of the memory. See \ref Vc::MallocAlignment.
*
* \return Pointer to memory of the requested type, or 0 on error. The allocated memory is padded at
* the end to be a multiple of the requested alignment \p A. Thus if you request memory for 21
* int objects, aligned via Vc::AlignOnCacheline, you can safely read a full cacheline until the
* end of the array, without generating an out-of-bounds access. For a cacheline size of 64 Bytes
* and an int size of 4 Bytes you would thus get an array of 128 Bytes to work with.
*
* \warning
 * \li The standard malloc function specifies the number of Bytes to allocate whereas this
 * function specifies the number of values, thus differing by a factor of sizeof(T).
 * \li This function is mainly meant for use with builtin types. If you use a custom
 * type with a sizeof that is not a multiple of 2, the results might not be what you expect.
* \li The constructor of T is not called. You can make up for this:
* \code
* SomeType *array = new(Vc::malloc<SomeType, Vc::AlignOnCacheline>(N)) SomeType[N];
* \endcode
*
* \see Vc::free
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
template<typename T, Vc::MallocAlignment A>
Vc_ALWAYS_INLINE T *malloc(size_t n)
{
return static_cast<T *>(Common::malloc<A>(n * sizeof(T)));
}
/**
* Frees memory that was allocated with Vc::malloc.
*
* \param p The pointer to the memory to be freed.
*
* \tparam T The type of the allocated memory.
*
* \warning The destructor of T is not called. If needed, you can call the destructor before calling
* free:
* \code
* for (int i = 0; i < N; ++i) {
* p[i].~T();
* }
* Vc::free(p);
* \endcode
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*
* \see Vc::malloc
*/
template<typename T>
Vc_ALWAYS_INLINE void free(T *p)
{
Common::free(p);
}
} // namespace Vc
#endif // VC_COMMON_MALLOC_H_

435
Vc/common/mask.h Normal file
View File

@ -0,0 +1,435 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MASK_H_
#define VC_COMMON_MASK_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \class Mask mask.h <Vc/vector.h>
* \ingroup Masks
*
* The main SIMD mask class.
*/
template <typename T, typename Abi = VectorAbi::Best<T>> class Mask
{
public:
/**
* Returns the number of boolean components (\VSize{T}) in a mask of this type.
*
 * The size of the mask, i.e. the number of boolean entries in the mask. Do not
 * make any assumptions about the size of masks.
*
* In addition, you can easily use if clauses that compare sizes. The compiler can
* statically evaluate and fully optimize dead code away (very much like \#ifdef, but
* with syntax checking).
*
* \returns The number of components (i.e. \VSize{T}) objects of this mask type store
* and manipulate.
*/
static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
///\copydoc size
///\deprecated Use Vc::Mask::size instead.
static constexpr size_t Size = VectorTraits<T, Abi>::size();
/**
* Specifies the alignment requirement for aligned load and store calls for objects of
* this mask type.
*/
static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::maskMemoryAlignment();
/// The ABI tag type of the current template instantiation.
using abi = Abi;
/**
* The \c EntryType of masks is always \c bool, independent of \c T.
*/
using EntryType = bool;
/// \copydoc EntryType
using value_type = EntryType;
/// The reference wrapper type used for accessing individual mask components.
using EntryReference = typename VectorTraits<T, Abi>::EntryReference;
/// \copydoc EntryReference
using value_reference = EntryReference;
/**
* The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD
* implementation.
* This type is useful for the \c sizeof operator in generic functions.
*/
using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
/**\internal
* The \c VectorType reveals the implementation-specific internal type used for the SIMD type.
*/
using VectorType = typename VectorTraits<T, Abi>::VectorType;
/**\internal
* \copydoc VectorType
*/
using vector_type = VectorType;
/*
* The associated Vector<T> type.
*/
//using Vector = Vector<T, Abi>;
/// \name Generators
///@{
/**
* Creates a new mask object initialized to zero/\c false.
*
* \returns A mask object with zero-initialized components.
*/
Vc_INTRINSIC static Mask Zero();
/**
* Creates a mask object initialized to one/\c true.
*
* \returns A mask object with components initialized to \c true.
*/
Vc_INTRINSIC static Mask One();
/// Generate a mask object from booleans returned from the function \p gen.
template <typename G> static Vc_INTRINSIC Mask generate(G &&gen);
///@}
/// \name Compile-Time Constant Initialization
///@{
/**
* Construct a zero-initialized vector object.
*
* This constructor follows the behavior of the underlying \c bool type in that the
* expression `bool()` zero-initializes the object (to \c false). On the other hand
* the variable \c x in `bool x;` is uninitialized.
 * Since, for class types, both expressions call the default constructor, `Mask<T> x`
 * must zero-initialize \c x as well.
*/
Vc_INTRINSIC Mask() = default;
/// Zero-initialize the new mask object (\c false).
/// \see Vc::Zero, Zero()
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero);
/// Initialize the new mask object to one (\c true).
/// \see Vc::One, One()
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne);
///@}
/// \name Conversion/Broadcast Constructors
///@{
/**
* Broadcast constructor.
*
* Set all components of the new mask object to \p b.
*
* \param b Determines the initial state of the mask.
*/
Vc_INTRINSIC explicit Mask(bool b);
/**
* Implicit conversion from a compatible (equal \VSize{T} on every platform) mask
* object.
*
* \param otherMask The mask to be converted.
*/
template <typename U>
Vc_INTRINSIC Mask(U &&otherMask,
Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg);
#if Vc_IS_VERSION_1
/**
* Explicit conversion (static_cast) from a mask object that potentially has a
* different \VSize{T}.
*
* \param otherMask The mask to be converted.
*
* \internal This is implemented via simd_cast in scalar/simd_cast_caller.h
*/
template <typename U>
Vc_DEPRECATED(
"use simd_cast instead of explicit type casting to convert between mask types")
Vc_INTRINSIC_L
explicit Mask(U &&otherMask, Common::enable_if_mask_converts_explicitly<T, U> =
nullarg) Vc_INTRINSIC_R;
///@}
#endif
/**
* \name Loads & Stores
*/
///@{
/**
* Load constructor from an array of \c bool.
*
* This constructor implements an explicit conversion from an array of booleans to a
* mask object. It corresponds to a Vector load constructor.
*
* \param mem A pointer to the start of the array of booleans.
* \see Mask(const bool *, Flags), load(const bool *)
*/
Vc_ALWAYS_INLINE explicit Mask(const bool *mem);
/**
* Overload of the above with a load/store flag argument.
*
* \param mem A pointer to the start of the array of booleans.
* \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming,
* Vc::Unaligned, Vc::PrefetchDefault, ...
* \see load(const bool *, Flags)
*/
template <typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags flags);
/**
* Load the components of the mask from an array of \c bool.
*
* \param mem A pointer to the start of the array of booleans.
* \see load(const bool *, Flags), Mask(const bool *)
*/
Vc_ALWAYS_INLINE void load(const bool *mem);
/**
* Overload of the above with a load/store flag argument.
*
* \param mem A pointer to the start of the array of booleans.
* \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming,
* Vc::Unaligned, Vc::PrefetchDefault, ...
* \see Mask(const bool *, Flags)
*/
template <typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags flags);
/**
* Store the values of the mask to an array of \c bool.
*
* \param mem A pointer to the start of the array of booleans.
* \see store(bool *, Flags)
*/
Vc_ALWAYS_INLINE void store(bool *mem) const;
/**
* Overload of the above with a load/store flag argument.
*
* \param mem A pointer to the start of the array of booleans.
* \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming,
* Vc::Unaligned, Vc::PrefetchDefault, ...
*/
template <typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags flags) const;
///@}
/// \name Comparison Operators
///@{
/**
* Returns whether the two masks are equal in all components.
*
* \param mask The other mask to compare against.
* \returns A scalar boolean value that says whether all components of the two masks
* are equal.
*
 * \note If you expected a behavior similar to the compare operator of Vc::Vector,
 * consider that the bitwise operators already implement such functionality. There is
 * little use, typically, in having `a == b` return the same as `a ^ b`. In general,
 * it is more useful to query `none_of(a ^ b)`, which is the same as this equality
 * operator.
*/
Vc_ALWAYS_INLINE bool operator==(const Mask &mask) const;
/**
* Returns whether the two masks are different in at least one component.
*
* \param mask The other mask to compare against.
* \returns A scalar boolean value that says whether at least one component of the two masks is different.
*
* \note `(a == b) == !(a != b)` holds
* \see Mask::operator==(const Mask &)
*/
Vc_ALWAYS_INLINE bool operator!=(const Mask &mask) const;
///@}
/**
* \name Logical and Binary Operators
*
* \brief Component-wise logical/binary operations on mask objects.
*
* The effect of logical and binary \c AND and \c OR is equivalent for mask types (as
* it is for \c bool).
*/
///@{
/// Returns the component-wise application of a logical \c AND to \p mask.
Vc_ALWAYS_INLINE Mask operator&&(const Mask &mask) const;
/// Returns the component-wise application of a binary \c AND to \p mask.
Vc_ALWAYS_INLINE Mask operator&(const Mask &mask) const;
/// Returns the component-wise application of a logical \c OR to \p mask.
Vc_ALWAYS_INLINE Mask operator||(const Mask &mask) const;
/// Returns the component-wise application of a binary \c OR to \p mask.
Vc_ALWAYS_INLINE Mask operator|(const Mask &mask) const;
/// Returns the component-wise application of a binary \c XOR to \p mask.
Vc_ALWAYS_INLINE Mask operator^(const Mask &mask) const;
/// Returns a mask with inverted components.
Vc_ALWAYS_INLINE Mask operator!() const;
/// Modifies the mask using an \c AND operation with \p mask.
Vc_ALWAYS_INLINE Mask &operator&=(const Mask &mask);
/// Modifies the mask using an \c OR operation with \p mask.
Vc_ALWAYS_INLINE Mask &operator|=(const Mask &mask);
/// Modifies the mask using an \c XOR operation with \p mask.
Vc_ALWAYS_INLINE Mask &operator^=(const Mask &mask);
///@}
/**
* \name Reductions
*
* \see any_of, all_of, none_of, some_of
*/
///@{
/// Returns a logical \c AND of all components.
Vc_ALWAYS_INLINE bool isFull() const;
/// Returns a logical \c OR of all components.
Vc_ALWAYS_INLINE bool isNotEmpty() const;
/// Returns \c true if all components are \c false, \c false otherwise.
Vc_ALWAYS_INLINE bool isEmpty() const;
/// Returns `!isFull() && !isEmpty()`.
Vc_ALWAYS_INLINE bool isMix() const;
///@}
/**\internal
* \name Internal Data Access
*/
///@{
Vc_ALWAYS_INLINE bool data() const;
Vc_ALWAYS_INLINE bool dataI() const;
Vc_ALWAYS_INLINE bool dataD() const;
///@}
/// \name Scalar Subscript Operators
///@{
/**
* Lvalue-reference-like access to mask entries.
*
* \param index Determines the boolean to be accessed.
* \return a temporary proxy object referencing the \p index th entry of the mask.
*
* \warning This operator does not return an lvalue reference (to \c bool), but rather
* a temporary (rvalue) object that mimics an lvalue reference (as much as is possible
* with C++11/14).
*/
Vc_ALWAYS_INLINE EntryReference operator[](size_t index);
/**
* Read-only access to mask entries.
*
* \param index Determines the boolean to be accessed.
* \return The \p index th entry of the mask as a \c bool (rvalue).
*
* \warning This operator does not return an lvalue reference (to `const bool`), but
* rather a temporary (rvalue) \c bool.
*/
Vc_ALWAYS_INLINE EntryType operator[](size_t index) const;
///@}
/// Returns how many components of the mask are \c true.
Vc_ALWAYS_INLINE int count() const;
/**
* Returns the index of the first one in the mask.
*
* \returns the index of the first component that is \c true.
*
* \warning The return value is undefined if the mask is empty.
*
* Thus, unless `none_of(mask)`, `mask[mask.firstOne()] == true` holds and `mask[i] ==
* false` for all `i < mask.firstOne()`.
*/
Vc_ALWAYS_INLINE int firstOne() const;
/**
* Convert the boolean components of the mask into bits of an integer.
*
* \return An \c int where each bit corresponds to the boolean value in the mask.
*
* For example, the mask `[true, false, false, true]` results in a `9` (in binary: `1001`).
*/
Vc_ALWAYS_INLINE int toInt() const;
/// Returns a mask with components shifted by \p amount places.
Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));
private:
VectorType d;
};
/**
* \ingroup Utilities
*
* \name Boolean Reductions
*/
//@{
/** \ingroup Utilities
* Returns whether all entries in the mask \p m are \c true.
*/
template<typename Mask> constexpr bool all_of(const Mask &m) { return m.isFull(); }
/** \ingroup Utilities
* Returns \p b
*/
constexpr bool all_of(bool b) { return b; }
/** \ingroup Utilities
* Returns whether at least one entry in the mask \p m is \c true.
*/
template<typename Mask> constexpr bool any_of(const Mask &m) { return m.isNotEmpty(); }
/** \ingroup Utilities
* Returns \p b
*/
constexpr bool any_of(bool b) { return b; }
/** \ingroup Utilities
* Returns whether all entries in the mask \p m are \c false.
*/
template<typename Mask> constexpr bool none_of(const Mask &m) { return m.isEmpty(); }
/** \ingroup Utilities
* Returns \p !b
*/
constexpr bool none_of(bool b) { return !b; }
/** \ingroup Utilities
* Returns whether at least one entry in \p m is \c true and at least one entry in \p m is \c
* false.
*/
template<typename Mask> constexpr bool some_of(const Mask &m) { return m.isMix(); }
/** \ingroup Utilities
* Returns \c false
*/
constexpr bool some_of(bool) { return false; }
//@}
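// Usage sketch (illustrative): the reductions accept any mask type as well as plain bool,
// which keeps generic code independent of the SIMD width, e.g.
//   if (Vc::any_of(x < 0.f)) { /* at least one component of x is negative */ }
//   if (Vc::all_of(m1 && m2)) { /* every component satisfies both conditions */ }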
} // namespace Vc
#endif // VC_COMMON_MASK_H_
// vim: foldmethod=marker

98
Vc/common/maskbool.h Normal file
View File

@ -0,0 +1,98 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MASKENTRY_H_
#define VC_COMMON_MASKENTRY_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
namespace
{
template<size_t Bytes> struct MaskBoolStorage;
// the following four typedefs must use std::intN_t and NOT Vc::intN_t; the latter
// segfaults ICC 15.0.3.
template<> struct MaskBoolStorage<1> { typedef std::int8_t type; };
template<> struct MaskBoolStorage<2> { typedef std::int16_t type; };
template<> struct MaskBoolStorage<4> { typedef std::int32_t type; };
template<> struct MaskBoolStorage<8> { typedef std::int64_t type; };
} // anonymous namespace
template<size_t Bytes> class MaskBool
{
typedef typename MaskBoolStorage<Bytes>::type storage_type Vc_MAY_ALIAS;
storage_type data;
public:
constexpr MaskBool(bool x) noexcept : data(x ? -1 : 0) {}
Vc_ALWAYS_INLINE MaskBool &operator=(bool x) noexcept { data = x ? -1 : 0; return *this; }
template <typename T, typename = enable_if<(!std::is_same<T, bool>::value &&
std::is_fundamental<T>::value)>>
Vc_ALWAYS_INLINE MaskBool &operator=(T x) noexcept
{
data = reinterpret_cast<const storage_type &>(x);
return *this;
}
Vc_ALWAYS_INLINE MaskBool(const MaskBool &) noexcept = default;
Vc_ALWAYS_INLINE MaskBool &operator=(const MaskBool &) noexcept = default;
template <typename T, typename = enable_if<(std::is_same<T, bool>::value ||
(std::is_fundamental<T>::value &&
sizeof(storage_type) == sizeof(T)))>>
constexpr operator T() const noexcept
{
return std::is_same<T, bool>::value ? T((data & 1) != 0) : aliasing_cast<T>(data);
}
} Vc_MAY_ALIAS;
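// Behavior sketch (illustrative values): MaskBool stores booleans in the all-bits-set /
// all-bits-clear encoding used by SIMD compare results, e.g.
//   Common::MaskBool<4> b(true);
//   bool x = b;          // true
//   std::int32_t i = b;  // -1, i.e. all 32 bits set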
template <typename A,
typename B,
typename std::enable_if<
std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
int>::type = 0>
constexpr bool operator==(A &&a, B &&b)
{
return static_cast<bool>(a) == static_cast<bool>(b);
}
template <typename A,
typename B,
typename std::enable_if<
std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
int>::type = 0>
constexpr bool operator!=(A &&a, B &&b)
{
return static_cast<bool>(a) != static_cast<bool>(b);
}
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_MASKENTRY_H_

142
Vc/common/math.h Normal file
View File

@ -0,0 +1,142 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MATH_H_
#define VC_COMMON_MATH_H_
#define Vc_COMMON_MATH_H_INTERNAL 1
#include "trigonometric.h"
#include "const.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
// TODO, not vectorized:
template <class T, class Abi>
SimdArray<int, Vector<T, Abi>::size()> fpclassify(const Vector<T, Abi> &x)
{
return SimdArray<int, Vector<T, Abi>::size()>(
[&](std::size_t i) { return std::fpclassify(x[i]); });
}
template <class T, size_t N> SimdArray<int, N> fpclassify(const SimdArray<T, N> &x)
{
return SimdArray<int, N>([&](std::size_t i) { return std::fpclassify(x[i]); });
}
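// Usage sketch for the scalar fallback above (FP_NORMAL comes from <cmath>):
//   Vc::float_v v(1.f);
//   auto classes = Vc::fpclassify(v);  // every entry compares equal to FP_NORMAL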
#ifdef Vc_IMPL_SSE
// for SSE, AVX, and AVX2
#include "logarithm.h"
#include "exponential.h"
#ifdef Vc_IMPL_AVX
inline AVX::double_v exp(AVX::double_v _x)
{
AVX::Vector<double> x = _x;
typedef AVX::Vector<double> V;
typedef V::Mask M;
typedef AVX::Const<double> C;
const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); // max log
const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); // min log
V px = floor(C::log2_e() * x + 0.5);
__m128i tmp = _mm256_cvttpd_epi32(px.data());
const SimdArray<int, V::Size> n = SSE::int_v{tmp};
x -= px * C::ln2_large(); //Vc::Detail::doubleConstant<1, 0x00062e4000000000ull, -1>(); // ln2
x -= px * C::ln2_small(); //Vc::Detail::doubleConstant<1, 0x0007f7d1cf79abcaull, -20>(); // ln2
const double P[] = {
Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
};
const double Q[] = {
Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
};
const V x2 = x * x;
px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
x = V::One() + 2.0 * x;
x = ldexp(x, n); // == x * 2ⁿ
x(overflow) = std::numeric_limits<double>::infinity();
x.setZero(underflow);
return x;
}
#endif // Vc_IMPL_AVX
inline SSE::double_v exp(SSE::double_v::AsArg _x) {
SSE::Vector<double> x = _x;
typedef SSE::Vector<double> V;
typedef V::Mask M;
typedef SSE::Const<double> C;
const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); // max log
const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); // min log
V px = floor(C::log2_e() * x + 0.5);
SimdArray<int, V::Size> n;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&n), _mm_cvttpd_epi32(px.data()));
x -= px * C::ln2_large(); //Vc::Detail::doubleConstant<1, 0x00062e4000000000ull, -1>(); // ln2
x -= px * C::ln2_small(); //Vc::Detail::doubleConstant<1, 0x0007f7d1cf79abcaull, -20>(); // ln2
const double P[] = {
Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
};
const double Q[] = {
Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
};
const V x2 = x * x;
px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
x = V::One() + 2.0 * x;
x = ldexp(x, n); // == x * 2ⁿ
x(overflow) = std::numeric_limits<double>::infinity();
x.setZero(underflow);
return x;
}
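// Both exp() overloads above follow the classic Cephes-style scheme: split x = n*ln2 + r
// with |r| <= ln2/2, approximate e^r by the rational expression 1 + 2*p/(q - p) where
// p = r*P(r*r) and q = Q(r*r), then reconstruct e^x = ldexp(e^r, n).
// Rough worked example (values approximate): x = 1.0 gives n = 1, r = 0.307,
// e^r = 1.359, and ldexp(1.359, 1) = 2.718, i.e. e.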
#endif
} // namespace Vc
#undef Vc_COMMON_MATH_H_INTERNAL
#endif // VC_COMMON_MATH_H_

591
Vc/common/memory.h Normal file
View File

@ -0,0 +1,591 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MEMORY_H_
#define VC_COMMON_MEMORY_H_
#include "memorybase.h"
#include <assert.h>
#include <algorithm>
#include <cstring>
#include <cstddef>
#include <initializer_list>
#include "memoryfwd.h"
#include "malloc.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template<typename V, size_t Size> struct _MemorySizeCalculation
{
enum AlignmentCalculations {
Alignment = V::Size,
AlignmentMask = Alignment - 1,
MaskedSize = Size & AlignmentMask,
Padding = Alignment - MaskedSize,
PaddedSize = MaskedSize == 0 ? Size : Size + Padding
};
};
/**
* \ingroup Containers
* \headerfile memory.h <Vc/Memory>
*
* A helper class for fixed-size two-dimensional arrays.
*
* \param V The vector type you want to operate on. (e.g. float_v or uint_v)
* \param Size1 Number of rows
* \param Size2 Number of columns
*/
template <typename V, size_t Size1, size_t Size2, bool InitPadding>
class Memory : public MemoryBase<V, Memory<V, Size1, Size2, InitPadding>, 2,
Memory<V, Size2, 0, InitPadding>>
{
public:
typedef typename V::EntryType EntryType;
private:
using RowMemory = Memory<V, Size2, 0, InitPadding>;
typedef MemoryBase<V, Memory<V, Size1, Size2, InitPadding>, 2, RowMemory> Base;
friend class MemoryBase<V, Memory<V, Size1, Size2, InitPadding>, 2, RowMemory>;
friend class MemoryDimensionBase<V, Memory<V, Size1, Size2, InitPadding>, 2,
RowMemory>;
enum : size_t {
Alignment = V::MemoryAlignment,
PaddedSize2 = _MemorySizeCalculation<V, Size2>::PaddedSize
};
alignas(static_cast<size_t>(Alignment)) // GCC complains about 'is not an
// integer constant' unless the
// static_cast is present
RowMemory m_mem[Size1];
public:
using Base::vector;
enum Constants {
RowCount = Size1,
VectorsCount = PaddedSize2 / V::Size
};
Memory() = default;
/**
* \return the number of rows in the array.
*
* \note This function can be eliminated by an optimizing compiler.
*/
static constexpr size_t rowsCount() { return RowCount; }
/**
* \return the number of scalar entries in the whole array.
*
* \warning Do not use this function for scalar iteration over the array since there will be
* padding between rows if \c Size2 is not divisible by \c V::Size.
*
* \note This function can be optimized into a compile-time constant.
*/
static constexpr size_t entriesCount() { return Size1 * Size2; }
/**
* \return the number of vectors in the whole array.
*
* \note This function can be optimized into a compile-time constant.
*/
static constexpr size_t vectorsCount() { return VectorsCount * Size1; }
/**
* Copies the data from a different object.
*
* \param rhs The object to copy the data from.
*
* \return reference to the modified Memory object.
*
* \note Both objects must have the exact same vectorsCount().
*/
template<typename Parent, typename RM>
Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase<V, Parent, 2, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
return *this;
}
Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) {
Detail::copyVectors(*this, rhs);
return *this;
}
/**
* Initialize all data with the given vector.
*
* \param v This vector will be used to initialize the memory.
*
* \return reference to the modified Memory object.
*/
inline Memory &operator=(const V &v) {
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) = v;
}
return *this;
}
};
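/* Usage sketch for the two-dimensional variant above (sizes are illustrative):
*   Vc::Memory<float_v, 3, 11> m;   // 3 rows of 11 entries, each row padded separately
*   for (size_t r = 0; r < m.rowsCount(); ++r)
*       for (size_t v = 0; v < m[r].vectorsCount(); ++v)
*           m[r].vector(v) = float_v::Zero();
*/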
/**
* A helper class to simplify usage of correctly aligned and padded memory, allowing both vector and
* scalar access.
*
* Example:
* \code
Vc::Memory<int_v, 11> array;
// scalar access:
for (size_t i = 0; i < array.entriesCount(); ++i) {
int x = array[i]; // read
array[i] = x; // write
}
// more explicit alternative:
for (size_t i = 0; i < array.entriesCount(); ++i) {
int x = array.scalar(i); // read
array.scalar(i) = x; // write
}
// vector access:
for (size_t i = 0; i < array.vectorsCount(); ++i) {
int_v x = array.vector(i); // read
array.vector(i) = x; // write
}
* \endcode
* This code allocates a small array and implements three equivalent loops (that do nothing useful).
* The loops show how scalar and vector read/write access is best implemented.
*
* Since the size of 11 is not a multiple of int_v::Size (unless you use the
* scalar Vc implementation) the last write access of the vector loop would normally be out of
* bounds. But the Memory class automatically pads the memory such that the whole array can be
* accessed with correctly aligned memory addresses.
*
* \param V The vector type you want to operate on. (e.g. float_v or uint_v)
* \param Size The number of entries of the scalar base type the memory should hold. This
* is thus the same number as you would use for a normal C array (e.g. float mem[11] becomes
* Memory<float_v, 11> mem).
*
* \see Memory<V, 0u>
*
* \ingroup Containers
* \headerfile memory.h <Vc/Memory>
*/
template <typename V, size_t Size, bool InitPadding>
class Memory<V, Size, 0u, InitPadding> :
public MemoryBase<V, Memory<V, Size, 0u, InitPadding>, 1, void>
{
public:
typedef typename V::EntryType EntryType;
private:
typedef MemoryBase<V, Memory<V, Size, 0u, InitPadding>, 1, void> Base;
friend class MemoryBase<V, Memory<V, Size, 0u, InitPadding>, 1, void>;
friend class MemoryDimensionBase<V, Memory<V, Size, 0u, InitPadding>, 1, void>;
enum : size_t {
Alignment = V::MemoryAlignment, // in Bytes
MaskedSize = Size & (V::Size - 1), // the fraction of Size that exceeds
// an integral multiple of V::Size
Padding = V::Size - MaskedSize,
PaddedSize = MaskedSize == 0 ? Size : Size + Padding
};
alignas(static_cast<size_t>(Alignment)) // GCC complains about 'is not an
// integer constant' unless the
// static_cast is present
EntryType m_mem[PaddedSize];
public:
using Base::vector;
enum Constants {
EntriesCount = Size,
VectorsCount = PaddedSize / V::Size
};
Memory()
{
if (InitPadding) {
Base::lastVector() = V::Zero();
}
}
Memory(std::initializer_list<EntryType> init)
{
Vc_ASSERT(init.size() <= Size);
Base::lastVector() = V::Zero();
std::copy(init.begin(), init.end(), &m_mem[0]);
}
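// Construction sketch for the initializer_list constructor above (the size is illustrative):
//   Vc::Memory<float_v, 8> m{1.f, 2.f, 3.f};  // copies three entries after zeroing the last vector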
/**
* Wrap existing data with the Memory convenience class.
*
* This function returns a \em reference to a Memory<V, Size, 0> object that you must
* capture to avoid a copy of the whole data:
* \code
* Memory<float_v, 16> &m = Memory<float_v, 16>::fromRawData(someAlignedPointerToFloat)
* \endcode
*
* \param ptr An aligned pointer to memory of type \p V::EntryType (e.g. \c float for
* Vc::float_v).
* \return A Memory object placed at the given location in memory.
*
* \warning The pointer \p ptr passed to this function must be aligned according to the
* alignment restrictions of \p V.
* \warning The size of the accessible memory must match \p Size. This includes the
* required padding at the end to allow the last entries to be accessed via vectors. If
* you know what you are doing you might violate this constraint.
* \warning It is your responsibility to ensure that the memory is released correctly
* (not too early/not leaked). This function simply adds convenience functions to \em
* access the memory.
*/
static Vc_ALWAYS_INLINE Vc_CONST Memory<V, Size, 0u, false> &fromRawData(EntryType *ptr)
{
// DANGER! This placement new has to use the right address. If the compiler decides
// RowMemory requires padding before the actual data then the address has to be adjusted
// accordingly
char *addr = reinterpret_cast<char *>(ptr);
typedef Memory<V, Size, 0u, false> MM;
addr -= offsetof(MM, m_mem);
return *new(addr) MM;
}
/**
* \return the number of scalar entries in the whole array.
*
* \note This function can be optimized into a compile-time constant.
*/
static constexpr size_t entriesCount() { return EntriesCount; }
/**
* \return the number of vectors in the whole array.
*
* \note This function can be optimized into a compile-time constant.
*/
static constexpr size_t vectorsCount() { return VectorsCount; }
inline Memory(const Memory &rhs)
{
Detail::copyVectors(*this, rhs);
}
template <size_t S> inline Memory(const Memory<V, S> &rhs)
{
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
}
inline Memory &operator=(const Memory &rhs)
{
Detail::copyVectors(*this, rhs);
return *this;
}
template <size_t S> inline Memory &operator=(const Memory<V, S> &rhs)
{
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
return *this;
}
Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) {
std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType));
return *this;
}
inline Memory &operator=(const V &v) {
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) = v;
}
return *this;
}
};
/**
* A helper class that is very similar to Memory<V, Size> but with dynamically allocated memory and
* thus dynamic size.
*
* Example:
* \code
size_t size = 11;
Vc::Memory<int_v> array(size);
// scalar access:
for (size_t i = 0; i < array.entriesCount(); ++i) {
array[i] = i;
}
// vector access:
for (size_t i = 0; i < array.vectorsCount(); ++i) {
array.vector(i) = int_v::IndexesFromZero() + i * int_v::Size;
}
* \endcode
* This code allocates a small array with 11 scalar entries
* and implements two equivalent loops that initialize the memory.
* The scalar loop writes each individual int. The vectorized loop writes int_v::Size values to
* memory per iteration. Since the size of 11 is not a multiple of int_v::Size (unless you use the
* scalar Vc implementation) the last write access of the vector loop would normally be out of
* bounds. But the Memory class automatically pads the memory such that the whole array can be
* accessed with correctly aligned memory addresses.
* (Note: the scalar loop can be auto-vectorized, except for the last three assignments.)
*
* \note The internal data pointer is not declared with the \c __restrict__ keyword. Therefore
* modifying memory of V::EntryType will require the compiler to assume aliasing. If you want to use
* the \c __restrict__ keyword you need to use a standard pointer to memory and do the vector
* address calculation and loads and stores manually.
*
* \param V The vector type you want to operate on. (e.g. float_v or uint_v)
*
* \see Memory<V, Size>
*
* \ingroup Containers
* \headerfile memory.h <Vc/Memory>
*/
template<typename V> class Memory<V, 0u, 0u, true> : public MemoryBase<V, Memory<V, 0u, 0u, true>, 1, void>
{
public:
typedef typename V::EntryType EntryType;
private:
typedef MemoryBase<V, Memory<V>, 1, void> Base;
friend class MemoryBase<V, Memory<V>, 1, void>;
friend class MemoryDimensionBase<V, Memory<V>, 1, void>;
enum InternalConstants {
Alignment = V::Size,
AlignmentMask = Alignment - 1
};
size_t m_entriesCount;
size_t m_vectorsCount;
EntryType *m_mem;
size_t calcPaddedEntriesCount(size_t x)
{
size_t masked = x & AlignmentMask;
return (masked == 0 ? x : x + (Alignment - masked));
}
public:
using Base::vector;
/**
* Allocate enough memory to access \p size values of type \p V::EntryType.
*
* The allocated memory is aligned and padded correctly for fully vectorized access.
*
* \param size Determines how many scalar values will fit into the allocated memory.
*/
Vc_ALWAYS_INLINE Memory(size_t size)
: m_entriesCount(size),
m_vectorsCount(calcPaddedEntriesCount(m_entriesCount)),
m_mem(Vc::malloc<EntryType, Vc::AlignOnVector>(m_vectorsCount))
{
m_vectorsCount /= V::Size;
Base::lastVector() = V::Zero();
}
/**
* Copy the memory into a new memory area.
*
* The allocated memory is aligned and padded correctly for fully vectorized access.
*
* \param rhs The Memory object to copy from.
*/
template<typename Parent, typename RM>
Vc_ALWAYS_INLINE Memory(const MemoryBase<V, Parent, 1, RM> &rhs)
: m_entriesCount(rhs.entriesCount()),
m_vectorsCount(rhs.vectorsCount()),
m_mem(Vc::malloc<EntryType, Vc::AlignOnVector>(m_vectorsCount * V::Size))
{
Detail::copyVectors(*this, rhs);
}
/**
* Overload of the above function.
*
* (Because C++ would otherwise not use the templated copy constructor and would fall back to the implicitly generated one instead.)
*
* \param rhs The Memory object to copy from.
*/
Vc_ALWAYS_INLINE Memory(const Memory &rhs)
: m_entriesCount(rhs.entriesCount()),
m_vectorsCount(rhs.vectorsCount()),
m_mem(Vc::malloc<EntryType, Vc::AlignOnVector>(m_vectorsCount * V::Size))
{
Detail::copyVectors(*this, rhs);
}
/**
* Frees the memory which was allocated in the constructor.
*/
Vc_ALWAYS_INLINE ~Memory()
{
Vc::free(m_mem);
}
/**
* Swap the contents and size information of two Memory objects.
*
* \param rhs The other Memory object to swap.
*/
inline void swap(Memory &rhs) {
std::swap(m_mem, rhs.m_mem);
std::swap(m_entriesCount, rhs.m_entriesCount);
std::swap(m_vectorsCount, rhs.m_vectorsCount);
}
/**
* \return the number of scalar entries in the whole array.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return m_entriesCount; }
/**
* \return the number of vectors in the whole array.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return m_vectorsCount; }
/**
* Overwrite all entries with the values stored in \p rhs.
*
* \param rhs The object to copy the data from.
*
* \return reference to the modified Memory object.
*
* \note this function requires the vectorsCount() of both Memory objects to be equal.
*/
template<typename Parent, typename RM>
Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase<V, Parent, 1, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
return *this;
}
Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
return *this;
}
/**
* Overwrite all entries with the values stored in the memory at \p rhs.
*
* \param rhs The array to copy the data from.
*
* \return reference to the modified Memory object.
*
* \note this function requires that there are entriesCount() many values accessible from \p rhs.
*/
Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) {
std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType));
return *this;
}
};
/**
* Prefetch the cacheline containing \p addr for a single read access.
*
* This prefetch completely bypasses the cache, not evicting any other data.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr)
{
Vc::Detail::prefetchForOneRead(addr, VectorAbi::Best<float>());
}
/**
* Prefetch the cacheline containing \p addr for modification.
*
* This prefetch evicts data from the cache. So use it only for data you really will use. When the
* target system supports it the cacheline will be marked as modified while prefetching, saving work
* later on.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchForModify(const void *addr)
{
Vc::Detail::prefetchForModify(addr, VectorAbi::Best<float>());
}
/**
* Prefetch the cacheline containing \p addr to L1 cache.
*
* This prefetch evicts data from the cache. So use it only for data you really will use.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchClose(const void *addr)
{
Vc::Detail::prefetchClose(addr, VectorAbi::Best<float>());
}
/**
* Prefetch the cacheline containing \p addr to L2 cache.
*
* This prefetch evicts data from the cache. So use it only for data you really will use.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchMid(const void *addr)
{
Vc::Detail::prefetchMid(addr, VectorAbi::Best<float>());
}
/**
* Prefetch the cacheline containing \p addr to L3 cache.
*
* This prefetch evicts data from the cache. So use it only for data you really will use.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchFar(const void *addr)
{
Vc::Detail::prefetchFar(addr, VectorAbi::Best<float>());
}
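// Usage sketch for the prefetch helpers above (the prefetch distance of 16 vectors is
// illustrative and workload dependent; bounds handling is omitted):
//   for (size_t i = 0; i < mem.vectorsCount(); ++i) {
//       Vc::prefetchForOneRead(&mem[(i + 16) * float_v::Size]);
//       sum += float_v(mem.vector(i));
//   }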
} // namespace Common
using Common::Memory;
using Common::prefetchForOneRead;
using Common::prefetchForModify;
using Common::prefetchClose;
using Common::prefetchMid;
using Common::prefetchFar;
} // namespace Vc
namespace std
{
template<typename V> Vc_ALWAYS_INLINE void swap(Vc::Memory<V> &a, Vc::Memory<V> &b) { a.swap(b); }
} // namespace std
#endif // VC_COMMON_MEMORY_H_

819
Vc/common/memorybase.h Normal file
View File

@ -0,0 +1,819 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MEMORYBASE_H_
#define VC_COMMON_MEMORYBASE_H_
#include <assert.h>
#include <type_traits>
#include <iterator>
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
#define Vc_MEM_OPERATOR_EQ(op) \
template<typename T> \
Vc_ALWAYS_INLINE enable_if_mutable<T, MemoryVector &> operator op##=(const T &x) { \
const V v = value() op x; \
v.store(&m_data[0], Flags()); \
return *this; \
}
/*dox{{{*/
/**
* Helper class for the Memory::vector(size_t) class of functions.
*
* You will never need to directly make use of this class. It is an implementation detail of the
* Memory API.
*
* \headerfile memorybase.h <Vc/Memory>
*//*}}}*/
template<typename _V, typename Flags> class MemoryVector/*{{{*/
{
typedef typename std::remove_cv<_V>::type V;
template<typename T, typename R> using enable_if_mutable =
typename std::enable_if<std::is_same<T, T>::value && !std::is_const<_V>::value, R>::type;
using EntryType =
typename std::conditional<std::is_const<_V>::value, const typename V::EntryType,
typename V::EntryType>::type;
typedef typename V::Mask Mask;
EntryType m_data[V::Size];
public:
// It is important that neither initialization nor cleanup is done as MemoryVector aliases
// other memory
Vc_INTRINSIC MemoryVector() = default;
// disable copies because this type is supposed to alias the data in a Memory object,
// nothing else
MemoryVector(const MemoryVector &) = delete;
MemoryVector(MemoryVector &&) = delete;
// Do not disable MemoryVector &operator=(const MemoryVector &) = delete; because it is
// covered nicely by the operator= below.
//! \internal
Vc_ALWAYS_INLINE Vc_PURE V value() const { return V(&m_data[0], Flags()); }
/**
* Cast to \p V operator.
*
* This function allows assigning this object to any object of type \p V.
*/
Vc_ALWAYS_INLINE Vc_PURE operator V() const { return value(); }
template<typename T>
Vc_ALWAYS_INLINE enable_if_mutable<T, MemoryVector &> operator=(const T &x) {
V v;
v = x;
v.store(&m_data[0], Flags());
return *this;
}
Vc_ALL_BINARY(Vc_MEM_OPERATOR_EQ);
Vc_ALL_ARITHMETICS(Vc_MEM_OPERATOR_EQ);
Vc_ALWAYS_INLINE EntryType &operator[](size_t i) { return m_data[i]; }
Vc_ALWAYS_INLINE const EntryType &operator[](size_t i) const { return m_data[i]; }
};
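// Sketch of how MemoryVector behaves at a call site (mem is any Vc::Memory object):
//   float_v a = mem.vector(i);   // operator V(): aligned load from the wrapped entries
//   mem.vector(i) += a;          // compound assignment: load, add, aligned store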
template<typename _V, typename Flags> class MemoryVectorIterator
{
typedef typename std::remove_cv<_V>::type V;
template<typename T, typename R> using enable_if_mutable =
typename std::enable_if<std::is_same<T, T>::value && !std::is_const<_V>::value, R>::type;
using iterator_traits = std::iterator_traits<MemoryVector<_V, Flags> *>;
MemoryVector<_V, Flags> *d;
public:
typedef typename iterator_traits::difference_type difference_type;
typedef typename iterator_traits::value_type value_type;
typedef typename iterator_traits::pointer pointer;
typedef typename iterator_traits::reference reference;
typedef typename iterator_traits::iterator_category iterator_category;
constexpr MemoryVectorIterator(MemoryVector<_V, Flags> *dd) : d(dd) {}
constexpr MemoryVectorIterator(const MemoryVectorIterator &) = default;
constexpr MemoryVectorIterator(MemoryVectorIterator &&) = default;
Vc_ALWAYS_INLINE MemoryVectorIterator &operator=(const MemoryVectorIterator &) = default;
Vc_ALWAYS_INLINE void *orderBy() const { return d; }
Vc_ALWAYS_INLINE difference_type operator-(const MemoryVectorIterator &rhs) const { return d - rhs.d; }
Vc_ALWAYS_INLINE reference operator[](size_t i) const { return d[i]; }
Vc_ALWAYS_INLINE reference operator*() const { return *d; }
Vc_ALWAYS_INLINE pointer operator->() const { return d; }
Vc_ALWAYS_INLINE MemoryVectorIterator &operator++() { ++d; return *this; }
Vc_ALWAYS_INLINE MemoryVectorIterator operator++(int) { MemoryVectorIterator r(*this); ++d; return r; }
Vc_ALWAYS_INLINE MemoryVectorIterator &operator--() { --d; return *this; }
Vc_ALWAYS_INLINE MemoryVectorIterator operator--(int) { MemoryVectorIterator r(*this); --d; return r; }
Vc_ALWAYS_INLINE MemoryVectorIterator &operator+=(size_t n) { d += n; return *this; }
Vc_ALWAYS_INLINE MemoryVectorIterator &operator-=(size_t n) { d -= n; return *this; }
Vc_ALWAYS_INLINE MemoryVectorIterator operator+(size_t n) const { return MemoryVectorIterator(d + n); }
Vc_ALWAYS_INLINE MemoryVectorIterator operator-(size_t n) const { return MemoryVectorIterator(d - n); }
};
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator==(const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() == r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator!=(const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() != r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator>=(const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() >= r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator<=(const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() <= r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator> (const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() > r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator< (const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() < r.orderBy();
}
/*}}}*/
#undef Vc_MEM_OPERATOR_EQ
#define Vc_VPH_OPERATOR(op) \
template <typename V1, typename Flags1, typename V2, typename Flags2> \
decltype(std::declval<V1>() op std::declval<V2>()) operator op( \
const MemoryVector<V1, Flags1> &x, const MemoryVector<V2, Flags2> &y) \
{ \
return x.value() op y.value(); \
}
Vc_ALL_ARITHMETICS(Vc_VPH_OPERATOR);
Vc_ALL_BINARY (Vc_VPH_OPERATOR);
Vc_ALL_COMPARES (Vc_VPH_OPERATOR);
#undef Vc_VPH_OPERATOR
template<typename V, typename Parent, typename Flags = Prefetch<>> class MemoryRange/*{{{*/
{
Parent *m_parent;
size_t m_first;
size_t m_last;
public:
MemoryRange(Parent *p, size_t firstIndex, size_t lastIndex)
: m_parent(p), m_first(firstIndex), m_last(lastIndex)
{}
MemoryVectorIterator<V, Flags> begin() const { return &m_parent->vector(m_first , Flags()); }
MemoryVectorIterator<V, Flags> end() const { return &m_parent->vector(m_last + 1, Flags()); }
};/*}}}*/
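// Sketch: MemoryRange allows range-based iteration over an interval of vectors
// (indexes are illustrative):
//   for (auto &&v : mem.range(0, mem.vectorsCount() - 1)) {
//       v += float_v::One();
//   }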
template<typename V, typename Parent, int Dimension, typename RowMemory> class MemoryDimensionBase;
template<typename V, typename Parent, typename RowMemory> class MemoryDimensionBase<V, Parent, 1, RowMemory> // {{{1
{
private:
Parent *p() { return static_cast<Parent *>(this); }
const Parent *p() const { return static_cast<const Parent *>(this); }
public:
/**
* The type of the scalar entries in the array.
*/
typedef typename V::EntryType EntryType;
/**
* Returns a pointer to the start of the allocated memory.
*/
Vc_ALWAYS_INLINE Vc_PURE EntryType *entries() { return &p()->m_mem[0]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries() const { return &p()->m_mem[0]; }
/**
* Returns the \p i-th scalar value in the memory.
*/
Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i) { return entries()[i]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i) const { return entries()[i]; }
#ifdef DOXYGEN
/**
* Cast operator to the scalar type. This allows using the object very much like a standard
* C array.
*/
Vc_ALWAYS_INLINE Vc_PURE operator EntryType*() { return entries(); }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE operator const EntryType*() const { return entries(); }
#else
// The above conversion operator allows implicit conversion to bool. To prohibit this
// conversion we use SFINAE to allow only conversion to EntryType* and void*.
template <typename T,
typename std::enable_if<
std::is_same<typename std::remove_const<T>::type, EntryType *>::value ||
std::is_same<typename std::remove_const<T>::type, void *>::value,
int>::type = 0>
Vc_ALWAYS_INLINE Vc_PURE operator T()
{
return entries();
}
template <typename T,
typename std::enable_if<std::is_same<T, const EntryType *>::value ||
std::is_same<T, const void *>::value,
int>::type = 0>
Vc_ALWAYS_INLINE Vc_PURE operator T() const
{
return entries();
}
#endif
/**
* Returns an iterable range covering the vectors with indexes \p firstIndex through
* \p lastIndex (inclusive).
*/
template<typename Flags>
Vc_ALWAYS_INLINE MemoryRange<V, Parent, Flags> range(size_t firstIndex, size_t lastIndex, Flags) {
return MemoryRange<V, Parent, Flags>(p(), firstIndex, lastIndex);
}
Vc_ALWAYS_INLINE MemoryRange<V, Parent> range(size_t firstIndex, size_t lastIndex) {
return MemoryRange<V, Parent>(p(), firstIndex, lastIndex);
}
template<typename Flags>
Vc_ALWAYS_INLINE MemoryRange<const V, Parent, Flags> range(size_t firstIndex, size_t lastIndex, Flags) const {
return MemoryRange<const V, Parent, Flags>(p(), firstIndex, lastIndex);
}
Vc_ALWAYS_INLINE MemoryRange<const V, Parent> range(size_t firstIndex, size_t lastIndex) const {
return MemoryRange<const V, Parent>(p(), firstIndex, lastIndex);
}
/**
* Returns the \p i-th scalar value in the memory.
*/
Vc_ALWAYS_INLINE EntryType &operator[](size_t i) { return entries()[i]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE const EntryType &operator[](size_t i) const { return entries()[i]; }
/**
* Uses a vector gather to combine the entries at the indexes in \p i into the returned
* vector object.
*
* \param i An integer vector. It determines the entries to be gathered.
* \returns A vector object. Modification of this object will not modify the values in
* memory.
*
* \warning The API of this function might change in future versions of Vc to additionally
* support scatters.
*/
template<typename IndexT> Vc_ALWAYS_INLINE Vc_PURE V operator[](Vector<IndexT> i) const
{
return V(entries(), i);
}
};
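// Gather sketch for the operator[](Vector<IndexT>) overload above (indexes are
// illustrative; mem is a one-dimensional Memory<float_v, N> with enough entries):
//   int_v idx = int_v::IndexesFromZero() * 2;  // {0, 2, 4, ...}
//   float_v gathered = mem[idx];               // vector gather; memory is not modified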
template<typename V, typename Parent, typename RowMemory> class MemoryDimensionBase<V, Parent, 2, RowMemory> // {{{1
{
private:
Parent *p() { return static_cast<Parent *>(this); }
const Parent *p() const { return static_cast<const Parent *>(this); }
public:
/**
* The type of the scalar entries in the array.
*/
typedef typename V::EntryType EntryType;
static constexpr size_t rowCount() { return Parent::RowCount; }
/**
* Returns a pointer to the start of the allocated memory.
*/
Vc_ALWAYS_INLINE Vc_PURE EntryType *entries(size_t x = 0) { return &p()->m_mem[x][0]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries(size_t x = 0) const { return &p()->m_mem[x][0]; }
/**
* Returns the \p i,j-th scalar value in the memory.
*/
Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i, size_t j) { return entries(i)[j]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i, size_t j) const { return entries(i)[j]; }
/**
* Returns the \p i-th row in the memory.
*/
Vc_ALWAYS_INLINE Vc_PURE RowMemory &operator[](size_t i) {
return p()->m_mem[i];
}
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const RowMemory &operator[](size_t i) const {
return p()->m_mem[i];
}
/**
* \return the number of rows in the array.
*
* \note This function can be eliminated by an optimizing compiler.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t rowsCount() const { return p()->rowsCount(); }
};
//dox{{{1
/**
* \headerfile memorybase.h <Vc/Memory>
*
* Common interface to all Memory classes, independent of allocation on the stack or heap.
*
* \param V The vector type you want to operate on. (e.g. float_v or uint_v)
* \param Parent This type is the complete type of the class that derives from MemoryBase.
* \param Dimension The number of dimensions the implementation provides.
* \param RowMemory Class to be used to work on a single row.
*/
template<typename V, typename Parent, int Dimension, typename RowMemory> class MemoryBase : public MemoryDimensionBase<V, Parent, Dimension, RowMemory> //{{{1
{
static_assert((V::size() * sizeof(typename V::EntryType)) % V::MemoryAlignment == 0,
"Vc::Memory can only be used for data-parallel types storing a number "
"of values that's a multiple of the memory alignment.");
private:
Parent *p() { return static_cast<Parent *>(this); }
const Parent *p() const { return static_cast<const Parent *>(this); }
template <class Flags>
using vector_reference = MayAlias<MemoryVector<V, Flags>> &;
template <class Flags>
using const_vector_reference = const MayAlias<MemoryVector<const V, Flags>> &;
public:
/**
* The type of the scalar entries in the array.
*/
typedef typename V::EntryType EntryType;
/**
* \return the number of scalar entries in the array. This function is optimized away
* if a constant size array is used.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return p()->entriesCount(); }
/**
* \return the number of vector entries that span the array. This function is optimized away
* if a constant size array is used.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return p()->vectorsCount(); }
using MemoryDimensionBase<V, Parent, Dimension, RowMemory>::entries;
using MemoryDimensionBase<V, Parent, Dimension, RowMemory>::scalar;
/**
* Return a (vectorized) iterator to the start of this memory object.
*/
template<typename Flags = AlignedTag>
Vc_ALWAYS_INLINE MemoryVectorIterator< V, Flags> begin(Flags flags = Flags()) { return &firstVector(flags); }
//! const overload of the above
template<typename Flags = AlignedTag>
Vc_ALWAYS_INLINE MemoryVectorIterator<const V, Flags> begin(Flags flags = Flags()) const { return &firstVector(flags); }
/**
* Return a (vectorized) iterator to the end of this memory object.
*/
template<typename Flags = AlignedTag>
Vc_ALWAYS_INLINE MemoryVectorIterator< V, Flags> end(Flags flags = Flags()) { return &lastVector(flags) + 1; }
//! const overload of the above
template<typename Flags = AlignedTag>
Vc_ALWAYS_INLINE MemoryVectorIterator<const V, Flags> end(Flags flags = Flags()) const { return &lastVector(flags) + 1; }
/**
* \param i Selects the offset, where the vector should be read.
*
* \return a smart object to wrap the \p i-th vector in the memory.
*
* The return value can be used as any other vector object. I.e. you can substitute
* something like
* \code
* float_v a = ..., b = ...;
* a += b;
* \endcode
* with
* \code
* mem.vector(i) += b;
* \endcode
*
* This function ensures that only \em aligned loads and stores are used. Thus it only allows
* accessing memory at fixed strides. If access to known offsets from the aligned vectors is
* needed the vector(size_t, int) function can be used.
*/
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE
typename std::enable_if<!std::is_convertible<Flags, int>::value,
vector_reference<Flags>>::type
vector(size_t i, Flags = Flags())
{
return *aliasing_cast<MemoryVector<V, Flags>>(&entries()[i * V::Size]);
}
/** \brief Const overload of the above function
*
* \param i Selects the offset, where the vector should be read.
*
* \return a smart object to wrap the \p i-th vector in the memory.
*/
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE
typename std::enable_if<!std::is_convertible<Flags, int>::value,
const_vector_reference<Flags>>::type
vector(size_t i, Flags = Flags()) const
{
return *aliasing_cast<MemoryVector<const V, Flags>>(&entries()[i * V::Size]);
}
/**
* \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory.
*
* Example:
* \code
* Memory<float_v, N> mem;
* mem.setZero();
* for (int i = 0; i < mem.entriesCount(); i += float_v::Size) {
* mem.vectorAt(i) += b;
* }
* \endcode
*
* \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the
* values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten.
*
* \param flags You must take care to determine whether an unaligned load/store is
* required. Per default an unaligned load/store is used. If \p i is a multiple of \c V::Size
* you may want to pass Vc::Aligned here.
*/
template <typename Flags = UnalignedTag>
Vc_ALWAYS_INLINE Vc_PURE vector_reference<Flags> vectorAt(size_t i,
Flags flags = Flags())
{
return *aliasing_cast<MemoryVector<V, Flags>>(&entries()[i]);
}
/** \brief Const overload of the above function
*
* \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory.
*
* \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the
* values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten.
*
* \param flags You must take care to determine whether an unaligned load/store is
* required. Per default an unaligned load/store is used. If \p i is a multiple of \c V::Size
* you may want to pass Vc::Aligned here.
*/
template <typename Flags = UnalignedTag>
Vc_ALWAYS_INLINE Vc_PURE const_vector_reference<Flags> vectorAt(
size_t i, Flags flags = Flags()) const
{
return *aliasing_cast<MemoryVector<const V, Flags>>(&entries()[i]);
}
/**
* \return a smart object to wrap the \p i-th vector + \p shift in the memory.
*
* This function ensures that only \em unaligned loads and stores are used.
* It allows accessing memory at any location aligned to the entry type.
*
* \param i Selects the memory location of the i-th vector. Thus if \p V::Size == 4 and
* \p i is set to 3 the base address for the load/store will be the 12th entry
* (same as \p &mem[12]).
* \param shift Shifts the base address determined by parameter \p i by \p shift many
* entries. Thus \p vector(3, 1) for \p V::Size == 4 will load/store the
* 13th - 16th entries (same as \p &mem[13]).
*
* \note Any shift value is allowed as long as you make sure it stays within bounds of the
* allocated memory. Shift values that are a multiple of \p V::Size will \em not result in
* aligned loads. You have to use the above vector(size_t) function for aligned loads
* instead.
*
* \note Thus a simple way to access vectors randomly is to set \p i to 0 and use \p shift as the
* parameter to select the memory address:
* \code
* // don't use:
* mem.vector(i / V::Size, i % V::Size) += 1;
* // instead use:
* mem.vector(0, i) += 1;
* \endcode
*/
template <typename ShiftT, typename Flags = decltype(Unaligned)>
Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if<
std::is_convertible<ShiftT, int>::value,
vector_reference<decltype(std::declval<Flags>() | Unaligned)>>::type
vector(size_t i, ShiftT shift, Flags = Flags())
{
return *aliasing_cast<
MemoryVector<V, decltype(std::declval<Flags>() | Unaligned)>>(
&entries()[i * V::Size + shift]);
}
/// Const overload of the above function.
template <typename ShiftT, typename Flags = decltype(Unaligned)>
Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if<
std::is_convertible<ShiftT, int>::value,
const_vector_reference<decltype(std::declval<Flags>() | Unaligned)>>::type
vector(size_t i, ShiftT shift, Flags = Flags()) const
{
return *aliasing_cast<
MemoryVector<const V, decltype(std::declval<Flags>() | Unaligned)>>(
&entries()[i * V::Size + shift]);
}
/**
* \return the first vector in the allocated memory.
*
* This function is simply a shorthand for vector(0).
*/
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE vector_reference<Flags> firstVector(Flags f = Flags())
{
return vector(0, f);
}
/// Const overload of the above function.
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE const_vector_reference<Flags> firstVector(
Flags f = Flags()) const
{
return vector(0, f);
}
/**
* \return the last vector in the allocated memory.
*
* This function is simply a shorthand for vector(vectorsCount() - 1).
*/
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE vector_reference<Flags> lastVector(Flags f = Flags())
{
return vector(vectorsCount() - 1, f);
}
/// Const overload of the above function.
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE const_vector_reference<Flags> lastVector(
Flags f = Flags()) const
{
return vector(vectorsCount() - 1, f);
}
Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned char *indexes) const { return V(entries(), typename V::IndexType(indexes, Vc::Unaligned)); }
Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned short *indexes) const { return V(entries(), typename V::IndexType(indexes, Vc::Unaligned)); }
Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned int *indexes) const { return V(entries(), typename V::IndexType(indexes, Vc::Unaligned)); }
Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned long *indexes) const { return V(entries(), typename V::IndexType(indexes, Vc::Unaligned)); }
/**
* Zero the whole memory area.
*/
Vc_ALWAYS_INLINE void setZero() {
V zero(Vc::Zero);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) = zero;
}
}
/**
* Assign a value to all vectors in the array.
*/
template<typename U>
Vc_ALWAYS_INLINE Parent &operator=(U &&x) {
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) = std::forward<U>(x);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to add up two arrays.
*/
template<typename P2, typename RM>
inline Parent &operator+=(const MemoryBase<V, P2, Dimension, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) += rhs.vector(i);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to subtract two arrays.
*/
template<typename P2, typename RM>
inline Parent &operator-=(const MemoryBase<V, P2, Dimension, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) -= rhs.vector(i);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to multiply two arrays.
*/
template<typename P2, typename RM>
inline Parent &operator*=(const MemoryBase<V, P2, Dimension, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) *= rhs.vector(i);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to divide two arrays.
*/
template<typename P2, typename RM>
inline Parent &operator/=(const MemoryBase<V, P2, Dimension, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) /= rhs.vector(i);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to add a value to an array.
*/
inline Parent &operator+=(EntryType rhs) {
V v(rhs);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) += v;
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to subtract a value from an array.
*/
inline Parent &operator-=(EntryType rhs) {
V v(rhs);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) -= v;
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to multiply a value to an array.
*/
inline Parent &operator*=(EntryType rhs) {
V v(rhs);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) *= v;
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to divide an array with a value.
*/
inline Parent &operator/=(EntryType rhs) {
V v(rhs);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) /= v;
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to compare two arrays for equality.
*/
template<typename P2, typename RM>
inline bool operator==(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) == V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays for inequality.
*/
template<typename P2, typename RM>
inline bool operator!=(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) == V(rhs.vector(i))).isEmpty()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays (less than).
*/
template<typename P2, typename RM>
inline bool operator<(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) < V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays (less than or equal).
*/
template<typename P2, typename RM>
inline bool operator<=(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) <= V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays (greater than).
*/
template<typename P2, typename RM>
inline bool operator>(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) > V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays (greater than or equal).
*/
template<typename P2, typename RM>
inline bool operator>=(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) >= V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
};
namespace Detail
{
template <typename V,
typename ParentL,
typename ParentR,
int Dimension,
typename RowMemoryL,
typename RowMemoryR>
inline void copyVectors(MemoryBase<V, ParentL, Dimension, RowMemoryL> &dst,
const MemoryBase<V, ParentR, Dimension, RowMemoryR> &src)
{
const size_t vectorsCount = dst.vectorsCount();
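// Unrolled main loop: copy four vectors per iteration, reading all four before
// storing so the compiler can schedule independent loads and stores.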
size_t i = 3;
for (; i < vectorsCount; i += 4) {
const V tmp3 = src.vector(i - 3);
const V tmp2 = src.vector(i - 2);
const V tmp1 = src.vector(i - 1);
const V tmp0 = src.vector(i - 0);
dst.vector(i - 3) = tmp3;
dst.vector(i - 2) = tmp2;
dst.vector(i - 1) = tmp1;
dst.vector(i - 0) = tmp0;
}
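// Tail loop: i - 3 is the first vector not yet copied; handle the remaining 0-3 vectors.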
for (i -= 3; i < vectorsCount; ++i) {
dst.vector(i) = src.vector(i);
}
}
} // namespace Detail
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_MEMORYBASE_H_
// vim: foldmethod=marker

46
Vc/common/memoryfwd.h Normal file
View File

@ -0,0 +1,46 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MEMORYFWD_H_
#define VC_COMMON_MEMORYFWD_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename V, std::size_t Size1 = 0, std::size_t Size2 = 0,
bool InitPadding = true>
class Memory;
template <typename V, typename Parent, int Dimension, typename RowMemory>
class MemoryBase;
} // namespace Common
using Common::Memory;
} // namespace Vc
#endif // VC_COMMON_MEMORYFWD_H_

258
Vc/common/operators.h Normal file
View File

@ -0,0 +1,258 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2016 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef COMMON_OPERATORS_H_
#define COMMON_OPERATORS_H_
#include "simdarray.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename T, typename Abi, typename U>
enable_if<!std::is_same<T, U>::value, U> is_convertible_to_any_vector(Vector<U, Abi>);
template <typename T, typename Abi> T is_convertible_to_any_vector(Vector<T, Abi>);
template <typename T, typename U, bool = std::is_integral<T>::value,
bool = std::is_integral<U>::value>
struct FundamentalReturnType;
template <class T, class U>
using fundamental_return_t = typename FundamentalReturnType<T, U>::type;
template <typename T, typename U> struct FundamentalReturnType<T, U, false, false> {
using type = typename std::conditional<
std::is_arithmetic<U>::value,
typename std::conditional<(sizeof(T) < sizeof(U)), U, T>::type,
// U is not arithmetic, e.g. an enum or a type with e.g. operator int()
T>::type;
};
template <typename T, typename U> struct FundamentalReturnType<T, U, true, false> {
using type = typename std::conditional<
std::is_arithmetic<U>::value, U,
// U is not arithmetic, e.g. an enum or a type with e.g. operator int()
T>::type;
};
template <typename T, typename U> struct FundamentalReturnType<T, U, false, true> {
using type = T;
};
template <typename T> struct my_make_signed : public std::make_signed<T> {
};
template <> struct my_make_signed<bool> {
using type = bool;
};
template <typename TT, typename UU>
struct higher_conversion_rank {
template <typename A>
using fix_sign =
typename std::conditional<(std::is_unsigned<TT>::value ||
std::is_unsigned<UU>::value),
typename std::make_unsigned<A>::type, A>::type;
using T = typename my_make_signed<TT>::type;
using U = typename my_make_signed<UU>::type;
template <typename Test, typename Otherwise>
using c = typename std::conditional<std::is_same<T, Test>::value ||
std::is_same<U, Test>::value,
Test, Otherwise>::type;
using type = fix_sign<c<long long, c<long, c<int, c<short, c<signed char, void>>>>>>;
};
template <typename T, typename U> struct FundamentalReturnType<T, U, true, true> {
template <bool B, class Then, class E>
using c = typename std::conditional<B, Then, E>::type;
using type =
c<(sizeof(T) > sizeof(U)), T,
c<(sizeof(T) < sizeof(U)), U, typename higher_conversion_rank<T, U>::type>>;
};
template <class V, class T, class Tq, class = void> struct ReturnTypeImpl {
// no type => SFINAE
};
// 1. Vector × Vector
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, Vector<U, Abi>, Uq, void> {
using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
};
// 2. Vector × int
template <class T, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, int, Uq, void> {
// conversion from int is always allowed (because it's the default when you hardcode a
// number)
using type = Vc::Vector<T, Abi>;
};
// 3. Vector × unsigned
template <class T, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, uint, Uq, void> {
// conversion from unsigned int is allowed for all integral Vector<T>, but ensures
// unsigned result
using type = Vc::Vector<
typename std::conditional<std::is_integral<T>::value, std::make_unsigned<T>,
std::enable_if<true, T>>::type::type,
Abi>;
};
// 4. Vector × {enum, arithmetic}
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<
Vector<T, Abi>, U, Uq,
enable_if<!std::is_class<U>::value && !std::is_same<U, int>::value &&
!std::is_same<U, uint>::value &&
Traits::is_valid_vector_argument<fundamental_return_t<T, U>>::value,
void>> {
using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
};
// 5. Vector × UDT
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<
Vector<T, Abi>, U, Uq,
enable_if<std::is_class<U>::value && !Traits::is_simd_vector<U>::value &&
Traits::is_valid_vector_argument<decltype(
is_convertible_to_any_vector<T, Abi>(std::declval<Uq>()))>::value,
void>> {
using type =
Vc::Vector<fundamental_return_t<T, decltype(is_convertible_to_any_vector<T, Abi>(
std::declval<Uq>()))>,
Abi>;
};
template <class V, class Tq, class T = remove_cvref_t<Tq>>
using ReturnType = typename ReturnTypeImpl<V, T, Tq>::type;
template <class T> struct is_a_type : public std::true_type {
};
#ifdef Vc_ENABLE_FLOAT_BIT_OPERATORS
#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) true
#else
#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) \
Detail::is_a_type<decltype(std::declval<typename R::value_type>() \
op_ std::declval<typename R::value_type>())>::value
#endif
} // namespace Detail
#define Vc_GENERIC_OPERATOR(op_) \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
R> \
operator op_(Vector<T, Abi> x, U &&y) \
{ \
return Detail::operator op_(R(x), R(std::forward<U>(y))); \
} \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
!Traits::is_simd_vector<U>::value && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
R> \
operator op_(U &&x, Vector<T, Abi> y) \
{ \
return Detail::operator op_(R(std::forward<U>(x)), R(y)); \
} \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
Vector<T, Abi> &> \
operator op_##=(Vector<T, Abi> &x, U &&y) \
{ \
x = Detail::operator op_(R(x), R(std::forward<U>(y))); \
return x; \
}
#define Vc_LOGICAL_OPERATOR(op_) \
template <class T, class Abi> \
Vc_ALWAYS_INLINE typename Vector<T, Abi>::Mask operator op_(Vector<T, Abi> x, \
Vector<T, Abi> y) \
{ \
return !!x op_ !!y; \
} \
template <class T, class Abi, class U> \
Vc_ALWAYS_INLINE \
enable_if<std::is_convertible<Vector<T, Abi>, Vector<U, Abi>>::value && \
std::is_convertible<Vector<U, Abi>, Vector<T, Abi>>::value, \
typename Detail::ReturnType<Vector<T, Abi>, Vector<U, Abi>>::Mask> \
operator op_(Vector<T, Abi> x, Vector<U, Abi> y) \
{ \
return !!x op_ !!y; \
} \
template <class T, class Abi, class U> \
Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
typename Vector<T, Abi>::Mask> \
operator op_(Vector<T, Abi> x, U &&y) \
{ \
using M = typename Vector<T, Abi>::Mask; \
return !!x op_ M(!!std::forward<U>(y)); \
} \
template <class T, class Abi, class U> \
Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
typename Vector<T, Abi>::Mask> \
operator op_(U &&x, Vector<T, Abi> y) \
{ \
using M = typename Vector<T, Abi>::Mask; \
return M(!!std::forward<U>(x)) op_ !!y; \
}
#define Vc_COMPARE_OPERATOR(op_) \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
typename R::Mask> \
operator op_(Vector<T, Abi> x, U &&y) \
{ \
return Detail::operator op_(R(x), R(std::forward<U>(y))); \
} \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE \
enable_if<!Traits::is_simd_vector_internal<remove_cvref_t<U>>::value && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
typename R::Mask> \
operator op_(U &&x, Vector<T, Abi> y) \
{ \
return Detail::operator op_(R(std::forward<U>(x)), R(y)); \
}
Vc_ALL_LOGICAL (Vc_LOGICAL_OPERATOR);
Vc_ALL_BINARY (Vc_GENERIC_OPERATOR);
Vc_ALL_ARITHMETICS(Vc_GENERIC_OPERATOR);
Vc_ALL_COMPARES (Vc_COMPARE_OPERATOR);
#undef Vc_LOGICAL_OPERATOR
#undef Vc_GENERIC_OPERATOR
#undef Vc_COMPARE_OPERATOR
#undef Vc_INVALID_OPERATOR
} // namespace Vc
#endif // COMMON_OPERATORS_H_
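The ReturnTypeImpl cases above decide which Vector type a mixed-operand expression yields. Below is a minimal result-type sketch, assuming a Vc 1.x installation; <Vc/Vc> and the float_v/int_v/uint_v aliases are the usual public names, everything else is only there to exercise the rules and is not part of the header above.
#include <Vc/Vc>
#include <type_traits>

int main()
{
    using Vc::float_v;
    using Vc::int_v;
    using Vc::uint_v;

    float_v f = 1.5f;
    int_v i = 2;

    // case 2: Vector x int keeps the Vector type (int literals are always accepted)
    static_assert(std::is_same<decltype(f * 2), float_v>::value, "Vector x int");
    static_assert(std::is_same<decltype(i + 1), int_v>::value, "Vector x int");

    // case 3: Vector x unsigned forces an unsigned result for integral Vectors only
    static_assert(std::is_same<decltype(i + 1u), uint_v>::value, "Vector x unsigned");
    static_assert(std::is_same<decltype(f + 1u), float_v>::value, "float Vector stays float");

    // the compare operators return the Mask of the deduced result type
    static_assert(std::is_same<decltype(f < 2), float_v::Mask>::value, "compare yields a Mask");

    (void)f;
    (void)i;
    return 0;
}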

44
Vc/common/permutation.h Normal file
View File

@ -0,0 +1,44 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_PERMUTATION_H_
#define VC_COMMON_PERMUTATION_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Permutation
{
struct ReversedTag {};
constexpr ReversedTag Reversed{};
} // namespace Permutation
}
#endif // VC_COMMON_PERMUTATION_H_
// vim: foldmethod=marker

270
Vc/common/scatterimplementation.h Normal file
View File

@ -0,0 +1,270 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_
#define VC_COMMON_SCATTERIMPLEMENTATION_H_
#include "gatherimplementation.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT,
V &v,
MT *mem,
IT indexes,
typename V::MaskArgument mask)
{
indexes.setZeroInverted(static_cast<typename IT::Mask>(mask));
// NOTE: this mirrors the SetIndexZero gather strategy. The inactive indexes are
// zeroed, mem is read at the resulting indexes, and the result is blended into v;
// no store to mem happens here.
const V tmp(mem, indexes);
where(mask) | v = tmp;
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask)
{
if (Vc_IS_UNLIKELY(mask.isEmpty())) {
return;
}
Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
if (mask[i])
mem[indexes[i]] = v[i];
});
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask)
{
size_t bits = mask.toInt();
while (Vc_IS_LIKELY(bits > 0)) {
size_t i, j;
asm("bsf %[bits],%[i]\n\t"
"bsr %[bits],%[j]\n\t"
"btr %[i],%[bits]\n\t"
"btr %[j],%[bits]\n\t"
: [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
mem[indexes[i]] = v[i];
mem[indexes[j]] = v[j];
}
/* Alternative from Vc::SSE (0.7)
int bits = mask.toInt();
while (bits) {
const int i = _bit_scan_forward(bits);
bits ^= (1 << i); // btr?
mem[indexes[i]] = v[i];
}
*/
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 16> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt16(bits)) {
case 16:
v.scatter(mem, indexes);
break;
case 15:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
mem[indexes[low]] = v[low];
case 14:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 13:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 12:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 11:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 10:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 9:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 8:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 7:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 6:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 4:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 2:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 8> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt8(bits)) {
case 8:
v.scatter(mem, indexes);
break;
case 7:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
mem[indexes[low]] = v[low];
case 6:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 4:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 2:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 4> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt4(bits)) {
case 4:
v.scatter(mem, indexes);
break;
case 3:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
mem[indexes[low]] = v[low];
case 2:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 2> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low;
switch (Vc::Detail::popcnt4(bits)) {
case 2:
v.scatter(mem, indexes);
break;
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_SCATTERIMPLEMENTATION_H_
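The BitScanLoop strategy above peels the lowest and the highest active mask bit per iteration with bsf/bsr/btr inline assembly. The sketch below expresses the same idea in portable C++; lowest_bit and highest_bit are hypothetical stand-ins for the compiler's bit-scan intrinsics, and the container types are deliberately generic.
#include <cstddef>
#include <cstdint>

// hypothetical stand-ins for _bit_scan_forward / _bit_scan_reverse; bits must be non-zero
static inline std::size_t lowest_bit(std::uint32_t bits)
{
    std::size_t i = 0;
    while (!(bits & 1u)) { bits >>= 1; ++i; }
    return i;
}
static inline std::size_t highest_bit(std::uint32_t bits)
{
    std::size_t i = 31;
    while (!(bits & 0x80000000u)) { bits <<= 1; --i; }
    return i;
}

template <class Vec, class Mem, class Idx>
void scatter_bitscan(const Vec &v, Mem *mem, const Idx &indexes, std::uint32_t maskBits)
{
    while (maskBits != 0) {
        const std::size_t i = lowest_bit(maskBits);  // bsf
        const std::size_t j = highest_bit(maskBits); // bsr
        maskBits &= ~((1u << i) | (1u << j));        // two btr; when i == j only one bit clears
        mem[indexes[i]] = v[i];                      // store the lowest active lane
        mem[indexes[j]] = v[j];                      // store the highest active lane
    }
}

int main()
{
    const float v[4] = {10, 11, 12, 13};
    const int idx[4] = {3, 2, 1, 0};
    float mem[4] = {};
    scatter_bitscan(v, mem, idx, 0x0Bu); // lanes 0, 1 and 3 are active
    return (mem[3] == 10 && mem[2] == 11 && mem[0] == 13 && mem[1] == 0) ? 0 : 1;
}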

136
Vc/common/scatterinterface.h Normal file
View File

@ -0,0 +1,136 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
///////////////////////////////////////////////////////////////////////////////////////////
// scatters
// A scatter takes the following arguments:
// 1. A pointer to memory of any type that EntryType can convert to.
// 2. An indexes “vector”. The requirement is that the type implements the subscript operator,
//    stores «Size» valid index values, and each offset to the pointer above yields a valid
//    memory location for writing.
// 3. Optionally the third argument may be a mask. The mask disables several memory stores and
//    thus removes the requirements in (2.) for the disabled entries.
private:
/**\internal
* This function implements a scatter given a pointer to memory \p mem and some
* container object storing the scatter \p indexes.
*
* \param mem This pointer must be aligned correctly for the type \p MT. This is the
* natural behavior of C++, so this is typically the case.
* \param indexes This object contains at least \VSize{T} indexes that denote the
* offset in \p mem where the components for the current vector should be copied to.
* The offset is not in Bytes, but in multiples of `sizeof(MT)`.
*/
// enable_if<std::can_convert<MT, EntryType>::value && has_subscript_operator<IT>::value>
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes) const;
/**\internal
* This overload of the above function adds a \p mask argument to disable memory
* accesses at the \p indexes offsets where \p mask is \c false.
*/
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
public:
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<EntryType, MT>::value, \
"The memory pointer needs to point to a type that the EntryType of this " \
"SIMD vector type can be converted to."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
/**
* \name Scatter functions
*
* Stores a vector to the objects at `mem[indexes[0]]`, `mem[indexes[1]]`,
* `mem[indexes[2]]`, ...
*
* \param mem A pointer to memory which contains objects of type \p MT at the offsets
* given by \p indexes.
* \param indexes
* \param mask
*/
///@{
/// Scatter function
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes));
}
/// Masked scatter function
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes), mask);
}
///@}
#include "scatterinterface_deprecated.h"
/**\internal
* \name Scatter function to use from Vc::Common::subscript_operator
*
* \param args
* \param mask
*/
///@{
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
{
scatter(args.address, args.indexes);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
{
scatter(args.address, args.indexes, mask);
}
///@}
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
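A minimal usage sketch for the scatter interface documented above, assuming a Vc 1.x installation; float_v, its IndexType, the generator constructor, and the masked scatter overload are the public API shown in this file, while the array size and the values are arbitrary.
#include <Vc/Vc>
#include <cstddef>
#include <cstdio>

int main()
{
    using Vc::float_v;
    float mem[4 * float_v::Size] = {};

    const float_v v([](int n) { return 100.f + n; });          // 100, 101, 102, ...
    const float_v::IndexType idx([](int n) { return 2 * n; }); // 0, 2, 4, ...

    v.scatter(mem, idx);              // unmasked: mem[0], mem[2], ... receive v[0], v[1], ...
    v.scatter(mem, idx + 1, v > 100); // masked: only lanes with v > 100 store to the odd slots

    for (std::size_t i = 0; i < 4 * float_v::Size; ++i) {
        std::printf("%g ", mem[i]);
    }
    std::printf("\n");
    return 0;
}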

147
Vc/common/scatterinterface_deprecated.h Normal file
View File

@ -0,0 +1,147 @@
/// \name Deprecated Members
///@{
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, EntryType S1::*member1,
IT indexes) const
{
scatter(Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.scatterArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, EntryType S1::*member1,
IT indexes, MaskArgument mask) const
{
scatter(Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.scatterArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, S2 S1::*member1,
EntryType S2::*member2,
IT indexes) const
{
scatter(Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.scatterArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, S2 S1::*member1,
EntryType S2::*member2, IT indexes,
MaskArgument mask) const
{
scatter(Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.scatterArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param outerIndexes
* \param innerIndexes
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1,
IT1 outerIndexes,
IT2 innerIndexes) const
{
scatter(Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.scatterArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param outerIndexes
* \param innerIndexes
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes,
MaskArgument mask) const
{
scatter(Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.scatterArguments(),
mask);
}
///@}
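For reference, this is how a call to the first deprecated struct-member scatter above looks. Point and store_x are made-up names; the snippet assumes a Vc 1.x installation and only illustrates the signature documented in this file.
#include <Vc/Vc>

struct Point { float x, y; };

void store_x(Point *points, Vc::float_v values, Vc::float_v::IndexType indexes)
{
    // writes values[i] to points[indexes[i]].x for every lane i
    // (deprecated: the replacement is subscripting a Vc::array / Vc::vector with the
    // index vector, as the deprecation messages above suggest)
    values.scatter(points, &Point::x, indexes);
}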

92
Vc/common/set.h Normal file
View File

@ -0,0 +1,92 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SET_H_
#define VC_COMMON_SET_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
static Vc_INTRINSIC Vc_CONST __m128i set(unsigned short x0, unsigned short x1, unsigned short x2, unsigned short x3,
unsigned short x4, unsigned short x5, unsigned short x6, unsigned short x7)
{
#if defined(Vc_GNU_ASM)
#if 0 // defined(__x86_64__)
// it appears that the 32bit variant is always faster
__m128i r;
unsigned long long tmp0 = x3; tmp0 = (tmp0 << 16) | x2;
unsigned long long tmp1 = x1; tmp1 = (tmp1 << 16) | x0;
asm("vmovq %1,%0" : "=x"(r) : "r"((tmp0 << 32) | tmp1));
unsigned long long tmp2 = x7; tmp2 = (tmp2 << 16) | x6;
unsigned long long tmp3 = x5; tmp3 = (tmp3 << 16) | x4;
asm("vpinsrq $1,%1,%0,%0" : "+x"(r) : "r"((tmp2 << 32) | tmp3));
return r;
#elif defined(Vc_USE_VEX_CODING)
__m128i r0, r1;
unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
asm("vmovd %1,%0" : "=x"(r0) : "r"(tmp0));
asm("vpinsrd $1,%1,%0,%0" : "+x"(r0) : "r"(tmp1));
asm("vmovd %1,%0" : "=x"(r1) : "r"(tmp2));
asm("vpinsrd $1,%1,%0,%0" : "+x"(r1) : "r"(tmp3));
asm("vpunpcklqdq %1,%0,%0" : "+x"(r0) : "x"(r1));
return r0;
#else
__m128i r0, r1;
unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
asm("movd %1,%0" : "=x"(r0) : "r"(tmp0));
asm("pinsrd $1,%1,%0" : "+x"(r0) : "r"(tmp1));
asm("movd %1,%0" : "=x"(r1) : "r"(tmp2));
asm("pinsrd $1,%1,%0" : "+x"(r1) : "r"(tmp3));
asm("punpcklqdq %1,%0" : "+x"(r0) : "x"(r1));
return r0;
#endif
#else
unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
#endif
}
static Vc_INTRINSIC Vc_CONST __m128i set(short x0, short x1, short x2, short x3, short x4, short x5, short x6, short x7)
{
return set(static_cast<unsigned short>(x0), static_cast<unsigned short>(x1), static_cast<unsigned short>(x2),
static_cast<unsigned short>(x3), static_cast<unsigned short>(x4), static_cast<unsigned short>(x5),
static_cast<unsigned short>(x6), static_cast<unsigned short>(x7));
}
} // anonymous namespace
} // namespace Vc
#endif // VC_COMMON_SET_H_
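The fallback branch of set() above packs pairs of 16-bit values into 32-bit words and hands them to _mm_setr_epi32. The standalone check below uses only SSE2 intrinsics (no Vc headers) and confirms that this packing produces the same register contents as _mm_setr_epi16 on a little-endian x86 target.
#include <emmintrin.h>
#include <cstring>

int main()
{
    const unsigned short x[8] = {1, 2, 3, 4, 5, 6, 7, 0x7fff};

    const unsigned int tmp0 = (static_cast<unsigned int>(x[1]) << 16) | x[0];
    const unsigned int tmp1 = (static_cast<unsigned int>(x[3]) << 16) | x[2];
    const unsigned int tmp2 = (static_cast<unsigned int>(x[5]) << 16) | x[4];
    const unsigned int tmp3 = (static_cast<unsigned int>(x[7]) << 16) | x[6];
    const __m128i packed = _mm_setr_epi32(static_cast<int>(tmp0), static_cast<int>(tmp1),
                                          static_cast<int>(tmp2), static_cast<int>(tmp3));

    const __m128i reference =
        _mm_setr_epi16(static_cast<short>(x[0]), static_cast<short>(x[1]),
                       static_cast<short>(x[2]), static_cast<short>(x[3]),
                       static_cast<short>(x[4]), static_cast<short>(x[5]),
                       static_cast<short>(x[6]), static_cast<short>(x[7]));

    return std::memcmp(&packed, &reference, sizeof(__m128i)) == 0 ? 0 : 1;
}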

68
Vc/common/simd_cast.h Normal file
View File

@ -0,0 +1,68 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMD_CAST_H_
#define VC_COMMON_SIMD_CAST_H_
#include <type_traits>
#include "macros.h"
// declare a bogus simd_cast function template in the global namespace to enable ADL for
// simd_cast<T>
template <class> void simd_cast();
namespace Vc_VERSIONED_NAMESPACE
{
/**
* Casts the argument \p x from type \p From to type \p To.
*
* This function implements the trivial case where \p To and \p From are the same type.
*
* \param x The object of type \p From to be converted to type \p To.
* \returns An object of type \p To with all vector components converted according to
* standard conversion behavior as mandated by the C++ standard for the
* underlying arithmetic types.
*/
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From &&x, enable_if<std::is_same<To, Traits::decay<From>>::value> = nullarg)
{
return std::forward<From>(x);
}
/**
* A cast from nothing results in a value-initialized (zero-initialized) \p To.
*
* This function can be useful in generic code where a parameter pack expands to nothing.
*
* \returns A zero-initialized object of type \p To.
*/
template <typename To> Vc_INTRINSIC Vc_CONST To simd_cast() { return To(); }
} // namespace Vc
#endif // VC_COMMON_SIMD_CAST_H_
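A short sketch of the two overloads defined above, assuming a Vc installation: the first call hits the trivial same-type case, the second the nullary case used when a parameter pack expands to nothing. The function names are invented for illustration.
#include <Vc/Vc>

Vc::float_v passthrough(Vc::float_v x)
{
    // trivial case: To equals the decayed From, x is forwarded unchanged
    return Vc::simd_cast<Vc::float_v>(x);
}

Vc::int_v nothing_to_cast()
{
    // nullary case: handy when a parameter pack expands to zero arguments
    return Vc::simd_cast<Vc::int_v>(); // value-initialized, i.e. all zeros
}

int main()
{
    const int sum = int(passthrough(Vc::float_v(1.f))[0]) + int(nothing_to_cast()[0]);
    return sum == 1 ? 0 : 1;
}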

79
Vc/common/simd_cast_caller.tcc Normal file
View File

@ -0,0 +1,79 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMD_CAST_CALLER_TCC_
#define VC_COMMON_SIMD_CAST_CALLER_TCC_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE {
template <class T, std::size_t N, class VectorType>
template <class U, class V, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
const SimdMaskArray<U, N, V> &x)
: data(simd_cast<mask_type>(internal_data(x)))
{
}
template <class T, std::size_t N, class VectorType>
template <class U, class V, class, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
const SimdMaskArray<U, N, V> &x)
: data(simd_cast<mask_type>(internal_data(internal_data0(x)),
internal_data(internal_data1(x))))
{
}
template <class T, std::size_t N, class VectorType>
template <class U, class V, class, class, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
const SimdMaskArray<U, N, V> &x)
: data(simd_cast<mask_type>(internal_data(internal_data0(internal_data0(x))),
internal_data(internal_data1(internal_data0(x))),
internal_data(internal_data0(internal_data1(x))),
internal_data(internal_data1(internal_data1(x)))))
{
}
// conversion from any Segment object (could be SimdMaskArray or Mask<T>)
template <class T, std::size_t N, class VectorType>
template <class M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
Common::Segment<M, Pieces, Index> &&x,
enable_if<Traits::simd_vector_size<M>::value == Size * Pieces>)
: data(simd_cast<mask_type, Index>(x.data))
{
}
// conversion from Mask<T>
template <class T, std::size_t N, class VectorType>
template <class M, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(M k)
: data(simd_cast<mask_type>(k))
{
}
} // namespace Vc_VERSIONED_NAMESPACE
#endif // VC_COMMON_SIMD_CAST_CALLER_TCC_
// vim: foldmethod=marker
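A sketch of what the converting constructors above enable, assuming a Vc installation: masks with the same entry count but different value types interconvert, with the constructor picking the simd_cast overload that matches how many native mask registers back each side. The width 8 and the function name are arbitrary.
#include <Vc/Vc>

Vc::SimdMaskArray<float, 8> to_float_mask(const Vc::SimdMaskArray<int, 8> &k)
{
    return Vc::SimdMaskArray<float, 8>(k);
}

int main()
{
    const Vc::SimdArray<int, 8> a([](int n) { return n; });
    const auto k = a > 3; // mask with 4 active entries
    return to_float_mask(k).count() == 4 ? 0 : 1;
}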

2778
Vc/common/simdarray.h Normal file

File diff suppressed because it is too large

210
Vc/common/simdarrayfwd.h Normal file
View File

@ -0,0 +1,210 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMDARRAYFWD_H_
#define VC_COMMON_SIMDARRAYFWD_H_
#include "../scalar/types.h"
#include "../sse/types.h"
#include "../avx/types.h"
#include "utility.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
// specialization of Vector for fixed_size<N> {{{
template <class T, int N>
class Vector<T, simd_abi::fixed_size<N>> : public SimdArray<T, N>
{
using SimdArray<T, N>::SimdArray;
public:
// overload copy to force argument passing via the stack. This makes the type more
// usable on ABI boundaries
Vc_INTRINSIC Vector(const Vector &x) : SimdArray<T, N>(x) {}
Vc_INTRINSIC Vector &operator=(const Vector &x)
{
SimdArray<T, N>::operator=(x);
return *this;
}
Vector() = default;
using abi_type = simd_abi::fixed_size<N>;
using abi = abi_type;
Vc_DEPRECATED("use Vector([](int n) { return n; }) instead of "
"Vector::IndexesFromZero()") static Vector IndexesFromZero()
{
return Vector([](size_t i) -> T { return i; });
}
Vc_DEPRECATED("use 0 instead of Vector::Zero()") static Vector Zero() { return 0; }
Vc_DEPRECATED("use 1 instead of Vector::One()") static Vector One() { return 1; }
};
template <class T, int N>
class Mask<T, simd_abi::fixed_size<N>> : public SimdMaskArray<T, N>
{
using SimdMaskArray<T, N>::SimdMaskArray;
public:
// overload copy to force argument passing via the stack. This makes the type more
// usable on ABI boundaries
Vc_INTRINSIC Mask(const Mask &x) : SimdMaskArray<T, N>(x) {}
Vc_INTRINSIC Mask &operator=(const Mask &x)
{
SimdMaskArray<T, N>::operator=(x);
return *this;
}
Mask() = default;
using abi_type = simd_abi::fixed_size<N>;
using abi = abi_type;
};
// }}}
/** \internal
* Simple traits for SimdArray to easily access internal types of non-atomic SimdArray
* types.
*/
template <typename T, std::size_t N> struct SimdArrayTraits {
static constexpr std::size_t N0 = Common::left_size<N>();
static constexpr std::size_t N1 = Common::right_size<N>();
using storage_type0 = fixed_size_simd<T, N0>;
using storage_type1 = fixed_size_simd<T, N1>;
};
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L V &internal_data(SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L const V &internal_data(const SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
namespace Traits
{
// is_fixed_size_simd {{{1
template <class T> struct is_fixed_size_simd : std::false_type {
};
template <class T, int N>
struct is_fixed_size_simd<fixed_size_simd<T, N>> : std::true_type {
};
template <class T, int N>
struct is_fixed_size_simd<fixed_size_simd_mask<T, N>> : std::true_type {
};
// is_simd_vector_internal {{{1
template <class T, int N>
struct is_simd_vector_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {};
// is_simd_mask_internal {{{1
template <class T, int N>
struct is_simd_mask_internal<fixed_size_simd_mask<T, N>> : is_valid_vector_argument<T> {};
// is_atomic_simdarray_internal {{{1
template <typename T, std::size_t N, typename V>
struct is_atomic_simdarray_internal<SimdArray<T, N, V, N>> : is_valid_vector_argument<T> {};
template <typename T, int N>
struct is_atomic_simdarray_internal<fixed_size_simd<T, N>>
: is_atomic_simdarray_internal<SimdArray<T, N>> {
};
// is_atomic_simd_mask_array_internal {{{1
template <typename T, std::size_t N, typename V>
struct is_atomic_simd_mask_array_internal<SimdMaskArray<T, N, V, N>>
: is_valid_vector_argument<T> {
};
template <typename T, int N>
struct is_atomic_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
: is_atomic_simd_mask_array_internal<SimdMaskArray<T, N>> {
};
// is_simdarray_internal {{{1
template <typename T, std::size_t N, typename VectorType, std::size_t M>
struct is_simdarray_internal<SimdArray<T, N, VectorType, M>>
: is_valid_vector_argument<T> {
};
template <typename T, int N>
struct is_simdarray_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {
};
// is_simd_mask_array_internal {{{1
template <typename T, std::size_t N, typename VectorType, std::size_t M>
struct is_simd_mask_array_internal<SimdMaskArray<T, N, VectorType, M>>
: is_valid_vector_argument<T> {
};
template <typename T, int N>
struct is_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
: is_valid_vector_argument<T> {
};
// is_integral_internal {{{1
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_integral_internal<SimdArray<T, N, V, M>, false> : std::is_integral<T> {
};
// is_floating_point_internal {{{1
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_floating_point_internal<SimdArray<T, N, V, M>, false>
: std::is_floating_point<T> {
};
// is_signed_internal {{{1
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_signed_internal<SimdArray<T, N, V, M>, false> : std::is_signed<T> {
};
// is_unsigned_internal {{{1
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_unsigned_internal<SimdArray<T, N, V, M>, false> : std::is_unsigned<T> {
};
// has_no_allocated_data_impl {{{1
template <typename T, std::size_t N>
struct has_no_allocated_data_impl<Vc::SimdArray<T, N>> : std::true_type {
};
// }}}1
} // namespace Traits
} // namespace Vc
#endif // VC_COMMON_SIMDARRAYFWD_H_
// vim: foldmethod=marker
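A small sketch of the fixed_size specialization declared above, assuming a Vc installation; f8 and iota are made-up names, and the generator constructor is the replacement the deprecation message suggests for IndexesFromZero().
#include <Vc/Vc>

using f8 = Vc::Vector<float, Vc::simd_abi::fixed_size<8>>;
static_assert(f8::size() == 8, "fixed_size<8> always has exactly 8 entries");

f8 iota()
{
    // generator constructor instead of the deprecated IndexesFromZero()
    return f8([](int n) { return float(n); });
}

int main() { return iota()[7] == 7.f ? 0 : 1; }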

593
Vc/common/simdarrayhelper.h Normal file
View File

@ -0,0 +1,593 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMDARRAYHELPER_H_
#define VC_COMMON_SIMDARRAYHELPER_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
// private_init {{{
namespace
{
static constexpr struct private_init_t {} private_init = {};
} // unnamed namespace
// }}}
namespace Common
{
/// \addtogroup SimdArray
/// @{
namespace Operations/*{{{*/
{
struct tag {};
#define Vc_DEFINE_OPERATION(name_) \
struct name_ : public tag { \
template <typename V, typename... Args> \
Vc_INTRINSIC void operator()(V &v, Args &&... args) \
{ \
v.name_(std::forward<Args>(args)...); \
} \
}
Vc_DEFINE_OPERATION(gather);
Vc_DEFINE_OPERATION(scatter);
Vc_DEFINE_OPERATION(load);
Vc_DEFINE_OPERATION(store);
Vc_DEFINE_OPERATION(setZero);
Vc_DEFINE_OPERATION(setZeroInverted);
Vc_DEFINE_OPERATION(assign);
#undef Vc_DEFINE_OPERATION
#define Vc_DEFINE_OPERATION(name_, code_) \
struct name_ : public tag { \
template <typename V> Vc_INTRINSIC void operator()(V &v) { code_; } \
}
Vc_DEFINE_OPERATION(increment, ++(v));
Vc_DEFINE_OPERATION(decrement, --(v));
Vc_DEFINE_OPERATION(random, v = V::Random());
#undef Vc_DEFINE_OPERATION
#define Vc_DEFINE_OPERATION_FORWARD(name_) \
struct Forward_##name_ : public tag \
{ \
template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
Vc_INTRINSIC void operator()(decltype(name_(std::declval<Args>()...)) &v, \
Args &&... args) \
{ \
v = name_(std::forward<Args>(args)...); \
} \
template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
Vc_INTRINSIC void operator()(std::nullptr_t, Args && ... args) \
{ \
name_(std::forward<Args>(args)...); \
} \
}
Vc_DEFINE_OPERATION_FORWARD(abs);
Vc_DEFINE_OPERATION_FORWARD(asin);
Vc_DEFINE_OPERATION_FORWARD(atan);
Vc_DEFINE_OPERATION_FORWARD(atan2);
Vc_DEFINE_OPERATION_FORWARD(cos);
Vc_DEFINE_OPERATION_FORWARD(ceil);
Vc_DEFINE_OPERATION_FORWARD(copysign);
Vc_DEFINE_OPERATION_FORWARD(exp);
Vc_DEFINE_OPERATION_FORWARD(exponent);
Vc_DEFINE_OPERATION_FORWARD(fma);
Vc_DEFINE_OPERATION_FORWARD(floor);
Vc_DEFINE_OPERATION_FORWARD(frexp);
Vc_DEFINE_OPERATION_FORWARD(isfinite);
Vc_DEFINE_OPERATION_FORWARD(isinf);
Vc_DEFINE_OPERATION_FORWARD(isnan);
Vc_DEFINE_OPERATION_FORWARD(isnegative);
Vc_DEFINE_OPERATION_FORWARD(ldexp);
Vc_DEFINE_OPERATION_FORWARD(log);
Vc_DEFINE_OPERATION_FORWARD(log10);
Vc_DEFINE_OPERATION_FORWARD(log2);
Vc_DEFINE_OPERATION_FORWARD(reciprocal);
Vc_DEFINE_OPERATION_FORWARD(round);
Vc_DEFINE_OPERATION_FORWARD(rsqrt);
Vc_DEFINE_OPERATION_FORWARD(sin);
Vc_DEFINE_OPERATION_FORWARD(sincos);
Vc_DEFINE_OPERATION_FORWARD(sqrt);
Vc_DEFINE_OPERATION_FORWARD(trunc);
Vc_DEFINE_OPERATION_FORWARD(min);
Vc_DEFINE_OPERATION_FORWARD(max);
#undef Vc_DEFINE_OPERATION_FORWARD
template<typename T> using is_operation = std::is_base_of<tag, T>;
} // namespace Operations }}}
/**
* \internal
* Helper type to statically communicate segmentation of one vector register into 2^n parts
* (Pieces).
*
* Forward declaration in common/types.h.
*/
template <typename T_, std::size_t Pieces_, std::size_t Index_> struct Segment/*{{{*/
{
static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
using type = T_;
using type_decayed = typename std::decay<type>::type;
static constexpr std::size_t Pieces = Pieces_;
static constexpr std::size_t Index = Index_;
using fixed_size_type =
fixed_size_simd<conditional_t<Traits::is_simd_vector<type_decayed>::value,
typename type_decayed::EntryType, float>,
type_decayed::Size / Pieces>;
type data;
static constexpr std::size_t EntryOffset = Index * type_decayed::Size / Pieces;
// no non-const operator[] needed
decltype(std::declval<const type &>()[0]) operator[](size_t i) const { return data[i + EntryOffset]; }
fixed_size_type to_fixed_size() const
{
return simd_cast<fixed_size_type, Index>(data);
}
};/*}}}*/
//Segment<T *, ...> specialization {{{
template <typename T_, std::size_t Pieces_, std::size_t Index_>
struct Segment<T_ *, Pieces_, Index_> {
static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
using type = T_ *;
using type_decayed = typename std::decay<T_>::type;
static constexpr size_t Pieces = Pieces_;
static constexpr size_t Index = Index_;
using fixed_size_type = fixed_size_simd<
typename std::conditional<Traits::is_simd_vector<type_decayed>::value,
typename type_decayed::VectorEntryType, float>::type,
type_decayed::Size / Pieces> *;
type data;
static constexpr std::size_t EntryOffset = Index * type_decayed::size() / Pieces;
fixed_size_type to_fixed_size() const
{
return reinterpret_cast<
#ifdef Vc_GCC
// GCC might ICE if this type is declared with may_alias. If it doesn't
// ICE it warns about ignoring the attribute.
typename std::remove_pointer<fixed_size_type>::type
#else
MayAlias<typename std::remove_pointer<fixed_size_type>::type>
#endif
*>(data) +
Index;
}
//decltype(std::declval<type>()[0]) operator[](size_t i) { return data[i + EntryOffset]; }
//decltype(std::declval<type>()[0]) operator[](size_t i) const { return data[i + EntryOffset]; }
};/*}}}*/
/** \internal
Template class that is used to attach an offset value to an existing type. It is used
for IndexesFromZero construction in SimdArray. The \c data1 constructor needs to know
that the IndexesFromZero constructor requires an offset so that the whole data is
constructed as a correct sequence from `0` to `Size - 1`.
\tparam T The original type that needs the offset attached.
\tparam Offset An integral value that determines the offset in the complete SimdArray.
*/
template <typename T, std::size_t Offset> struct AddOffset
{
constexpr AddOffset() = default;
};
// class Split {{{1
/** \internal
Helper type with static functions to generically adjust arguments for the \c data0 and
\c data1 members of SimdArray and SimdMaskArray.
\tparam secondOffset The offset in number of elements that \c data1 has in the SimdArray
/ SimdMaskArray. This is essentially equal to the number of
elements in \c data0.
*/
template <std::size_t secondOffset> class Split
{
// split composite SimdArray
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto loImpl(const SimdArray<U, N, V, M> &x)
-> decltype(internal_data0(x))
{
return internal_data0(x);
}
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto hiImpl(const SimdArray<U, N, V, M> &x)
-> decltype(internal_data1(x))
{
return internal_data1(x);
}
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto loImpl(SimdArray<U, N, V, M> *x)
-> decltype(&internal_data0(*x))
{
return &internal_data0(*x);
}
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto hiImpl(SimdArray<U, N, V, M> *x)
-> decltype(&internal_data1(*x))
{
return &internal_data1(*x);
}
// split atomic SimdArray
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V, 2, 0> loImpl(const SimdArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(const SimdArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V *, 2, 0> loImpl(SimdArray<U, N, V, N> *x)
{
return {&internal_data(*x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V *, 2, 1> hiImpl(SimdArray<U, N, V, N> *x)
{
return {&internal_data(*x)};
}
// split composite SimdMaskArray
template <typename U, std::size_t N, typename V, std::size_t M>
static Vc_INTRINSIC auto loImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data0(x))
{
return internal_data0(x);
}
template <typename U, std::size_t N, typename V, std::size_t M>
static Vc_INTRINSIC auto hiImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data1(x))
{
return internal_data1(x);
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 0> loImpl(
const SimdMaskArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 1> hiImpl(
const SimdMaskArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
// split Vector<T> and Mask<T>
#ifdef Vc_IMPL_AVX
template <class T>
static Vc_INTRINSIC SSE::Vector<T> loImpl(Vector<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Vector<T>, 0>(x);
}
template <class T>
static Vc_INTRINSIC SSE::Vector<T> hiImpl(Vector<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Vector<T>, 1>(x);
}
template <class T>
static Vc_INTRINSIC SSE::Mask<T> loImpl(Mask<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Mask<T>, 0>(x);
}
template <class T>
static Vc_INTRINSIC SSE::Mask<T> hiImpl(Mask<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Mask<T>, 1>(x);
}
#endif // Vc_IMPL_AVX
template <typename T>
static constexpr bool is_vector_or_mask(){
return (Traits::is_simd_vector<T>::value && !Traits::isSimdArray<T>::value) ||
(Traits::is_simd_mask<T>::value && !Traits::isSimdMaskArray<T>::value);
}
template <typename V>
static Vc_INTRINSIC Segment<V, 2, 0> loImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
{
return {std::forward<V>(x)};
}
template <typename V>
static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
{
return {std::forward<V>(x)};
}
// split std::vector<T>
template <class T, class A>
static Vc_INTRINSIC const T *loImpl(const std::vector<T, A> &x)
{
return x.data();
}
template <class T, class A>
static Vc_INTRINSIC const T *hiImpl(const std::vector<T, A> &x)
{
return x.data() + secondOffset;
}
// generically split Segments
template <typename V, std::size_t Pieces, std::size_t Index>
static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index> loImpl(
const Segment<V, Pieces, Index> &x)
{
return {x.data};
}
template <typename V, std::size_t Pieces, std::size_t Index>
static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index + 1> hiImpl(
const Segment<V, Pieces, Index> &x)
{
return {x.data};
}
/** \internal
* \name Checks for existence of \c loImpl / \c hiImpl
*/
//@{
template <typename T, typename = decltype(loImpl(std::declval<T>()))>
static std::true_type have_lo_impl(int);
template <typename T> static std::false_type have_lo_impl(float);
template <typename T> static constexpr bool have_lo_impl()
{
return decltype(have_lo_impl<T>(1))::value;
}
template <typename T, typename = decltype(hiImpl(std::declval<T>()))>
static std::true_type have_hi_impl(int);
template <typename T> static std::false_type have_hi_impl(float);
template <typename T> static constexpr bool have_hi_impl()
{
return decltype(have_hi_impl<T>(1))::value;
}
//@}
public:
/** \internal
* \name with Operations tag
*
* These functions don't overload on the data parameter. The first parameter (the tag) clearly
* identifies the intended function.
*/
//@{
template <typename U>
static Vc_INTRINSIC const U *lo(Operations::gather, const U *ptr)
{
return ptr;
}
template <typename U>
static Vc_INTRINSIC const U *hi(Operations::gather, const U *ptr)
{
return ptr + secondOffset;
}
template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>()))
lo(Operations::gather, U &&x)
{
return loImpl(std::forward<U>(x));
}
template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>()))
hi(Operations::gather, U &&x)
{
return hiImpl(std::forward<U>(x));
}
template <typename U>
static Vc_INTRINSIC const U *lo(Operations::scatter, const U *ptr)
{
return ptr;
}
template <typename U>
static Vc_INTRINSIC const U *hi(Operations::scatter, const U *ptr)
{
return ptr + secondOffset;
}
//@}
/** \internal
\name without Operations tag
These functions are not clearly tagged as to where they are used and therefore
behave differently depending on the type of the parameter. Different behavior is
implemented via overloads of \c loImpl and \c hiImpl. They are not overloads of \c
lo and \c hi directly because it's hard to compete against a universal reference
(i.e. an overload for `int` requires overloads for `int &`, `const int &`, and `int
&&`. If one of them were missing `U &&` would win in overload resolution).
*/
//@{
template <typename U>
static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>())) lo(U &&x)
{
return loImpl(std::forward<U>(x));
}
template <typename U>
static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>())) hi(U &&x)
{
return hiImpl(std::forward<U>(x));
}
template <typename U>
static Vc_ALWAYS_INLINE enable_if<!have_lo_impl<U>(), U> lo(U &&x)
{
return std::forward<U>(x);
}
template <typename U>
static Vc_ALWAYS_INLINE enable_if<!have_hi_impl<U>(), U> hi(U &&x)
{
return std::forward<U>(x);
}
//@}
};
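// Illustrative sketch (not part of the library): the generic SimdArray/SimdMaskArray
// code below uses Split roughly like this to divide an object into its two halves,
// e.g. an 8-wide mask stored as two 4-wide pieces:
//   using Split = Common::Split<4>;  // N0 == 4, i.e. the offset of the second half
//   data0 = Split::lo(rhs);          // lower half (elements 0..3)
//   data1 = Split::hi(rhs);          // upper half (elements 4..7)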
// actual_value {{{1
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC const V &actual_value(Op, const SimdArray<U, M, V, M> &x)
{
return internal_data(x);
}
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC V *actual_value(Op, SimdArray<U, M, V, M> *x)
{
return &internal_data(*x);
}
template <typename Op, typename T, size_t Pieces, size_t Index>
static Vc_INTRINSIC typename Segment<T, Pieces, Index>::fixed_size_type actual_value(
Op, Segment<T, Pieces, Index> &&seg)
{
return seg.to_fixed_size();
}
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC const typename V::Mask &actual_value(Op, const SimdMaskArray<U, M, V, M> &x)
{
return internal_data(x);
}
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC typename V::Mask *actual_value(Op, SimdMaskArray<U, M, V, M> *x)
{
return &internal_data(*x);
}
// unpackArgumentsAuto {{{1
/**\internal
* \name unpackArgumentsAuto
*
* Search for the right amount of SimdArray "unpacking" (via actual_value) to match the
* interface of the function to be called.
*
* The compiler can figure this out for us thanks to SFINAE. The approach is to have a
* number \c I that determines the indexes of the arguments to be transformed via
* actual_value. Each bit of \c I identifies an argument. unpackArgumentsAuto starts the
* recursion with `I = 0`, i.e. no actual_value transformations. If the overload calling
* \c op is unavailable due to a substitution failure \c I is incremented and the function
* recurses. Otherwise there are two unpackArgumentsAutoImpl functions in the overload
* set. The first argument (\c int / \c float) leads to a preference of the function
* calling \c op, thus ending the recursion.
*/
///@{
///\internal transforms \p arg via actual_value
template <typename Op, typename Arg>
Vc_INTRINSIC decltype(actual_value(std::declval<Op &>(), std::declval<Arg>()))
conditionalUnpack(std::true_type, Op op, Arg &&arg)
{
return actual_value(op, std::forward<Arg>(arg));
}
///\internal forwards \p arg to its return value
template <typename Op, typename Arg>
Vc_INTRINSIC Arg conditionalUnpack(std::false_type, Op, Arg &&arg)
{
return std::forward<Arg>(arg);
}
///\internal true-/false_type that selects whether the argument with index B should be unpacked
template <size_t A, size_t B>
struct selectorType : public std::integral_constant<bool, !((A & (size_t(1) << B)) != 0)> {
};
///\internal ends the recursion, transforms arguments, and calls \p op
template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
Vc_INTRINSIC decltype(std::declval<Op &>()(std::declval<R &>(),
conditionalUnpack(selectorType<I, Indexes>(),
std::declval<Op &>(),
std::declval<Args>())...))
unpackArgumentsAutoImpl(int, index_sequence<Indexes...>, Op op, R &&r, Args &&... args)
{
op(std::forward<R>(r),
conditionalUnpack(selectorType<I, Indexes>(), op, std::forward<Args>(args))...);
}
///\internal the current actual_value calls don't work: recurse to I + 1
template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
Vc_INTRINSIC enable_if<(I <= (size_t(1) << sizeof...(Args))), void> unpackArgumentsAutoImpl(
float, index_sequence<Indexes...> is, Op op, R &&r, Args &&... args)
{
// if R is nullptr_t then the return type cannot enforce that actually any unwrapping
// of the SimdArray types happens. Thus, you could get an endless loop of the
// SimdArray function overload calling itself, if the index goes up to (1 <<
// sizeof...(Args)) - 1 (which means no argument transformations via actual_value).
static_assert(
I < (1 << sizeof...(Args)) - (std::is_same<R, std::nullptr_t>::value ? 1 : 0),
"Vc or compiler bug. Please report. Failed to find a combination of "
"actual_value(arg) transformations that allows calling Op.");
unpackArgumentsAutoImpl<I + 1, Op, R, Args...>(int(), is, op, std::forward<R>(r),
std::forward<Args>(args)...);
}
#ifdef Vc_ICC
template <size_t, typename... Ts> struct IccWorkaround {
using type = void;
};
template <typename... Ts> struct IccWorkaround<2, Ts...> {
using type = typename std::remove_pointer<typename std::decay<
typename std::tuple_element<1, std::tuple<Ts...>>::type>::type>::type;
};
#endif
///\internal The interface to start the machinery.
template <typename Op, typename R, typename... Args>
Vc_INTRINSIC void unpackArgumentsAuto(Op op, R &&r, Args &&... args)
{
#ifdef Vc_ICC
// ugly hacky workaround for ICC:
// The compiler fails to do SFINAE right on recursion. We have to hit the right
// recursionStart number from the start.
const int recursionStart =
Traits::isSimdArray<
typename IccWorkaround<sizeof...(Args), Args...>::type>::value &&
(std::is_same<Op, Common::Operations::Forward_frexp>::value ||
std::is_same<Op, Common::Operations::Forward_ldexp>::value)
? 2
: 0;
#else
const int recursionStart = 0;
#endif
unpackArgumentsAutoImpl<recursionStart>(
int(), make_index_sequence<sizeof...(Args)>(), op, std::forward<R>(r),
std::forward<Args>(args)...);
}
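// Summary (illustrative, not normative): for a call like op(r, a, b) the recursion
// tries I = 0, 1, 2, ... in turn; each bit of I toggles whether the corresponding
// argument is passed through actual_value. The first I for which the resulting call to
// op is well-formed selects the int overload above and ends the recursion; until then
// the float overload increments I and recurses.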
///@}
//}}}1
///@}
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_SIMDARRAYHELPER_H_
// vim: foldmethod=marker

1956
Vc/common/simdize.h Normal file

File diff suppressed because it is too large

719
Vc/common/simdmaskarray.h Normal file

@ -0,0 +1,719 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMDMASKARRAY_H_
#define VC_COMMON_SIMDMASKARRAY_H_
#include <type_traits>
#include <array>
#include "simdarrayhelper.h"
#include "utility.h"
#include "maskbool.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/// \addtogroup SimdArray
/// @{
// atomic SimdMaskArray {{{1
/**\internal
* Specialization of `SimdMaskArray<T, N, VectorType, VectorSize>` for the case where `N
* == VectorSize`.
*
* This is specialized for implementation purposes: Since the general implementation uses
* two SimdMaskArray data members it recurses over different SimdMaskArray instantiations.
* The recursion is ended by this specialization, which has a single \p storage_type data
* member to which all functions are forwarded more or less directly.
*/
template <typename T, std::size_t N, typename VectorType_>
class SimdMaskArray<T, N, VectorType_, N>
{
public:
using VectorType = VectorType_;
using vector_type = VectorType;
using mask_type = typename vector_type::Mask;
using storage_type = mask_type;
friend storage_type &internal_data(SimdMaskArray &m) { return m.data; }
friend const storage_type &internal_data(const SimdMaskArray &m) { return m.data; }
static constexpr std::size_t size() { return N; }
static constexpr std::size_t Size = size();
static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment;
static_assert(Size == vector_type::Size, "size mismatch");
using vectorentry_type = typename mask_type::VectorEntryType;
using value_type = typename mask_type::EntryType;
using Mask = mask_type;
using VectorEntryType = vectorentry_type;
using EntryType = value_type;
using EntryReference = Vc::Detail::ElementReference<storage_type, SimdMaskArray>;
using reference = EntryReference;
using Vector = fixed_size_simd<T, N>;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
// zero init
SimdMaskArray() = default;
// default copy ctor/operator
SimdMaskArray(const SimdMaskArray &) = default;
SimdMaskArray(SimdMaskArray &&) = default;
SimdMaskArray &operator=(const SimdMaskArray &) = default;
SimdMaskArray &operator=(SimdMaskArray &&) = default;
// broadcasts
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one) : data(one) {}
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero) : data(zero) {}
Vc_INTRINSIC explicit SimdMaskArray(bool b) : data(b) {}
Vc_INTRINSIC static SimdMaskArray Zero() { return {private_init, storage_type::Zero()}; }
Vc_INTRINSIC static SimdMaskArray One() { return {private_init, storage_type::One()}; }
// conversion (casts); implemented in simd_cast_caller.tcc
template <class U, class V, class = enable_if<N == V::Size>>
Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
template <class U, class V, class = enable_if<(N > V::Size && N <= 2 * V::Size)>,
class = U>
Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
template <class U, class V, class = enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>,
class = U, class = U>
Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
// conversion from any Segment object (could be SimdMaskArray or Mask<T>)
template <typename M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC_L SimdMaskArray(
Common::Segment<M, Pieces, Index> &&x,
enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg) Vc_INTRINSIC_R;
// conversion from Mask<T>
template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
!Traits::isSimdMaskArray<M>::value &&
Traits::simd_vector_size<M>::value == Size)>>
Vc_INTRINSIC_L SimdMaskArray(M k) Vc_INTRINSIC_R;
// implicit conversion to Mask<U, AnyAbi> if Mask<U, AnyAbi>::size() == N
template <class U, class A,
class = enable_if<Vc::Mask<U, A>::Size == N &&
!detail::is_fixed_size_abi<A>::value>>
operator Vc::Mask<U, A>() const
{
return simd_cast<Vc::Mask<U, A>>(data);
}
operator fixed_size_simd_mask<T, N> &()
{
return static_cast<fixed_size_simd_mask<T, N> &>(*this);
}
operator const fixed_size_simd_mask<T, N> &() const
{
return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
}
// load/store (from/to bool arrays)
template <typename Flags = DefaultLoadTag>
Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
: data(mem, f)
{
}
Vc_INTRINSIC void load(const bool *mem) { data.load(mem); }
template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
{
data.load(mem, f);
}
Vc_INTRINSIC void store(bool *mem) const { data.store(mem); }
template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
{
data.store(mem, f);
}
// compares
Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &rhs) const
{
return data == rhs.data;
}
Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &rhs) const
{
return data != rhs.data;
}
// inversion
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
{
return {private_init, !data};
}
// binary operators
Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
{
data &= rhs.data;
return *this;
}
Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
{
data |= rhs.data;
return *this;
}
Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
{
data ^= rhs.data;
return *this;
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
const SimdMaskArray &rhs) const
{
return {private_init, data & rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
const SimdMaskArray &rhs) const
{
return {private_init, data | rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
const SimdMaskArray &rhs) const
{
return {private_init, data ^ rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
const SimdMaskArray &rhs) const
{
return {private_init, data && rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
const SimdMaskArray &rhs) const
{
return {private_init, data || rhs.data};
}
Vc_INTRINSIC Vc_PURE bool isFull() const { return data.isFull(); }
Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data.isNotEmpty(); }
Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data.isEmpty(); }
Vc_INTRINSIC Vc_PURE bool isMix() const { return data.isMix(); }
Vc_INTRINSIC Vc_PURE int shiftMask() const { return data.shiftMask(); }
Vc_INTRINSIC Vc_PURE int toInt() const { return data.toInt(); }
private:
friend reference;
static Vc_INTRINSIC value_type get(const storage_type &k, int i) noexcept
{
return k[i];
}
template <typename U>
static Vc_INTRINSIC void set(storage_type &k, int i, U &&v) noexcept(
noexcept(std::declval<storage_type &>()[0] = std::declval<U>()))
{
k[i] = std::forward<U>(v);
}
public:
/**
* \note the returned object models the concept of a reference and
* as such it can exist longer than the data it is referencing.
 * \note to avoid lifetime issues, we strongly advise not to store
* any reference objects.
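 *
 * A minimal sketch of the pitfall (illustrative; \c mask stands for any mask object):
 * \code
 * auto ref = mask[0];  // ref is a smart-reference object, not a plain bool
 * // if mask is destroyed here, any further use of ref is undefined behaviour
 * \endcode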
*/
Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
{
return {data, int(index)};
}
Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
{
return data[index];
}
Vc_INTRINSIC Vc_PURE int count() const { return data.count(); }
/**
* Returns the index of the first one in the mask.
*
* The return value is undefined if the mask is empty.
*/
Vc_INTRINSIC Vc_PURE int firstOne() const { return data.firstOne(); }
template <typename G>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
{
return {private_init, mask_type::generate(gen)};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
{
return {private_init, data.shifted(amount)};
}
/// \internal execute specified Operation
template <typename Op, typename... Args>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
{
fixed_size_simd_mask<T, N> r;
Common::unpackArgumentsAuto(op, r.data, std::forward<Args>(args)...);
return r;
}
/// \internal
Vc_INTRINSIC SimdMaskArray(private_init_t, mask_type &&x) : data(std::move(x)) {}
private:
// The alignas attribute attached to the class declaration above is ignored by ICC
// 17.0.0 (at least). So just move the alignas attribute down here where it works for
// all compilers.
alignas(static_cast<std::size_t>(
Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(VectorType_) /
VectorType_::size()>::value)) storage_type data;
};
template <typename T, std::size_t N, typename VectorType>
constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::Size;
template <typename T, std::size_t N, typename VectorType>
constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::MemoryAlignment;
// generic SimdMaskArray {{{1
/**
* Data-parallel mask type with user-defined number of boolean elements.
*
* \tparam T The value type of the corresponding SimdArray. Depending on the target
* platform this type determines a different bit representation to work most
 * efficiently with SimdArray types instantiated for \p T.
*
* \tparam N The number of boolean elements to store and process concurrently. You can
* choose an arbitrary number, though not every number is a good idea.
* Generally, a power of two value or the sum of two power of two values might
* work efficiently, though this depends a lot on the target system.
*
* \tparam V Don't change the default value unless you really know what you are doing.
* This type is set to the underlying native Vc::Vector type used in the
* implementation of the type.
* Having it as part of the type name guards against some cases of ODR
* violations (i.e. linking incompatible translation units / libraries).
*
* \tparam Wt Don't ever change the default value.
* This parameter is an unfortunate implementation detail shining through.
*
* \headerfile simdmaskarray.h <Vc/SimdArray>
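 *
 * A minimal usage sketch (illustrative; the full interface follows below):
 * \code
 * Vc::SimdMaskArray<float, 8> m(false);  // 8 boolean elements, all false
 * m[3] = true;
 * int n = m.count();                     // n == 1
 * bool any = m.isNotEmpty();             // any == true
 * \endcode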
*/
template <typename T, size_t N, typename V, size_t Wt>
class SimdMaskArray
{
static constexpr std::size_t N0 = Common::left_size<N>();
using Split = Common::Split<N0>;
public:
using storage_type0 = fixed_size_simd_mask<T, N0>;
using storage_type1 = fixed_size_simd_mask<T, N - N0>;
static_assert(storage_type0::size() == N0, "");
using vector_type = fixed_size_simd<T, N>;
friend storage_type0 &internal_data0(SimdMaskArray &m) { return m.data0; }
friend storage_type1 &internal_data1(SimdMaskArray &m) { return m.data1; }
friend const storage_type0 &internal_data0(const SimdMaskArray &m) { return m.data0; }
friend const storage_type1 &internal_data1(const SimdMaskArray &m) { return m.data1; }
using mask_type = SimdMaskArray;
///\copydoc Mask::size()
static constexpr std::size_t size() { return N; }
///\copydoc Mask::Size
static constexpr std::size_t Size = size();
///\copydoc Mask::MemoryAlignment
static constexpr std::size_t MemoryAlignment =
storage_type0::MemoryAlignment > storage_type1::MemoryAlignment
? storage_type0::MemoryAlignment
: storage_type1::MemoryAlignment;
static_assert(Size == vector_type::Size, "size mismatch");
///\internal
using vectorentry_type = typename storage_type0::VectorEntryType;
///\copydoc Mask::value_type
using value_type = typename storage_type0::EntryType;
///\copydoc Mask::Mask
using MaskType = mask_type;
///\copydoc Mask::VectorEntryType
using VectorEntryType = vectorentry_type;
///\copydoc Mask::EntryType
using EntryType = value_type;
///\copydoc Mask::EntryReference
using EntryReference = Vc::Detail::ElementReference<SimdMaskArray>;
using reference = EntryReference;
/// An alias for the corresponding SimdArray type.
using Vector = fixed_size_simd<T, N>;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
// zero init
///\copydoc Mask::Mask()
SimdMaskArray() = default;
// default copy ctor/operator
SimdMaskArray(const SimdMaskArray &) = default;
SimdMaskArray(SimdMaskArray &&) = default;
SimdMaskArray &operator=(const SimdMaskArray &) = default;
SimdMaskArray &operator=(SimdMaskArray &&) = default;
// implicit conversion from SimdMaskArray with same N
template <typename U, typename W>
Vc_INTRINSIC SimdMaskArray(const SimdMaskArray<U, N, W> &rhs)
: data0(Split::lo(rhs)), data1(Split::hi(rhs))
{
}
// conversion from any Segment object (could be SimdMaskArray or Mask<T>)
template <typename M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC SimdMaskArray(
Common::Segment<M, Pieces, Index> &&rhs,
enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg)
: data0(Split::lo(rhs)), data1(Split::hi(rhs))
{
}
// conversion from Mask<T>
template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
!Traits::isSimdMaskArray<M>::value &&
Traits::simd_vector_size<M>::value == Size)>>
Vc_INTRINSIC SimdMaskArray(M k) : data0(Split::lo(k)), data1(Split::hi(k))
{
}
// implicit conversion to Mask<U, AnyAbi> if Mask<U, AnyAbi>::size() == N
template <class U, class A,
class = enable_if<Vc::Mask<U, A>::Size == N &&
!detail::is_fixed_size_abi<A>::value>>
operator Vc::Mask<U, A>() const
{
return simd_cast<Vc::Mask<U, A>>(data0, data1);
}
Vc_INTRINSIC operator fixed_size_simd_mask<T, N> &()
{
return static_cast<fixed_size_simd_mask<T, N> &>(*this);
}
Vc_INTRINSIC operator const fixed_size_simd_mask<T, N> &() const
{
return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
}
///\copybrief Mask::Mask(VectorSpecialInitializerOne)
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one)
: data0(one), data1(one)
{
}
///\copybrief Mask::Mask(VectorSpecialInitializerZero)
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero)
: data0(zero), data1(zero)
{
}
///\copydoc Mask::Mask(bool)
Vc_INTRINSIC explicit SimdMaskArray(bool b) : data0(b), data1(b) {}
///\copydoc Mask::Zero()
Vc_INTRINSIC static fixed_size_simd_mask<T, N> Zero()
{
return {storage_type0::Zero(), storage_type1::Zero()};
}
///\copydoc Mask::One()
Vc_INTRINSIC static fixed_size_simd_mask<T, N> One()
{
return {storage_type0::One(), storage_type1::One()};
}
///\name Loads & Stores
///@{
/**
* Load N boolean values from the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
* \param f A combination of flags to modify specific behavior of the load.
*/
template <typename Flags = DefaultLoadTag>
Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
: data0(mem, f), data1(mem + storage_type0::size(), f)
{
}
/**
* Load N boolean values from the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
*/
Vc_INTRINSIC void load(const bool *mem)
{
data0.load(mem);
data1.load(mem + storage_type0::size());
}
/**
* Load N boolean values from the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
* \param f A combination of flags to modify specific behavior of the load.
*/
template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
{
data0.load(mem, f);
data1.load(mem + storage_type0::size(), f);
}
/**
* Store N boolean values to the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
*/
Vc_INTRINSIC void store(bool *mem) const
{
data0.store(mem);
data1.store(mem + storage_type0::size());
}
/**
* Store N boolean values to the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
* \param f A combination of flags to modify specific behavior of the load.
*/
template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
{
data0.store(mem, f);
data1.store(mem + storage_type0::size(), f);
}
///@}
///\copydoc Mask::operator==
Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &mask) const
{
return data0 == mask.data0 && data1 == mask.data1;
}
///\copydoc Mask::operator!=
Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &mask) const
{
return data0 != mask.data0 || data1 != mask.data1;
}
///\copybrief Mask::operator!
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
{
return {!data0, !data1};
}
///\copybrief Mask::operator&=
Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
{
data0 &= rhs.data0;
data1 &= rhs.data1;
return *this;
}
///\copybrief Mask::operator|=
Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
{
data0 |= rhs.data0;
data1 |= rhs.data1;
return *this;
}
///\copybrief Mask::operator^=
Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
{
data0 ^= rhs.data0;
data1 ^= rhs.data1;
return *this;
}
///\copybrief Mask::operator&
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
const SimdMaskArray &rhs) const
{
return {data0 & rhs.data0, data1 & rhs.data1};
}
///\copybrief Mask::operator|
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
const SimdMaskArray &rhs) const
{
return {data0 | rhs.data0, data1 | rhs.data1};
}
///\copybrief Mask::operator^
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
const SimdMaskArray &rhs) const
{
return {data0 ^ rhs.data0, data1 ^ rhs.data1};
}
///\copybrief Mask::operator&&
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
const SimdMaskArray &rhs) const
{
return {data0 && rhs.data0, data1 && rhs.data1};
}
///\copybrief Mask::operator||
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
const SimdMaskArray &rhs) const
{
return {data0 || rhs.data0, data1 || rhs.data1};
}
///\copybrief Mask::isFull
Vc_INTRINSIC Vc_PURE bool isFull() const { return data0.isFull() && data1.isFull(); }
///\copybrief Mask::isNotEmpty
Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data0.isNotEmpty() || data1.isNotEmpty(); }
///\copybrief Mask::isEmpty
Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data0.isEmpty() && data1.isEmpty(); }
///\copybrief Mask::isMix
Vc_INTRINSIC Vc_PURE bool isMix() const { return !isFull() && !isEmpty(); }
///\copydoc Mask::toInt
Vc_INTRINSIC Vc_PURE int toInt() const
{
return data0.toInt() | (data1.toInt() << data0.size());
}
private:
friend reference;
static Vc_INTRINSIC value_type get(const SimdMaskArray &o, int i) noexcept
{
if (i < int(o.data0.size())) {
return o.data0[i];
} else {
return o.data1[i - o.data0.size()];
}
}
template <typename U>
static Vc_INTRINSIC void set(SimdMaskArray &o, int i, U &&v) noexcept(
noexcept(std::declval<storage_type0 &>()[0] = std::declval<U>()) &&
noexcept(std::declval<storage_type1 &>()[0] = std::declval<U>()))
{
if (i < int(o.data0.size())) {
o.data0[i] = std::forward<U>(v);
} else {
o.data1[i - o.data0.size()] = std::forward<U>(v);
}
}
public:
/**
* Return a smart reference to the boolean element at index \p index.
*
* \param index The element index to be accessed.
*
* \returns A temporary smart reference object which acts as much as an lvalue
* reference as possible.
*/
Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
{
return {*this, int(index)};
}
/**
* Return a copy of the boolean element at index \p index.
*
* \param index The element index to be accessed.
*
* \returns A temporary boolean object with the value of the element at index \p
* index.
*/
Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
{
return get(*this, index);
}
///\copybrief Mask::count
Vc_INTRINSIC Vc_PURE int count() const { return data0.count() + data1.count(); }
///\copydoc Mask::firstOne
Vc_INTRINSIC Vc_PURE int firstOne() const {
if (data0.isEmpty()) {
return data1.firstOne() + storage_type0::size();
}
return data0.firstOne();
}
///\copybrief Mask::generate
template <typename G>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
{
return {storage_type0::generate(gen),
storage_type1::generate([&](std::size_t i) { return gen(i + N0); })};
}
///\copybrief Mask::shifted
inline Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
{
if (Vc_IS_UNLIKELY(amount == 0)) {
return *this;
}
return generate([&](unsigned i) {
// modulo arithmetic of unsigned makes the check for j >= 0 unnecessary
const unsigned j = i + amount;
return j < size() ? get(*this, j) : false;
});
}
/// \internal execute specified Operation
template <typename Op, typename... Args>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
{
fixed_size_simd_mask<T, N> r = {
storage_type0::fromOperation(op, Split::lo(args)...), // no forward here - it
// could move and thus
// break the next line
storage_type1::fromOperation(op, Split::hi(std::forward<Args>(args))...)};
return r;
}
/// \internal
Vc_INTRINSIC SimdMaskArray(storage_type0 &&x, storage_type1 &&y)
: data0(std::move(x)), data1(std::move(y))
{
}
private:
// The alignas attribute attached to the class declaration above is ignored by ICC
// 17.0.0 (at least). So just move the alignas attribute down here where it works for
// all compilers.
alignas(static_cast<std::size_t>(
Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(V) /
V::size()>::value)) storage_type0 data0;
storage_type1 data1;
};
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdMaskArray<T, N, V, M>::Size;
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdMaskArray<T, N, V, M>::MemoryAlignment;
///}}}1
/// @}
} // namespace Vc
// XXX: this include should be in <Vc/vector.h>. But at least clang 3.4 then fails to compile the
// code. Not sure yet what is going on, but it looks a lot like a bug in clang.
#include "simd_cast_caller.tcc"
#endif // VC_COMMON_SIMDMASKARRAY_H_
// vim: foldmethod=marker

653
Vc/common/span.h Normal file

@ -0,0 +1,653 @@
// -*- C++ -*-
//===------------------------------ span ---------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
// Adapted for use with Vc:
// Copyright © 2018 Matthias Kretz <kretz@kde.org>
//===---------------------------------------------------------------------===//
#ifndef VC_COMMON_SPAN_H_
#define VC_COMMON_SPAN_H_
#include <array> // for array
#include <cstddef> // for ptrdiff_t
#include <cstddef> // for std::byte
#include <iterator> // for iterators
#include <type_traits> // for remove_cv, etc
#include "subscript.h" // for AdaptSubscriptOperator
namespace Vc_VERSIONED_NAMESPACE
{
#ifdef __cpp_inline_variables
inline
#endif
constexpr ptrdiff_t dynamic_extent = -1;
namespace Common
{
template <typename T, ptrdiff_t Extent = dynamic_extent> class span;
template <typename T, ptrdiff_t Extent>
constexpr auto begin(const span<T, Extent>& s) noexcept -> decltype(s.begin())
{
return s.begin();
}
template <typename T, ptrdiff_t Extent>
constexpr auto end(const span<T, Extent>& s) noexcept -> decltype(s.end())
{
return s.end();
}
template <class T> struct _is_span_impl : public std::false_type {
};
template <class T, ptrdiff_t Extent>
struct _is_span_impl<span<T, Extent>> : public std::true_type {
};
template <class T>
struct _is_span : public _is_span_impl<typename std::remove_cv<T>::type> {
};
template <class T> struct _is_std_array_impl : public std::false_type {
};
template <class T, size_t Sz>
struct _is_std_array_impl<array<T, Sz>> : public std::true_type {
};
template <class T>
struct _is_std_array : public _is_std_array_impl<typename std::remove_cv<T>::type> {
};
template <class T, class ElementType, class = void>
struct _is_span_compatible_container : public std::false_type {
};
template <class... Ts> using _void_t = void;
template <class C> constexpr auto _std_data(C& c) -> decltype(c.data())
{
return c.data();
}
template <class C> constexpr auto _std_data(const C& c) -> decltype(c.data())
{
return c.data();
}
template <class T, std::size_t N> constexpr T* _std_data(T (&array)[N]) noexcept
{
return array;
}
template <class E> constexpr const E* _std_data(std::initializer_list<E> il) noexcept
{
return il.begin();
}
template <class C> constexpr auto _std_size(const C& c) -> decltype(c.size())
{
return c.size();
}
template <class T, std::size_t N>
constexpr std::size_t _std_size(const T (&array)[N]) noexcept
{
return N;
}
template <class T, class ElementType>
struct _is_span_compatible_container<
T, ElementType,
_void_t<
// is not a specialization of span
typename std::enable_if<!_is_span<T>::value, std::nullptr_t>::type,
// is not a specialization of array
typename std::enable_if<!_is_std_array<T>::value, std::nullptr_t>::type,
// is_array_v<Container> is false,
typename std::enable_if<!std::is_array<T>::value, std::nullptr_t>::type,
// data(cont) and size(cont) are well formed
decltype(data(std::declval<T>())), decltype(size(std::declval<T>())),
// remove_pointer_t<decltype(data(cont))>(*)[] is convertible to ElementType(*)[]
typename std::enable_if<
std::is_convertible<typename std::remove_pointer<decltype(
data(std::declval<T&>()))>::type (*)[],
ElementType (*)[]>::value,
std::nullptr_t>::type>> : public std::true_type {
};
#if defined Vc_MSVC || (defined Vc_GCC && Vc_GCC < 0x50100) || defined Vc_ICC || !defined __cpp_constexpr || __cpp_constexpr < 201304
#define Vc_CONSTEXPR
#else
#define Vc_CONSTEXPR constexpr
#endif
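// Vc_CONSTEXPR expands to `constexpr` only where C++14 relaxed constexpr is available
// (see the condition above); on the remaining compilers it expands to nothing and the
// functions are plain inline member functions.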
template <typename T, ptrdiff_t Extent> class span
{
public:
// constants and types
using element_type = T;
using value_type = typename std::remove_cv<T>::type;
using index_type = ptrdiff_t;
using difference_type = ptrdiff_t;
using pointer = T*;
using const_pointer = const T*; // not in standard
using reference = T&;
using const_reference = const T&; // not in standard
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
static constexpr index_type extent = Extent;
static_assert(Extent >= 0, "Can't have a span with an extent < 0");
// [span.cons], span constructors, copy, assignment, and destructor
Vc_CONSTEXPR span() noexcept : data_{nullptr}
{
static_assert(Extent == 0,
"Can't default construct a statically sized span with size > 0");
}
Vc_CONSTEXPR span(const span&) noexcept = default;
Vc_CONSTEXPR span& operator=(const span&) noexcept = default;
Vc_CONSTEXPR span(pointer _ptr, index_type _count) : data_{_ptr}
{
(void)_count;
Vc_ASSERT(((void)"size mismatch in span's constructor (ptr, len)", Extent == _count));
}
Vc_CONSTEXPR span(pointer _f, pointer _l) : data_{_f}
{
(void)_l;
Vc_ASSERT(((void)"size mismatch in span's constructor (ptr, ptr)",
Extent == distance(_f, _l)));
}
Vc_CONSTEXPR span(element_type (&_arr)[Extent]) noexcept : data_{_arr} {}
Vc_CONSTEXPR span(array<value_type, Extent>& _arr) noexcept : data_{_arr.data()} {}
Vc_CONSTEXPR span(const array<value_type, Extent>& _arr) noexcept : data_{_arr.data()} {}
template <class Container>
inline Vc_CONSTEXPR span(
Container& _c,
typename std::enable_if<_is_span_compatible_container<Container, T>::value,
std::nullptr_t>::type = nullptr)
: data_{_std_data(_c)}
{
Vc_ASSERT(("size mismatch in span's constructor (container)",
Extent == _std_size(_c)));
}
template <class Container>
inline Vc_CONSTEXPR span(
const Container& _c,
typename std::enable_if<_is_span_compatible_container<const Container, T>::value,
std::nullptr_t>::type = nullptr)
: data_{_std_data(_c)}
{
Vc_ASSERT(("size mismatch in span's constructor (const container)",
Extent == _std_size(_c)));
}
template <class OtherElementType>
inline Vc_CONSTEXPR span(
const span<OtherElementType, Extent>& _other,
typename std::enable_if<
std::is_convertible<OtherElementType (*)[], element_type (*)[]>::value,
std::nullptr_t>::type = nullptr)
: data_{_other.data()}
{
}
template <class OtherElementType>
inline Vc_CONSTEXPR span(
const span<OtherElementType, dynamic_extent>& _other,
typename std::enable_if<
std::is_convertible<OtherElementType (*)[], element_type (*)[]>::value,
std::nullptr_t>::type = nullptr) noexcept
: data_{_other.data()}
{
Vc_ASSERT(("size mismatch in span's constructor (other span)",
Extent == _other.size()));
}
// ~span() noexcept = default;
template <ptrdiff_t Count>
inline Vc_CONSTEXPR span<element_type, Count> first() const noexcept
{
static_assert(Count >= 0, "Count must be >= 0 in span::first()");
static_assert(Count <= Extent, "Count out of range in span::first()");
return {data(), Count};
}
template <ptrdiff_t Count>
inline Vc_CONSTEXPR span<element_type, Count> last() const noexcept
{
static_assert(Count >= 0, "Count must be >= 0 in span::last()");
static_assert(Count <= Extent, "Count out of range in span::last()");
return {data() + size() - Count, Count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> first(index_type _count) const noexcept
{
Vc_ASSERT(("Count out of range in span::first(count)",
_count >= 0 && _count <= size()));
return {data(), _count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> last(index_type _count) const noexcept
{
Vc_ASSERT(
("Count out of range in span::last(count)", _count >= 0 && _count <= size()));
return {data() + size() - _count, _count};
}
#ifndef Vc_MSVC
// MSVC 190024215 fails with "error C2059: syntax error: '<end Parse>'" somewhere in
// this file. Unless someone needs this function on MSVC, I don't see a reason to
// invest time into working around their bugs.
template <ptrdiff_t Offset, ptrdiff_t Count = dynamic_extent>
inline Vc_CONSTEXPR auto subspan() const noexcept
-> span<element_type, Count != dynamic_extent ? Count : Extent - Offset>
{
Vc_ASSERT(
("Offset out of range in span::subspan()", Offset >= 0 && Offset <= size()));
return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count};
}
inline Vc_CONSTEXPR span<element_type, dynamic_extent> subspan(
index_type offset, index_type count = dynamic_extent) const noexcept
{
Vc_ASSERT(("Offset out of range in span::subspan(offset, count)",
offset >= 0 && offset <= size()));
Vc_ASSERT(("Count out of range in span::subspan(offset, count)",
(count >= 0 && count <= size()) || count == dynamic_extent));
if (count == dynamic_extent) {
return {data() + offset, size() - offset};
}
Vc_ASSERT(("count + offset out of range in span::subspan(offset, count)",
offset + count <= size()));
return {data() + offset, count};
}
#endif // Vc_MSVC
Vc_CONSTEXPR index_type size() const noexcept { return Extent; }
Vc_CONSTEXPR index_type size_bytes() const noexcept
{
return Extent * sizeof(element_type);
}
Vc_CONSTEXPR bool empty() const noexcept { return Extent == 0; }
Vc_CONSTEXPR reference operator[](index_type _idx) const noexcept
{
Vc_ASSERT(("span<T,N>[] index out of bounds", _idx >= 0 && _idx < size()));
return data_[_idx];
}
Vc_CONSTEXPR reference operator()(index_type _idx) const noexcept
{
Vc_ASSERT(("span<T,N>() index out of bounds", _idx >= 0 && _idx < size()));
return data_[_idx];
}
Vc_CONSTEXPR pointer data() const noexcept { return data_; }
// [span.iter], span iterator support
Vc_CONSTEXPR iterator begin() const noexcept { return iterator(data()); }
Vc_CONSTEXPR iterator end() const noexcept { return iterator(data() + size()); }
Vc_CONSTEXPR const_iterator cbegin() const noexcept { return const_iterator(data()); }
Vc_CONSTEXPR const_iterator cend() const noexcept
{
return const_iterator(data() + size());
}
Vc_CONSTEXPR reverse_iterator rbegin() const noexcept { return reverse_iterator(end()); }
Vc_CONSTEXPR reverse_iterator rend() const noexcept { return reverse_iterator(begin()); }
Vc_CONSTEXPR const_reverse_iterator crbegin() const noexcept
{
return const_reverse_iterator(cend());
}
Vc_CONSTEXPR const_reverse_iterator crend() const noexcept
{
return const_reverse_iterator(cbegin());
}
Vc_CONSTEXPR void swap(span& _other) noexcept
{
pointer _p = data_;
data_ = _other.data_;
_other.data_ = _p;
}
#ifdef __cpp_lib_byte
span<const std::byte, Extent * sizeof(element_type)> _as_bytes() const noexcept
{
return {reinterpret_cast<const std::byte*>(data()), size_bytes()};
}
span<std::byte, Extent * sizeof(element_type)> _as_writeable_bytes() const noexcept
{
return {reinterpret_cast<std::byte*>(data()), size_bytes()};
}
#endif // __cpp_lib_byte
private:
pointer data_;
};
template <typename T> class span<T, dynamic_extent>
{
private:
public:
// constants and types
using element_type = T;
using value_type = typename std::remove_cv<T>::type;
using index_type = ptrdiff_t;
using difference_type = ptrdiff_t;
using pointer = T*;
using const_pointer = const T*; // not in standard
using reference = T&;
using const_reference = const T&; // not in standard
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
static constexpr index_type extent = dynamic_extent;
// [span.cons], span constructors, copy, assignment, and destructor
Vc_CONSTEXPR span() noexcept : data_{nullptr}, size_{0} {}
Vc_CONSTEXPR span(const span&) noexcept = default;
Vc_CONSTEXPR span& operator=(const span&) noexcept = default;
Vc_CONSTEXPR span(pointer _ptr, index_type _count) : data_{_ptr}, size_{_count} {}
Vc_CONSTEXPR span(pointer _f, pointer _l) : data_{_f}, size_{distance(_f, _l)} {}
template <size_t Sz>
inline Vc_CONSTEXPR span(element_type (&_arr)[Sz]) noexcept : data_{_arr}, size_{Sz}
{
}
template <size_t Sz>
inline Vc_CONSTEXPR span(array<value_type, Sz>& _arr) noexcept
: data_{_arr.data()}, size_{Sz}
{
}
template <size_t Sz>
inline Vc_CONSTEXPR span(const array<value_type, Sz>& _arr) noexcept
: data_{_arr.data()}, size_{Sz}
{
}
template <class Container>
inline Vc_CONSTEXPR span(
Container& _c,
typename std::enable_if<_is_span_compatible_container<Container, T>::value,
std::nullptr_t>::type = nullptr)
: data_{_std_data(_c)}, size_{index_type(_std_size(_c))}
{
}
template <class Container>
inline Vc_CONSTEXPR span(
const Container& _c,
typename std::enable_if<_is_span_compatible_container<const Container, T>::value,
std::nullptr_t>::type = nullptr)
: data_{_std_data(_c)}, size_{index_type(_std_size(_c))}
{
}
template <class OtherElementType, ptrdiff_t OtherExtent>
inline Vc_CONSTEXPR span(
const span<OtherElementType, OtherExtent>& _other,
typename std::enable_if<
std::is_convertible<OtherElementType (*)[], element_type (*)[]>::value,
std::nullptr_t>::type = nullptr) noexcept
: data_{_other.data()}, size_{_other.size()}
{
}
// ~span() noexcept = default;
template <ptrdiff_t Count>
inline Vc_CONSTEXPR span<element_type, Count> first() const noexcept
{
static_assert(Count >= 0, "");
Vc_ASSERT(("Count out of range in span::first()", Count <= size()));
return {data(), Count};
}
template <ptrdiff_t Count>
inline Vc_CONSTEXPR span<element_type, Count> last() const noexcept
{
static_assert(Count >= 0, "");
Vc_ASSERT(("Count out of range in span::last()", Count <= size()));
return {data() + size() - Count, Count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> first(index_type _count) const noexcept
{
Vc_ASSERT(("Count out of range in span::first(count)",
_count >= 0 && _count <= size()));
return {data(), _count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> last(index_type _count) const noexcept
{
Vc_ASSERT(
("Count out of range in span::last(count)", _count >= 0 && _count <= size()));
return {data() + size() - _count, _count};
}
template <ptrdiff_t Offset, ptrdiff_t Count = dynamic_extent>
inline Vc_CONSTEXPR span<T, dynamic_extent> subspan() const noexcept
{
Vc_ASSERT(
("Offset out of range in span::subspan()", Offset >= 0 && Offset <= size()));
Vc_ASSERT(("Count out of range in span::subspan()",
Count == dynamic_extent || Offset + Count <= size()));
return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> inline subspan(
index_type _offset, index_type _count = dynamic_extent) const noexcept
{
Vc_ASSERT(("Offset out of range in span::subspan(offset, count)",
_offset >= 0 && _offset <= size()));
Vc_ASSERT(("count out of range in span::subspan(offset, count)",
(_count >= 0 && _count <= size()) || _count == dynamic_extent));
if (_count == dynamic_extent)
return {data() + _offset, size() - _offset};
Vc_ASSERT(("Offset + count out of range in span::subspan(offset, count)",
_offset + _count <= size()));
return {data() + _offset, _count};
}
Vc_CONSTEXPR index_type size() const noexcept { return size_; }
Vc_CONSTEXPR index_type size_bytes() const noexcept
{
return size_ * sizeof(element_type);
}
Vc_CONSTEXPR bool empty() const noexcept { return size_ == 0; }
Vc_CONSTEXPR reference operator[](index_type _idx) const noexcept
{
Vc_ASSERT(("span<T>[] index out of bounds", _idx >= 0 && _idx < size()));
return data_[_idx];
}
Vc_CONSTEXPR reference operator()(index_type _idx) const noexcept
{
Vc_ASSERT(("span<T>() index out of bounds", _idx >= 0 && _idx < size()));
return data_[_idx];
}
Vc_CONSTEXPR pointer data() const noexcept { return data_; }
// [span.iter], span iterator support
Vc_CONSTEXPR iterator begin() const noexcept { return iterator(data()); }
Vc_CONSTEXPR iterator end() const noexcept { return iterator(data() + size()); }
Vc_CONSTEXPR const_iterator cbegin() const noexcept { return const_iterator(data()); }
Vc_CONSTEXPR const_iterator cend() const noexcept
{
return const_iterator(data() + size());
}
Vc_CONSTEXPR reverse_iterator rbegin() const noexcept { return reverse_iterator(end()); }
Vc_CONSTEXPR reverse_iterator rend() const noexcept { return reverse_iterator(begin()); }
Vc_CONSTEXPR const_reverse_iterator crbegin() const noexcept
{
return const_reverse_iterator(cend());
}
Vc_CONSTEXPR const_reverse_iterator crend() const noexcept
{
return const_reverse_iterator(cbegin());
}
Vc_CONSTEXPR void swap(span& _other) noexcept
{
pointer _p = data_;
data_ = _other.data_;
_other.data_ = _p;
index_type _sz = size_;
size_ = _other.size_;
_other.size_ = _sz;
}
#ifdef __cpp_lib_byte
// Disable _as_bytes() for older MSVC versions: it leads to a compilation error due to a
// compiler bug. When parsing the return type, MSVC instantiates the primary template of
// span<>, which trips its static_assert().
#if _MSC_VER > 1928
span<const std::byte, dynamic_extent> _as_bytes() const noexcept
{
return {reinterpret_cast<const std::byte*>(data()), size_bytes()};
}
span<std::byte, dynamic_extent> _as_writeable_bytes() const noexcept
{
return {reinterpret_cast<std::byte*>(data()), size_bytes()};
}
#endif
#endif // __cpp_lib_byte
private:
pointer data_;
index_type size_;
};
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator==(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return equal(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator!=(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return !(rhs == lhs);
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator<(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator<=(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return !(rhs < lhs);
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator>(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return rhs < lhs;
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator>=(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return !(lhs < rhs);
}
// as_bytes & as_writeable_bytes
template <class T, ptrdiff_t Extent>
auto as_bytes(span<T, Extent> _s) noexcept -> decltype(_s._as_bytes())
{
return _s._as_bytes();
}
template <class T, ptrdiff_t Extent>
auto as_writeable_bytes(span<T, Extent> _s) noexcept ->
typename std::enable_if<!std::is_const<T>::value,
decltype(_s._as_writeable_bytes())>::type
{
return _s._as_writeable_bytes();
}
template <class T, ptrdiff_t Extent>
Vc_CONSTEXPR void swap(span<T, Extent>& lhs, span<T, Extent>& rhs) noexcept
{
lhs.swap(rhs);
}
#undef Vc_CONSTEXPR
// Deduction guides
#ifdef __cpp_deduction_guides
template <class T, size_t Sz> span(T (&)[Sz])->span<T, Sz>;
template <class T, size_t Sz> span(array<T, Sz>&)->span<T, Sz>;
template <class T, size_t Sz> span(const array<T, Sz>&)->span<const T, Sz>;
template <class Container> span(Container&)->span<typename Container::value_type>;
template <class Container>
span(const Container&)->span<const typename Container::value_type>;
#endif // __cpp_deduction_guides
} // namespace Common
/**
* \ingroup Containers
* \headerfile span.h <Vc/span>
*
* An adapted `std::span` with additional subscript operators supporting gather and scatter operations.
*
* The [std::span](https://en.cppreference.com/w/cpp/container/span) documentation applies.
*
* Example:
* \code
* struct Point {
* float x, y;
* };
* Point data[100];
* // initialize values in data
*
* Vc::span<Point, 100> view(data);
* float_v::IndexType indexes = ...; // values between 0-99
* float_v x = view[indexes][&Point::x];
* float_v y = view[indexes][&Point::y];
* \endcode
*/
template <typename T, ptrdiff_t Extent = dynamic_extent>
using span = Common::AdaptSubscriptOperator<Common::span<T, Extent>>;
namespace Traits
{
template <typename T, ptrdiff_t Extent>
struct has_contiguous_storage_impl<Vc::span<T, Extent>> : public std::true_type {
};
template <typename T, ptrdiff_t Extent>
struct has_contiguous_storage_impl<Vc::Common::span<T, Extent>> : public std::true_type {
};
} // namespace Traits
} // namespace Vc_VERSIONED_NAMESPACE
#endif // VC_COMMON_SPAN_H_

381
Vc/common/storage.h Normal file

@ -0,0 +1,381 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_STORAGE_H_
#define VC_COMMON_STORAGE_H_
#include "aliasingentryhelper.h"
#include "types.h"
#include "maskbool.h"
#ifdef Vc_IMPL_AVX
#include "../avx/intrinsics.h"
#endif
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename V> inline V zero();
} // namespace Detail
namespace Common
{
namespace Detail
{
#ifdef Vc_IMPL_AVX
template <typename ValueType, size_t Size> struct IntrinsicType {
using type = typename std::conditional<
std::is_integral<ValueType>::value,
typename std::conditional<sizeof(ValueType) * Size == 16, __m128i, __m256i>::type,
typename std::conditional<
std::is_same<ValueType, double>::value,
typename std::conditional<sizeof(ValueType) * Size == 16, __m128d,
__m256d>::type,
typename std::conditional<sizeof(ValueType) * Size == 16, __m128,
__m256>::type>::type>::type;
};
#elif defined Vc_IMPL_SSE
template <typename ValueType, size_t Size> struct IntrinsicType {
using type = typename std::conditional<
std::is_integral<ValueType>::value, __m128i,
typename std::conditional<std::is_same<ValueType, double>::value, __m128d,
__m128>::type>::type;
};
#else
template <typename ValueType, size_t Size> struct IntrinsicType {
static_assert(Size == 1,
"IntrinsicType without SIMD target support may only have Size = 1");
using type = ValueType;
};
#endif
template <typename ValueType, size_t Size, size_t Bytes = sizeof(ValueType) * Size>
struct BuiltinType;
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
#define Vc_VECBUILTIN __attribute__((__vector_size__(16)))
template <size_t Size> struct BuiltinType< double , Size, 16> { typedef double type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< float , Size, 16> { typedef float type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long long, Size, 16> { typedef long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long long, Size, 16> { typedef unsigned long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long , Size, 16> { typedef long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long , Size, 16> { typedef unsigned long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< int , Size, 16> { typedef int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned int , Size, 16> { typedef unsigned int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< short , Size, 16> { typedef short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned short , Size, 16> { typedef unsigned short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< char , Size, 16> { typedef char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned char , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< signed char , Size, 16> { typedef signed char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< bool , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
#undef Vc_VECBUILTIN
#define Vc_VECBUILTIN __attribute__((__vector_size__(32)))
template <size_t Size> struct BuiltinType< double , Size, 32> { typedef double type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< float , Size, 32> { typedef float type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long long, Size, 32> { typedef long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long long, Size, 32> { typedef unsigned long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long , Size, 32> { typedef long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long , Size, 32> { typedef unsigned long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< int , Size, 32> { typedef int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned int , Size, 32> { typedef unsigned int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< short , Size, 32> { typedef short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned short , Size, 32> { typedef unsigned short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< char , Size, 32> { typedef char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned char , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< signed char , Size, 32> { typedef signed char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< bool , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
#undef Vc_VECBUILTIN
#endif
} // namespace Detail
template <typename ValueType, size_t Size>
using IntrinsicType = typename Detail::IntrinsicType<ValueType, Size>::type;
template <typename ValueType, size_t Size>
using BuiltinType = typename Detail::BuiltinType<ValueType, Size>::type;
namespace AliasStrategy
{
struct Union {};
struct MayAlias {};
struct VectorBuiltin {};
struct UnionMembers {};
} // namespace AliasStrategy
using DefaultStrategy =
#if defined Vc_USE_BUILTIN_VECTOR_TYPES
AliasStrategy::VectorBuiltin;
#elif defined Vc_MSVC
AliasStrategy::UnionMembers;
#elif defined Vc_ICC
AliasStrategy::Union;
#elif defined __GNUC__
AliasStrategy::MayAlias;
#else
AliasStrategy::Union;
#endif
template <typename ValueType, size_t Size, typename Strategy = DefaultStrategy>
class Storage;
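// Illustrative sketch (not part of the library): regardless of the alias strategy, a
// Storage specialization exposes the same minimal interface, differing only in how
// m()/set() access individual elements of the register. Assuming an SSE build where
// VectorType is __m128:
//   Storage<float, 4> s(_mm_set1_ps(1.f));
//   float x = s.m(2);     // read element 2
//   s.set(2, 3.f);        // overwrite element 2
//   __m128 v = s.v();     // the underlying intrinsic register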
// GCC 6 forbids `EntryType m[]` altogether
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::Union>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
public:
using VectorType = IntrinsicType<ValueType, Size>;
using EntryType = ValueType;
union Alias {
Vc_INTRINSIC Alias(VectorType vv) : v(vv) {}
VectorType v;
EntryType m[Size];
};
Vc_INTRINSIC Storage() : data(Vc::Detail::zero<VectorType>()) {}
Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); }
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(reinterpret_cast<VectorType>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC operator const VectorType &() const { return data; }
Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return Alias(data).m[i]; }
Vc_INTRINSIC void set(size_t i, EntryType x)
{
Alias a(data);
a.m[i] = x;
data = a.v;
}
private:
VectorType data;
};
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::MayAlias>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
public:
using VectorType = IntrinsicType<ValueType, Size>;
using EntryType = ValueType;
Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
Vc_INTRINSIC Storage(const VectorType &x) : data(x)
{
assertCorrectAlignment(&data);
}
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(reinterpret_cast<const VectorType &>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage &operator=(const VectorType &x)
{
data = x;
return *this;
}
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC operator const VectorType &() const { return v(); }
Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const
{
return aliasing_cast<EntryType>(&data)[i];
}
Vc_INTRINSIC void set(size_t i, EntryType x)
{
aliasing_cast<EntryType>(&data)[i] = x;
}
private:
VectorType data;
};
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::VectorBuiltin>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
using Builtin = BuiltinType<ValueType, Size>;
public:
using VectorType =
#ifdef Vc_TEMPLATES_DROP_ATTRIBUTES
MayAlias<IntrinsicType<ValueType, Size>>;
#else
IntrinsicType<ValueType, Size>;
#endif
using EntryType = ValueType;
Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC Storage(const VectorType &x)
: data(aliasing_cast<Builtin>(x))
{
assertCorrectAlignment(&data);
}
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(aliasing_cast<Builtin>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage &operator=(const VectorType &x)
{
data = aliasing_cast<Builtin>(x);
return *this;
}
Vc_INTRINSIC operator const VectorType &() const { return v(); }
Vc_INTRINSIC Vc_PURE VectorType &v() { return reinterpret_cast<VectorType &>(data); }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return reinterpret_cast<const VectorType &>(data); }
Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return data[i]; }
Vc_INTRINSIC void set(size_t i, EntryType x) { data[i] = x; }
Vc_INTRINSIC Builtin &builtin() { return data; }
Vc_INTRINSIC const Builtin &builtin() const { return data; }
private:
Builtin data;
};
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::UnionMembers>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
public:
using VectorType = IntrinsicType<ValueType, Size>;
using EntryType = ValueType;
Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
Vc_INTRINSIC Storage(const VectorType &x) : data(x)
{
assertCorrectAlignment(&data);
}
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(reinterpret_cast<const VectorType &>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage &operator=(const VectorType &x)
{
data = x;
return *this;
}
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
Vc_INTRINSIC_L Vc_PURE_L EntryType m(size_t i) const Vc_INTRINSIC_R Vc_PURE_R;
Vc_INTRINSIC void set(size_t i, EntryType x) { ref(i) = x; }
private:
Vc_INTRINSIC_L Vc_PURE_L EntryType &ref(size_t i) Vc_INTRINSIC_R Vc_PURE_R;
VectorType data;
};
#ifdef Vc_MSVC
template <> Vc_INTRINSIC Vc_PURE double Storage< double, 2, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float Storage< float , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i8[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u8[i]; }
template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 2, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m128i_i8[i]); }
template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u8[i]; }
#ifdef Vc_IMPL_AVX
template <> Vc_INTRINSIC Vc_PURE double Storage< double, 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float Storage< float , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i8[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u8[i]; }
template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m256i_i8[i]); }
template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u8[i]; }
#endif
#endif // Vc_MSVC
template <typename VectorType, typename EntryType>
using VectorMemoryUnion = Storage<EntryType, sizeof(VectorType) / sizeof(EntryType)>;
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_STORAGE_H_
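The four alias strategies above differ only in how a single lane of the intrinsic register is read or written. The following is a minimal, self-contained sketch of the Union idea using raw SSE types instead of Vc's internal Storage class (it assumes an SSE-capable x86 compiler; union type punning is a compiler extension, which is exactly why Vc selects this strategy only for compilers known to support it):

#include <immintrin.h>
#include <cstdio>

union Sse4Floats {
    __m128 v;    // the whole SIMD register
    float m[4];  // element-wise view of the same bytes
};

int main()
{
    Sse4Floats s;
    s.v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);  // m[0]=1, m[1]=2, m[2]=3, m[3]=4
    s.m[2] = 42.f;                           // write one lane through the union
    std::printf("%g %g %g %g\n", s.m[0], s.m[1], s.m[2], s.m[3]);  // prints: 1 2 42 4
    return 0;
}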


@ -0,0 +1,92 @@
/* This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
///////////////////////////////////////////////////////////////////////////////////////////
// stores
/**
* Store the vector data to \p mem.
*
* \param mem A pointer to memory, where \VSize{T} consecutive values will be stored.
* \param flags The flags parameter can be used to select e.g. the Vc::Aligned,
* Vc::Unaligned, Vc::Streaming, and/or Vc::PrefetchDefault flags.
*/
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
/**
* Store the vector data to \p mem where \p mask is set.
*
* \param mem A pointer to memory, where \VSize{T} consecutive values will be stored.
* \param mask A mask object that determines which entries of the vector should be stored
* to \p mem.
* \param flags The flags parameter can be used to select e.g. the Vc::Aligned,
* Vc::Unaligned, Vc::Streaming, and/or Vc::PrefetchDefault flags.
*
* \note
* The masked store does not pack the values into memory. I.e. the value at offset \c i
* will be stored to `mem[i]`, independent of whether `mask[j]` for any `j < i` is \c
* false.
*/
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
//@{
/**
* The following store overloads support classes that have a cast operator to `EntryType
* *`.
*/
Vc_INTRINSIC void store(EntryType *mem) const
{
store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
{
store<EntryType, Flags>(mem, flags);
}
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
{
store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
{
store<EntryType, Flags>(mem, mask, flags);
}
//@}
// vim: foldmethod=marker
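A minimal usage sketch of the store() overloads declared above (float_v, float_m, and the flag objects Vc::Aligned/Vc::Unaligned/Vc::Streaming are the public Vc names these declarations are documented with; the exact vector width depends on the selected implementation):

#include <Vc/Vc>

void write_back(Vc::float_v v, float *dst)
{
    v.store(dst);                       // store with the default flags
    v.store(dst, Vc::Unaligned);        // dst need not be vector-aligned
    v.store(dst, Vc::Streaming);        // non-temporal store, bypasses the caches
    const Vc::float_m mask = v > 0.f;
    v.store(dst, mask);                 // masked store: only lanes where mask is true are written
}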

526
Vc/common/subscript.h Normal file

@ -0,0 +1,526 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SUBSCRIPT_H_
#define VC_COMMON_SUBSCRIPT_H_
#include <initializer_list>
#include <type_traits>
#include <vector>
#include "types.h"
#include "macros.h"
#include <assert.h>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
// AdaptSubscriptOperator {{{
template <typename Base> class AdaptSubscriptOperator : public Base
{
public:
// perfect forward all Base constructors
template <typename... Args>
Vc_ALWAYS_INLINE AdaptSubscriptOperator(Args &&... arguments)
: Base(std::forward<Args>(arguments)...)
{
}
// forward std::initializer_list construction separately (a braced list is not matched by the perfect-forwarding constructor above)
template <typename T>
Vc_ALWAYS_INLINE AdaptSubscriptOperator(std::initializer_list<T> l)
: Base(l)
{
}
// explicitly enable Base::operator[] because the following would hide it
using Base::operator[];
/// \internal forward to non-member subscript_operator function
template <typename I,
typename = enable_if<!std::is_arithmetic<
typename std::decay<I>::type>::value> // arithmetic types
// should always use
// Base::operator[] and
// never match this one
>
Vc_ALWAYS_INLINE auto operator[](I &&arg_)
-> decltype(subscript_operator(*this, std::forward<I>(arg_)))
{
return subscript_operator(*this, std::forward<I>(arg_));
}
// const overload of the above
template <typename I, typename = enable_if<
!std::is_arithmetic<typename std::decay<I>::type>::value>>
Vc_ALWAYS_INLINE auto operator[](I &&arg_) const
-> decltype(subscript_operator(*this, std::forward<I>(arg_)))
{
return subscript_operator(*this, std::forward<I>(arg_));
}
};
// }}}
// is_valid_indexvector {{{
template <class T, class = decltype(convertIndexVector(std::declval<T>()))>
std::true_type is_valid_indexvector(T &&);
std::false_type is_valid_indexvector(...);
template <class IndexVector, class Test = decltype(is_valid_indexvector(
std::declval<const IndexVector &>()))>
struct is_valid_indexvector_ : public std::integral_constant<bool, Test::value> {
};
static_assert(!is_valid_indexvector_<const int *>::value,
"Pointer is incorrectly classified as valid index vector type");
static_assert(is_valid_indexvector_<const int[4]>::value,
"C-Array is incorrectly classified as invalid index vector type");
// }}}
// apply Scale (std::ratio) functions {{{1
template <typename Scale, typename T>
Vc_ALWAYS_INLINE enable_if<Scale::num == Scale::den, Traits::decay<T>> applyScale(T &&x)
{
return std::forward<T>(x);
}
template <typename Scale, typename T>
Vc_ALWAYS_INLINE enable_if<
Scale::num != Scale::den && Traits::has_multiply_operator<T, int>::value,
Traits::decay<T>>
applyScale(T &&x)
{
static_assert(Scale::num % Scale::den == 0,
"Non-integral index scaling requested. This typically happens only for "
"Vc::Scalar on 32-bit for gathers on double. You can work around the "
"issue by ensuring that all doubles in the structure are aligned on 8 "
"Bytes.");
constexpr int value = Scale::num / Scale::den;
Vc_ASSERT(Vc::all_of((x * value) / value == x));
return std::forward<T>(x) * value;
}
template <typename Scale, typename T>
Vc_ALWAYS_INLINE enable_if<
Scale::num != Scale::den && !Traits::has_multiply_operator<T, int>::value,
T>
applyScale(T x)
{
static_assert(Scale::num % Scale::den == 0,
"Non-integral index scaling requested. This typically happens only for "
"Vc::Scalar on 32-bit for gathers on double. You can work around the "
"issue by ensuring that all doubles in the structure are aligned on 8 "
"Bytes.");
constexpr int value = Scale::num / Scale::den;
for (size_t i = 0; i < x.size(); ++i) {
Vc_ASSERT((x[i] * value) / value == x[i]);
x[i] *= value;
}
return x;
}
template <typename Scale, typename T, typename U,
typename = enable_if<Traits::has_multiply_operator<T, int>::value &&
Traits::has_addition_operator<T, U>::value>>
Vc_ALWAYS_INLINE typename std::decay<T>::type applyScaleAndAdd(T &&x, U &&y)
{
constexpr int value = Scale::num / Scale::den;
if (value == 1) { // static evaluation
return std::forward<T>(x) + std::forward<U>(y);
}
return std::forward<T>(x) * value + std::forward<U>(y);
}
template <
typename Scale, typename T, typename U,
typename = enable_if<
!(Traits::has_multiply_operator<T &, int>::value &&
Traits::has_addition_operator<T &, decltype(std::declval<U>()[0])>::value) &&
Traits::has_subscript_operator<U>::value>>
Vc_ALWAYS_INLINE T applyScaleAndAdd(T x, U &&y)
{
constexpr int value = Scale::num / Scale::den;
for (size_t i = 0; i < x.size(); ++i) {
if (value == 1) { // static evaluation
x[i] = x[i] + y[i];
} else {
x[i] = x[i] * value + y[i];
}
}
return x;
}
template <typename Scale, typename T, typename U>
Vc_ALWAYS_INLINE enable_if<!(Traits::has_multiply_operator<T &, int>::value &&
Traits::has_addition_operator<T &, U>::value) &&
!Traits::has_subscript_operator<U>::value,
T>
applyScaleAndAdd(T x, U &&y)
{
constexpr int value = Scale::num / Scale::den;
for (size_t i = 0; i < x.size(); ++i) {
if (value == 1) { // static evaluation
x[i] = x[i] + y;
} else {
x[i] = x[i] * value + y;
}
}
return x;
}
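// Worked example of the scaling above: gathering a double member out of an array
// of structs S with sizeof(S) == 32 uses Scale = std::ratio<32, 8>, i.e. a factor
// of 4. applyScale then turns the index vector {0, 1, 2, 3} into {0, 4, 8, 12},
// the correct double-element offsets from the member's address in the first
// struct; applyScaleAndAdd performs the same multiplication and adds an inner
// index on top.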
// IndexVectorSizeMatches {{{1
template <std::size_t MinSize,
typename IndexT,
bool = Traits::is_simd_vector<IndexT>::value>
struct IndexVectorSizeMatches
    : public std::true_type // you might expect false_type here, but IndexT is a type whose size
                            // is not known at compile time. It may well be large enough; we simply
                            // cannot tell from the type alone. The only possible check would be a
                            // runtime check, so the type itself is accepted.
{
};
template <std::size_t MinSize, typename V>
struct IndexVectorSizeMatches<MinSize,
V,
true> : public std::integral_constant<bool, (MinSize <= V::Size)>
{
};
template <std::size_t MinSize, typename T, std::size_t ArraySize>
struct IndexVectorSizeMatches<MinSize,
T[ArraySize],
false> : public std::integral_constant<bool, (MinSize <= ArraySize)>
{
};
template <std::size_t MinSize, typename T, std::size_t ArraySize>
struct IndexVectorSizeMatches<MinSize,
std::array<T, ArraySize>,
false> : public std::integral_constant<bool, (MinSize <= ArraySize)>
{
};
template <std::size_t MinSize, typename T, std::size_t ArraySize>
struct IndexVectorSizeMatches<MinSize,
Vc::array<T, ArraySize>,
false> : public std::integral_constant<bool, (MinSize <= ArraySize)>
{
};
template <std::size_t MinSize, typename T, std::ptrdiff_t N>
struct IndexVectorSizeMatches<MinSize, Vc::Common::span<T, N>, false>
: public std::integral_constant<bool, (N == -1 || static_cast<std::ptrdiff_t>(MinSize) <= N)> {
};
// SubscriptOperation {{{1
template <
typename T, typename IndexVector, typename Scale = std::ratio<1, 1>,
bool = is_valid_indexvector_<IndexVector>::value>
class SubscriptOperation
{
const IndexVector m_indexes;
T *const m_address;
using ScalarType = typename std::decay<T>::type;
using IndexVectorScaled = Traits::decay<decltype(convertIndexVector(std::declval<const IndexVector &>()))>;
public:
// try to stop the user from forming lvalues of this type
SubscriptOperation &operator=(const SubscriptOperation &) = delete;
SubscriptOperation(const SubscriptOperation &) = delete;
#ifndef __cpp_guaranteed_copy_elision
constexpr SubscriptOperation(SubscriptOperation &&) = default;
#endif
template <typename U,
typename = enable_if<((std::is_convertible<const U &, IndexVector>::value ||
std::is_same<U, IndexVector>::value) &&
std::is_copy_constructible<IndexVector>::value)>>
constexpr Vc_ALWAYS_INLINE SubscriptOperation(T *address, const U &indexes)
: m_indexes(indexes), m_address(address)
{
}
template <std::size_t... Indexes>
constexpr Vc_ALWAYS_INLINE SubscriptOperation(T *address, const IndexVector &indexes,
index_sequence<Indexes...>)
: m_indexes{indexes[Indexes]...}, m_address(address)
{}
template <typename U>
constexpr Vc_ALWAYS_INLINE SubscriptOperation(
T *address, const U &indexes,
enable_if<((std::is_convertible<const U &, IndexVector>::value ||
std::is_same<U, IndexVector>::value) &&
!std::is_copy_constructible<IndexVector>::value &&
std::is_array<IndexVector>::value &&
std::extent<IndexVector>::value > 0)> = nullarg)
: SubscriptOperation(address, indexes,
make_index_sequence<std::extent<IndexVector>::value>())
{
}
static constexpr bool need_explicit_scaling =
Scale::num % Scale::den != 0 || Scale::num / Scale::den * sizeof(T) > 8;
Vc_ALWAYS_INLINE
GatherArguments<typename std::remove_cv<T>::type, IndexVectorScaled,
(need_explicit_scaling ? 1 : Scale::num / Scale::den)>
gatherArguments() &&
{
static_assert(std::is_arithmetic<ScalarType>::value,
"Incorrect type for a SIMD vector gather. Must be an arithmetic type.");
return {applyScale<typename std::conditional<need_explicit_scaling, Scale,
std::ratio<1, 1>>::type>(
convertIndexVector(m_indexes)),
m_address};
}
Vc_ALWAYS_INLINE ScatterArguments<T, IndexVectorScaled> scatterArguments() &&
{
static_assert(std::is_arithmetic<ScalarType>::value,
"Incorrect type for a SIMD vector scatter. Must be an arithmetic type.");
return {applyScale<Scale>(convertIndexVector(m_indexes)), m_address};
}
template <typename V,
typename = enable_if<(std::is_arithmetic<ScalarType>::value &&Traits::is_simd_vector<
V>::value &&IndexVectorSizeMatches<V::Size, IndexVector>::value)>>
Vc_INTRINSIC operator V() &&
{
return V(static_cast<SubscriptOperation &&>(*this).gatherArguments());
}
template <typename V,
typename = enable_if<(std::is_arithmetic<ScalarType>::value &&Traits::is_simd_vector<
V>::value &&IndexVectorSizeMatches<V::Size, IndexVector>::value)>>
Vc_ALWAYS_INLINE SubscriptOperation &operator=(const V &rhs) &&
{
static_assert(std::is_arithmetic<ScalarType>::value,
"Incorrect type for a SIMD vector scatter. Must be an arithmetic type.");
const auto indexes = applyScale<Scale>(convertIndexVector(m_indexes));
rhs.scatter(m_address, indexes);
return *this;
}
// precondition: m_address points to a struct/class/union
template <
typename U,
typename S, // S must be equal to T. Still we require this template parameter -
// otherwise instantiation of SubscriptOperation would only be valid for
// structs/unions.
typename = enable_if<std::is_same<S, typename std::remove_cv<T>::type>::value &&(
std::is_class<T>::value || std::is_union<T>::value)>>
Vc_ALWAYS_INLINE auto operator[](U S::*member) &&
-> SubscriptOperation<
typename std::conditional<std::is_const<T>::value,
const typename std::remove_reference<U>::type,
typename std::remove_reference<U>::type>::type,
IndexVector,
// By passing the scale factor as a fraction of integers in the template
// arguments the value does not lose information if the division yields a
// non-integral value. This could happen e.g. for a struct of struct (S2 {
// S1, char }, with sizeof(S1) = 16, sizeof(S2) = 20. Then scale would be
// 20/16)
std::ratio_multiply<Scale, std::ratio<sizeof(S), sizeof(U)>>>
{
static_assert(std::is_same<Traits::decay<decltype(m_address->*member)>,
Traits::decay<U>>::value,
"Type mismatch that should be impossible.");
// TODO: check whether scale really works for unions correctly
return {&(m_address->*member), m_indexes};
}
/*
* The following functions allow subscripting of nested arrays. But
* there are two cases of containers and only one that we want to support:
* 1. actual arrays (e.g. T[N] or std::array<T, N>)
* 2. dynamically allocated vectors (e.g. std::vector<T>)
*
* For (1.) the offset calculation is straightforward.
* For (2.) the m_address pointer points to memory where pointers are
* stored to the actual data. Meaning the data can be scattered
* freely in memory (and far away from what m_address points to). Supporting this leads to
* serious trouble with the pointer (it does not really point to the start of a memory
* region anymore) and inefficient code. The user is better off to write a loop that assigns the
* scalars to the vector object sequentially.
*/
private:
// The following is a workaround for MSVC 2015 Update 2. Whenever the ratio
// in the return type of the following operator[] is encountered with a sizeof
// expression that fails, MSVC decides to substitute a 0 for the sizeof instead of
// just leaving the ratio instantiation alone via proper SFINAE. The make_ratio helper
// ensures that the 0 from the sizeof failure does not reach the denominator of
// std::ratio where it would hit a static_assert.
template <intmax_t N, intmax_t D> struct make_ratio {
using type = std::ratio<N, D == 0 ? 1 : D>;
};
public:
// precondition: m_address points to a type that implements the subscript operator
template <typename U>
// U is only required to delay name lookup to the 2nd phase (on use).
// This is necessary because m_address[0][index] is only a correct
// expression if has_subscript_operator<T>::value is true.
Vc_ALWAYS_INLINE auto operator[](U index) && -> typename std::enable_if<
#ifndef Vc_IMPROVE_ERROR_MESSAGES
Traits::has_no_allocated_data<T>::value &&
#endif
std::is_convertible<U, size_t>::value,
SubscriptOperation<
// the following decltype expression must depend on index and cannot
// simply use [0][0] because it would yield an invalid expression in
// case m_address[0] returns a struct/union
typename std::remove_reference<decltype(m_address[0][index])>::type,
IndexVector,
std::ratio_multiply<
Scale,
typename make_ratio<sizeof(T), sizeof(m_address[0][index])>::type>>>::type
{
static_assert(Traits::has_subscript_operator<T>::value,
"The subscript operator was called on a type that does not implement it.\n");
static_assert(Traits::has_no_allocated_data<T>::value,
"Invalid container type in gather/scatter operation.\nYou may only use "
"nested containers that store the data inside the object (such as builtin "
"arrays or std::array) but not containers that store data in allocated "
"memory (such as std::vector).\nSince this feature cannot be queried "
"generically at compile time you need to spezialize the "
"Vc::Traits::has_no_allocated_data_impl<T> type-trait for custom types that "
"meet the requirements.\n");
static_assert(std::is_lvalue_reference<decltype(m_address[0][index])>::value,
"The container does not return an lvalue reference to the data at "
"the requested offset. This makes it impossible to execute a "
"gather operation.\n");
return {&(m_address[0][index]), m_indexes};
}
// precondition: m_address points to a type that implements the subscript operator
template <typename IT>
Vc_ALWAYS_INLINE typename std::enable_if<
#ifndef Vc_IMPROVE_ERROR_MESSAGES
Traits::has_no_allocated_data<T>::value &&
Traits::has_subscript_operator<T>::value &&
#endif
Traits::has_subscript_operator<IT>::value,
SubscriptOperation<typename std::remove_reference<decltype(
m_address[0][std::declval<
const IT &>()[0]] // std::declval<IT>()[0] could
// be replaced with 0 if it
// were not for two-phase lookup. We need to make the
// m_address[0][0] expression dependent on IT
)>::type,
IndexVectorScaled,
std::ratio<1, 1> // reset Scale to 1 since it is applied below
>>::type
operator[](const IT &index) &&
{
static_assert(Traits::has_subscript_operator<T>::value,
"The subscript operator was called on a type that does not implement it.\n");
static_assert(Traits::has_no_allocated_data<T>::value,
"Invalid container type in gather/scatter operation.\nYou may only use "
"nested containers that store the data inside the object (such as builtin "
"arrays or std::array) but not containers that store data in allocated "
"memory (such as std::vector).\nSince this feature cannot be queried "
"generically at compile time you need to spezialize the "
"Vc::Traits::has_no_allocated_data_impl<T> type-trait for custom types that "
"meet the requirements.\n");
return {&(m_address[0][0]),
applyScaleAndAdd<std::ratio_multiply<
Scale, std::ratio<sizeof(T), sizeof(m_address[0][0])>>>(
convertIndexVector(m_indexes), index)};
}
};
// specialization for invalid IndexVector type
template <typename T, typename IndexVector, typename Scale>
class SubscriptOperation<T, IndexVector, Scale, false>;
// subscript_operator {{{1
template <
typename Container,
typename IndexVector,
typename = enable_if<
Traits::has_subscript_operator<IndexVector>::value // The index vector must provide [] for
// the implementations of gather/scatter
&&Traits::has_contiguous_storage<Container>::value // Container must use contiguous
// storage, otherwise the index vector
// cannot be used as memory offsets, which is required for efficient
// gather/scatter implementations
&&std::is_lvalue_reference<decltype(*begin(std::declval<
Container>()))>::value // dereferencing the begin iterator must yield an lvalue
// reference (const or non-const). Otherwise it is not possible
// to determine a pointer to the data storage (see above).
>>
Vc_ALWAYS_INLINE SubscriptOperation<
typename std::remove_reference<decltype(*begin(std::declval<Container>()))>::
type, // the type of the first value in the container is what the internal array pointer
// has to point to. But if the subscript operator of the container returns a
// reference we need to drop that part because it's useless information for us. But
// const and volatile, as well as array rank/extent are interesting and need not be
// dropped.
typename std::remove_const<typename std::remove_reference<
IndexVector>::type>::type // keep volatile and possibly the array extent, but the const and
// & parts of the type need to be removed because
// SubscriptOperation explicitly adds them for its member type
> subscript_operator(Container &&c, IndexVector &&indexes)
{
Vc_ASSERT(std::addressof(*begin(c)) + 1 ==
std::addressof(*(begin(c) + 1))); // runtime assertion for contiguous storage, this
// requires a RandomAccessIterator - but that
// should be given for a container with contiguous
// storage
return {std::addressof(*begin(c)), std::forward<IndexVector>(indexes)};
}
/**
* \internal
* Implement subscripts of std::initializer_list. This function must be in the global scope
* because Container arguments may be in any scope. The other argument is in std scope.
*
* -----
* std::initializer_list does not have constexpr member functions in C++11, but from C++14 onwards
* the world is a happier place. :)
*/
template <typename Container, typename I>
Vc_ALWAYS_INLINE Vc::Common::SubscriptOperation<
typename std::remove_reference<decltype(std::declval<Container>()[0])>::type,
const std::initializer_list<I> &> subscript_operator(Container &&vec,
const std::initializer_list<I> &indexes)
{
return {&vec[0], indexes};
}
//}}}1
} // namespace Common
using Common::subscript_operator;
} // namespace Vc
#endif // VC_COMMON_SUBSCRIPT_H_
// vim: foldmethod=marker
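What the subscript machinery above enables at the user level, as a minimal sketch (it assumes Vc's public Vc::vector adapter from <Vc/vector>, i.e. std::vector wrapped in AdaptSubscriptOperator, and the float_v/IndexType names; widths depend on the selected implementation):

#include <Vc/Vc>
#include <Vc/vector>
#include <cstddef>

void gather_scatter_demo()
{
    Vc::vector<float> data(100);
    for (std::size_t i = 0; i < data.size(); ++i) {
        data[i] = float(i);                       // arithmetic index: plain std::vector access
    }
    using IV = Vc::float_v::IndexType;
    const IV idx = IV::IndexesFromZero() * 3;     // {0, 3, 6, ...}
    Vc::float_v gathered = data[idx];             // gather from data[0], data[3], data[6], ...
    data[idx] = gathered * 2.f;                   // scatter the doubled values back
}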

7
Vc/common/support.h Normal file

@ -0,0 +1,7 @@
#ifndef VC_DEPRECATED_COMMON_SUPPORT_H_
#define VC_DEPRECATED_COMMON_SUPPORT_H_
#ifdef __GNUC__
#warning "the <Vc/common/support.h> header is deprecated. Use <Vc/support.h> instead."
#endif
#include <Vc/support.h>
#endif // VC_DEPRECATED_COMMON_SUPPORT_H_

57
Vc/common/transpose.h Normal file

@ -0,0 +1,57 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_TRANSPOSE_H_
#define VC_COMMON_TRANSPOSE_H_
#include "macros.h"
#include <tuple>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename... Inputs> struct TransposeProxy
{
TransposeProxy(const Inputs &... inputs) : in{inputs...} {}
std::tuple<const Inputs &...> in;
};
template <int LhsLength, size_t RhsLength> struct TransposeTag {
};
} // namespace Common
template <typename... Vs> Common::TransposeProxy<Vs...> transpose(Vs... vs)
{
return {vs...};
}
} // namespace Vc
#endif // VC_COMMON_TRANSPOSE_H_
// vim: foldmethod=marker

226
Vc/common/trigonometric.h Normal file

@ -0,0 +1,226 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_TRIGONOMETRIC_H_
#define VC_COMMON_TRIGONOMETRIC_H_
#include "macros.h"
#ifdef Vc_HAVE_LIBMVEC
extern "C" {
__m128 _ZGVbN4v_sinf(__m128);
__m128d _ZGVbN2v_sin(__m128d);
__m128 _ZGVbN4v_cosf(__m128);
__m128d _ZGVbN2v_cos(__m128d);
__m256 _ZGVdN8v_sinf(__m256);
__m256d _ZGVdN4v_sin(__m256d);
__m256 _ZGVdN8v_cosf(__m256);
__m256d _ZGVdN4v_cos(__m256d);
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template<Vc::Implementation Impl> struct MapImpl { enum Dummy { Value = Impl }; };
template<> struct MapImpl<Vc::SSE42Impl> { enum Dummy { Value = MapImpl<Vc::SSE41Impl>::Value }; };
template<Vc::Implementation Impl> using TrigonometricImplementation =
ImplementationT<MapImpl<Impl>::Value
#if defined(Vc_IMPL_XOP) && defined(Vc_IMPL_FMA4)
+ Vc::XopInstructions
+ Vc::Fma4Instructions
#endif
>;
} // namespace Detail
namespace Common
{
template<typename Impl> struct Trigonometric
{
template<typename T> static T Vc_VDECL sin(const T &_x);
template<typename T> static T Vc_VDECL cos(const T &_x);
template<typename T> static void Vc_VDECL sincos(const T &_x, T *_sin, T *_cos);
template<typename T> static T Vc_VDECL asin (const T &_x);
template<typename T> static T Vc_VDECL atan (const T &_x);
template<typename T> static T Vc_VDECL atan2(const T &y, const T &x);
};
} // namespace Common
#if defined Vc_IMPL_SSE || defined DOXYGEN
// this is either SSE, AVX, or AVX2
namespace Detail
{
template <typename T, typename Abi>
using Trig = Common::Trigonometric<Detail::TrigonometricImplementation<
(std::is_same<Abi, VectorAbi::Sse>::value
? SSE42Impl
: std::is_same<Abi, VectorAbi::Avx>::value ? AVXImpl : ScalarImpl)>>;
} // namespace Detail
#ifdef Vc_HAVE_LIBMVEC
Vc_INTRINSIC __m128 sin_dispatch(__m128 x) { return ::_ZGVbN4v_sinf(x); }
Vc_INTRINSIC __m128d sin_dispatch(__m128d x) { return ::_ZGVbN2v_sin (x); }
Vc_INTRINSIC __m128 cos_dispatch(__m128 x) { return ::_ZGVbN4v_cosf(x); }
Vc_INTRINSIC __m128d cos_dispatch(__m128d x) { return ::_ZGVbN2v_cos (x); }
#ifdef Vc_IMPL_AVX
Vc_INTRINSIC __m256 sin_dispatch(__m256 x) { return ::_ZGVdN8v_sinf(x); }
Vc_INTRINSIC __m256d sin_dispatch(__m256d x) { return ::_ZGVdN4v_sin (x); }
Vc_INTRINSIC __m256 cos_dispatch(__m256 x) { return ::_ZGVdN8v_cosf(x); }
Vc_INTRINSIC __m256d cos_dispatch(__m256d x) { return ::_ZGVdN4v_cos (x); }
#endif
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
{
return sin_dispatch(x.data());
}
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
{
return cos_dispatch(x.data());
}
#else
/**
* \ingroup Math
* Returns the sine of all input values in \p x.
*
* \param x The values to apply the sine function on.
*
* \returns the sine of \p x.
*
* \note The single-precision implementation has a precision of max. 2 ulp (mean 0.17 ulp)
* in the range [-8192, 8192].
* (testSin< float_v> with a maximal distance of 2 to the reference (mean: 0.310741))
*
* \note The double-precision implementation has a precision of max. 3 ulp (mean 1040 ulp)
* in the range [-8192, 8192].
* (testSin<double_v> with a maximal distance of 1 to the reference (mean: 0.170621))
*
* \note The precision and execution latency depends on:
* - `Abi` (e.g. Scalar uses the `<cmath>` implementation)
* - whether `Vc_HAVE_LIBMVEC` is defined
* - for the `<cmath>` fallback, the implementations differ (e.g. MacOS vs. Linux
* vs. Windows; fpmath=sse vs. fpmath=387)
*
* \note Vc versions before 1.4 had different precision.
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::sin(x);
}
/**
* \ingroup Math
* Returns the cosine of all input values in \p x.
*
* \param x The values to apply the cosine function on.
* \returns the cosine of \p x.
*
* \note The single-precision implementation has a precision of max. 2 ulp (mean 0.18 ulp) in the range [-8192, 8192].
* \note The double-precision implementation has a precision of max. 3 ulp (mean 1160 ulp) in the range [-8192, 8192].
* \note Vc versions before 1.4 had different precision.
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::cos(x);
}
#endif
/**
* \ingroup Math
* Returns the arcsine of all input values in \p x.
*
* \param x The values to apply the arcsine function on.
* \returns the arcsine of \p x.
*
* \note The single-precision implementation has an error of max. 2 ulp (mean 0.3 ulp).
* \note The double-precision implementation has an error of max. 36 ulp (mean 0.4 ulp).
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> asin(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::asin(x);
}
/**
* \ingroup Math
* Returns the arctangent of all input values in \p x.
*
* \param x The values to apply the arctangent function on.
* \returns the arctangent of \p x.
* \note The single-precision implementation has an error of max. 3 ulp (mean 0.4 ulp) in the range [-8192, 8192].
* \note The double-precision implementation has an error of max. 2 ulp (mean 0.1 ulp) in the range [-8192, 8192].
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::atan(x);
}
/**
* \ingroup Math
* Returns the arctangent of all input values in \p x and \p y.
*
* Calculates the angle given the lengths of the opposite and adjacent legs in a right
* triangle.
* \param y The opposite leg.
* \param x The adjacent leg.
* \returns the arctangent of \p y / \p x.
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan2(const Vector<T, Abi> &y,
const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::atan2(y, x);
}
/**
* \ingroup Math
*
* \param x Input value to both sine and cosine.
* \param sin A non-null pointer to a potentially uninitialized object of type Vector.
* When \c sincos returns, `*sin` contains the result of `sin(x)`.
* \param cos A non-null pointer to a potentially uninitialized object of type Vector.
* When \c sincos returns, `*cos` contains the result of `cos(x)`.
*
* \see sin, cos
*/
template <typename T, typename Abi>
Vc_INTRINSIC void sincos(const Vector<T, Abi> &x,
Vector<T, detail::not_fixed_size_abi<Abi>> *sin,
Vector<T, Abi> *cos)
{
Detail::Trig<T, Abi>::sincos(x, sin, cos);
}
#endif
} // namespace Vc_VERSIONED_NAMESPACE
#endif // VC_COMMON_TRIGONOMETRIC_H_
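A minimal usage sketch of the math functions declared above (float_v is Vc's public single-precision vector type; which implementation is dispatched to depends on the target and on whether Vc_HAVE_LIBMVEC is defined):

#include <Vc/Vc>

Vc::float_v wave_sample(Vc::float_v phase)
{
    Vc::float_v s, c;
    Vc::sincos(phase, &s, &c);                   // computes both sine and cosine of phase
    return 0.75f * s + 0.25f * Vc::cos(2.f * phase);
}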

402
Vc/common/types.h Normal file

@ -0,0 +1,402 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_TYPES_H_
#define VC_COMMON_TYPES_H_
#ifdef Vc_CHECK_ALIGNMENT
#include <cstdlib>
#include <cstdio>
#endif
#include <ratio>
#include "../global.h"
#include "../traits/type_traits.h"
#include "permutation.h"
namespace Vc_VERSIONED_NAMESPACE
{
///\addtogroup Utilities
///@{
/// \internal Allow writing \c size_t without the `std::` prefix.
using std::size_t;
/// long long shorthand
using llong = long long;
/// unsigned long long shorthand
using ullong = unsigned long long;
/// unsigned long shorthand
using ulong = unsigned long;
/// unsigned int shorthand
using uint = unsigned int;
/// unsigned short shorthand
using ushort = unsigned short;
/// unsigned char shorthand
using uchar = unsigned char;
/// signed char shorthand
using schar = signed char;
/**\internal
* Tag type for explicit zero-initialization
*/
struct VectorSpecialInitializerZero {};
/**\internal
* Tag type for explicit one-initialization
*/
struct VectorSpecialInitializerOne {};
/**\internal
* Tag type for explicit "iota-initialization"
*/
struct VectorSpecialInitializerIndexesFromZero {};
/**
* The special object \p Vc::Zero can be used to construct Vector and Mask objects
* initialized to zero/\c false.
*/
constexpr VectorSpecialInitializerZero Zero = {};
/**
* The special object \p Vc::One can be used to construct Vector and Mask objects
* initialized to one/\c true.
*/
constexpr VectorSpecialInitializerOne One = {};
/**
* The special object \p Vc::IndexesFromZero can be used to construct Vector objects
* initialized to values 0, 1, 2, 3, 4, ...
*/
constexpr VectorSpecialInitializerIndexesFromZero IndexesFromZero = {};
///@}
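// For example (using the public vector types):
//   Vc::float_v a(Vc::Zero);             // {0, 0, 0, ...}
//   Vc::float_v b(Vc::One);              // {1, 1, 1, ...}
//   Vc::int_v   i(Vc::IndexesFromZero);  // {0, 1, 2, 3, ...}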
namespace Detail
{
template<typename T> struct MayAliasImpl {
#ifdef Vc_ICC
#pragma warning(disable:2621)
#endif
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
#endif
typedef T type Vc_MAY_ALIAS;
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#ifdef Vc_ICC
#pragma warning(enable:2621)
#endif
};
//template<size_t Bytes> struct MayAlias<MaskBool<Bytes>> { typedef MaskBool<Bytes> type; };
} // namespace Detail
/**\internal
* Helper MayAlias<T> that turns T into the type to be used for an aliasing pointer. This
* adds the may_alias attribute to T (with compilers that support it). But for MaskBool this
* attribute is already part of the type and applying it a second time leads to warnings/errors,
* therefore MaskBool is simply forwarded as is.
*/
template <typename T> using MayAlias = typename Detail::MayAliasImpl<T>::type;
template <class To, class From> MayAlias<To> &aliasing_cast(From &x)
{
return *reinterpret_cast<MayAlias<To> *>(&x);
}
template <class To, class From> const MayAlias<To> &aliasing_cast(const From &x)
{
return *reinterpret_cast<const MayAlias<To> *>(&x);
}
template <class To, class From> MayAlias<To> *aliasing_cast(From *x)
{
return reinterpret_cast<MayAlias<To> *>(x);
}
template <class To, class From> const MayAlias<To> *aliasing_cast(const From *x)
{
return reinterpret_cast<const MayAlias<To> *>(x);
}
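// Example of the intended use (this is how Storage<..., AliasStrategy::MayAlias>
// in common/storage.h implements per-element access):
//   __m128 v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
//   float third = aliasing_cast<float>(&v)[2];   // 3.f, read through a may_alias pointer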
/**\internal
* This enumeration lists all possible operators in C++.
*
* The assignment and compound assignment enumerators are used with the conditional_assign
* implementation.
*/
enum class Operator : char {
Assign,
Multiply,
MultiplyAssign,
Divide,
DivideAssign,
Remainder,
RemainderAssign,
Plus,
PlusAssign,
Minus,
MinusAssign,
RightShift,
RightShiftAssign,
LeftShift,
LeftShiftAssign,
And,
AndAssign,
Xor,
XorAssign,
Or,
OrAssign,
PreIncrement,
PostIncrement,
PreDecrement,
PostDecrement,
LogicalAnd,
LogicalOr,
Comma,
UnaryPlus,
UnaryMinus,
UnaryNot,
UnaryOnesComplement,
CompareEqual,
CompareNotEqual,
CompareLess,
CompareGreater,
CompareLessEqual,
CompareGreaterEqual
};
// forward declaration for Vc::array in <Vc/array>
template <typename T, std::size_t N> struct array;
// forward declaration for Vc::span in <Vc/span>
namespace Common {
template <typename T, std::ptrdiff_t N> class span;
}
/* TODO: add type for half-float, something along these lines:
class half_float
{
uint16_t data;
public:
constexpr half_float() : data(0) {}
constexpr half_float(const half_float &) = default;
constexpr half_float(half_float &&) = default;
constexpr half_float &operator=(const half_float &) = default;
constexpr explicit half_float(float);
constexpr explicit half_float(double);
constexpr explicit half_float(int);
constexpr explicit half_float(unsigned int);
explicit operator float () const;
explicit operator double () const;
explicit operator int () const;
explicit operator unsigned int() const;
bool operator==(half_float rhs) const;
bool operator!=(half_float rhs) const;
bool operator>=(half_float rhs) const;
bool operator<=(half_float rhs) const;
bool operator> (half_float rhs) const;
bool operator< (half_float rhs) const;
half_float operator+(half_float rhs) const;
half_float operator-(half_float rhs) const;
half_float operator*(half_float rhs) const;
half_float operator/(half_float rhs) const;
};
*/
// TODO: the following doesn't really belong into the toplevel Vc namespace.
#ifndef Vc_CHECK_ALIGNMENT
template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *){}
#else
template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *ptr)
{
const size_t s = alignof(_T);
if((reinterpret_cast<size_t>(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) {
fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n");
abort();
}
}
#endif
namespace Common
{
// defined in common/simdarrayhelper.h
template <typename T, std::size_t Pieces, std::size_t Index> struct Segment;
/**
* \internal
*
* Helper interface to make m_indexes in InterleavedMemoryAccessBase behave like an integer vector.
* Only that the entries are successive entries from the given start index.
*/
template<size_t StructSize> class SuccessiveEntries
{
#ifdef Vc_MSVC
// scatterinterleavedmemory fails with garbage values in m_first if size_type is a
// 64-bit integer type. Using a 32-bit type seems to work around the miscompilation.
using size_type = unsigned;
#else
using size_type = size_t;
#endif
const size_type m_first;
public:
typedef SuccessiveEntries AsArg;
Vc_INTRINSIC SuccessiveEntries(size_type first) : m_first(first) {}
Vc_INTRINSIC Vc_PURE size_type operator[](size_type offset) const
{
return m_first + offset * StructSize;
}
Vc_INTRINSIC Vc_PURE size_type data() const { return m_first; }
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator+(const SuccessiveEntries &rhs) const
{
return SuccessiveEntries(m_first + rhs.m_first);
}
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator*(const SuccessiveEntries &rhs) const
{
return SuccessiveEntries(m_first * rhs.m_first);
}
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator<<(size_type x) const
{
return {m_first << x};
}
friend Vc_INTRINSIC SuccessiveEntries &internal_data(SuccessiveEntries &x)
{
return x;
}
friend Vc_INTRINSIC const SuccessiveEntries &internal_data(const SuccessiveEntries &x)
{
return x;
}
};
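// Worked example: SuccessiveEntries<3> starting at 5 behaves like the index
// vector {5, 8, 11, ...}, i.e. operator[](i) returns 5 + i * 3 -- the offsets
// of one member in consecutive structs of three entries each.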
// declaration for functions in common/malloc.h
template <std::size_t alignment>
Vc_INTRINSIC_L void *aligned_malloc(std::size_t n) Vc_INTRINSIC_R;
Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R;
/**\internal
* Central definition of the type combinations that convert implicitly.
*/
template <typename Mask, typename T, typename U>
using enable_if_mask_converts_implicitly =
enable_if<(!std::is_same<Mask, Traits::decay<U>>::value && // that'd be the copy ctor
Traits::is_simd_mask<U>::value && !Traits::isSimdMaskArray<U>::value &&
Traits::is_implicit_cast_allowed_mask<
Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value)>;
/**\internal
* Central definition of the type combinations that only convert explicitly.
*/
template <typename T, typename U>
using enable_if_mask_converts_explicitly = enable_if<(
Traits::isSimdMaskArray<U>::value ||
(Traits::is_simd_mask<U>::value &&
!Traits::is_implicit_cast_allowed_mask<
Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value))>;
/**\internal
* Tag type for overloading on the width (\VSize{T}) of a vector.
*/
template <typename T> using WidthT = std::integral_constant<std::size_t, sizeof(T)>;
// forward declaration of MaskBool in common/maskbool.h
template <std::size_t Bytes> class MaskBool;
// forward declaration of SubscriptOperation in common/subscript.h
template <typename T, typename IndexVector, typename Scale, bool>
class SubscriptOperation;
/**
* \internal
* Helper type to pass along the two arguments for a gather operation.
*
* \tparam IndexVector Normally an integer SIMD vector, but an array or std::vector also
* works (though often not as efficient).
*/
template <class T, class IndexVector, int Scale = 1>
struct GatherArguments {
static_assert(std::is_same<T, remove_cvref_t<T>>::value && !std::is_pointer<T>::value,
"GatherArguments expects an cv unqualified non-ref/ptr type");
const IndexVector indexes;
const T *const address;
};
template <int Scale, class T, class I>
GatherArguments<T, I, Scale> make_gather(const T *m, const I &i)
{
return {i, m};
}
/**
* \internal
* Helper type to pass along the two arguments for a scatter operation.
*
* \tparam IndexVector Normally an integer SIMD vector, but an array or std::vector also
* works (though often not as efficient).
*/
template <typename T, typename IndexVector> struct ScatterArguments
{
const IndexVector indexes;
T *const address;
};
/**\internal
* Break the recursion of the function below.
*/
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC enable_if<(Begin >= End), void> unrolled_loop(F &&)
{
}
/**\internal
* Force the code in the lambda \p f to be called with indexes starting from \p Begin up
* to (excluding) \p End to be called without compare and jump instructions (i.e. an
* unrolled loop).
*/
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC Vc_FLATTEN enable_if<(Begin < End), void> unrolled_loop(F &&f)
{
f(Begin);
unrolled_loop<I, Begin + 1, End>(f);
}
/**\internal
* Small simplification of the unrolled_loop call for ranges from 0 to \p Size using
* std::size_t as the index type.
*/
template <std::size_t Size, typename F> Vc_INTRINSIC void for_all_vector_entries(F &&f)
{
unrolled_loop<std::size_t, 0u, Size>(std::forward<F>(f));
}
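// Example: copy Size entries without a compare-and-jump in the generated code:
//   for_all_vector_entries<4>([&](std::size_t i) { dst[i] = src[i]; });
// expands to dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];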
} // namespace Common
} // namespace Vc
#include "vector.h"
#include "mask.h"
#include "memoryfwd.h"
#endif // VC_COMMON_TYPES_H_
// vim: foldmethod=marker

96
Vc/common/utility.h Normal file

@ -0,0 +1,96 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_UTILITY_H_
#define VC_COMMON_UTILITY_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
/**
* \internal
* Returns the next power of 2 larger than or equal to \p x.
*/
template <size_t x, bool = (x & (x - 1)) == 0> struct NextPowerOfTwo;
template <size_t x>
struct NextPowerOfTwo<x, true> : public std::integral_constant<size_t, x> {
};
template <size_t x>
struct NextPowerOfTwo<x, false>
: public std::integral_constant<
size_t, NextPowerOfTwo<(x | (x >> 1) | (x >> 2) | (x >> 5)) + 1>::value> {
};
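/* Expected results (a compile-time sketch, not part of the original header):
 * \code
 * static_assert(Vc::Common::NextPowerOfTwo<3>::value == 4, "");
 * static_assert(Vc::Common::NextPowerOfTwo<4>::value == 4, "");
 * static_assert(Vc::Common::NextPowerOfTwo<5>::value == 8, "");
 * \endcode
 */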
/**
* \internal
* Enforce an upper bound on an alignment value. This is necessary because some compilers
* implement such an upper bound themselves and emit a warning when a larger alignment is requested.
*/
template <size_t A>
struct BoundedAlignment : public std::integral_constant<size_t,
#if defined Vc_MSVC || defined Vc_GCC
((A - 1) &
#ifdef Vc_MSVC
31
#elif defined __AVX__
255
#else
127
#endif
) + 1
#else
A
#endif
> {
};
/**
* \internal
* Returns the size of the left/first SimdArray member.
*/
template <std::size_t N> static constexpr std::size_t left_size()
{
return Common::NextPowerOfTwo<(N + 1) / 2>::value;
}
/**
* \internal
* Returns the size of the right/second SimdArray member.
*/
template <std::size_t N> static constexpr std::size_t right_size()
{
return N - left_size<N>();
}
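/* Expected results (a compile-time sketch): a SimdArray with 7 entries would be split into
 * a left part of 4 and a right part of 3 entries.
 * \code
 * static_assert(Vc::Common::left_size<7>() == 4, "");
 * static_assert(Vc::Common::right_size<7>() == 3, "");
 * \endcode
 */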
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_UTILITY_H_
// vim: foldmethod=marker

857
Vc/common/vector.h Normal file
View File

@ -0,0 +1,857 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_VECTOR_H_
#define VC_COMMON_VECTOR_H_
#include <ratio>
#include "elementreference.h"
#include "types.h"
#include "vectorabi.h"
#include "vectortraits.h"
#include "simdarrayfwd.h"
#include "loadstoreflags.h"
#include "writemaskedvector.h"
#include "detail.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \ingroup Math
* Copies the sign(s) of \p sign to the value(s) in \p magnitude and returns the resulting
* vector.
*
* \param magnitude This vector's magnitude will be used in the return vector.
* \param sign This vector's sign bit will be used in the return vector.
*
* \return a value where the sign of the value equals the sign of \p sign. I.e.
* `sign(copysign(v, r)) == sign(r)`.
*/
template <typename T, typename Abi,
typename = enable_if<std::is_floating_point<T>::value &&
!detail::is_fixed_size_abi<Abi>::value>>
inline Vector<T, Abi> copysign(Vector<T, Abi> magnitude, Vector<T, Abi> sign);
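/* A minimal usage sketch (illustration only): combine the magnitude of one vector with the
 * sign of another.
 * \code
 * Vc::float_v mag(2.f);                    // [ 2,  2, ...]
 * Vc::float_v sgn(-1.f);                   // [-1, -1, ...]
 * Vc::float_v r = Vc::copysign(mag, sgn);  // [-2, -2, ...]
 * \endcode
 */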
/**
* \ingroup Math
* Extracts the exponent of each floating-point vector component.
*
* \param x The vector of values to check for the sign.
* \return the exponent to base 2.
*
* This function provides efficient access to the exponent of the floating-point number. The
* returned value is a fast approximation of the logarithm to base 2. The absolute error of that
* approximation is in the range [0, 1[.
*
* Examples:
\verbatim
value | exponent | log2
=======|==========|=======
1.0 | 0 | 0
2.0 | 1 | 1
3.0 | 1 | 1.585
3.9 | 1 | 1.963
4.0 | 2 | 2
4.1 | 2 | 2.036
\endverbatim
*
* \warning This function assumes a positive value (non-zero). If the value is negative the sign bit will
* modify the returned value. An input value of zero will return the bias of the floating-point
* representation. If you compile with Vc runtime checks enabled, the function asserts that
* all values are greater than or equal to zero.
*
* You may use abs to apply this function to negative values:
* \code
* exponent(abs(v))
* \endcode
*/
template <typename T, typename Abi,
typename = enable_if<std::is_floating_point<T>::value &&
!detail::is_fixed_size_abi<Abi>::value>>
inline Vector<T, Abi> exponent(Vector<T, Abi> x);
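/* A minimal usage sketch (illustration only), following the table above:
 * \code
 * Vc::float_v x(8.f);               // 8.0 == 2^3
 * Vc::float_v e = Vc::exponent(x);  // [3, 3, ...]
 * \endcode
 */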
/**
* \ingroup Math
* Returns for each vector component whether it stores a negative value.
*
* \param x The vector of values to check for the sign.
* \returns a mask which is \c true only in those components that are negative in \p x.
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST typename Vector<T, detail::not_fixed_size_abi<Abi>>::MaskType
isnegative(Vector<T, Abi> x)
{
return x < Vector<T, Abi>::Zero();
}
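/* A minimal usage sketch (illustration only):
 * \code
 * Vc::float_v v(-1.f);
 * auto m = Vc::isnegative(v);  // a mask that is true in every component
 * \endcode
 */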
/**
* \class Vector types.h <Vc/vector.h>
* \ingroup Vectors
*
* The main vector class for expressing data parallelism.
*
* The commonly used vector types (Vc::float_v, Vc::double_v, Vc::int_v, etc., see below)
* are specializations of this class.
* For most cases there are no API differences for the specializations.
* Make use of Vector<T> for generic programming, otherwise you might prefer to use
* the \p *_v aliases.
*
* \see Vc::float_v, Vc::double_v, Vc::int_v, Vc::uint_v, Vc::short_v, Vc::ushort_v
* \see Mask
*/
template<typename T, typename Abi = VectorAbi::Best<T>> class Vector
{
public:
/**
* Returns the number of scalar components (\VSize{T}) in a vector of this type.
*
* The size of the vector, i.e. the number of scalar entries in the vector. Do not
* make any assumptions about the size of vectors. If you need vectors of \c float and
* \c int types with matching size, use Vector::IndexType or SimdArray.
*
* You can easily use if clauses to compare Vector sizes. The compiler can
* statically evaluate and fully optimize dead code away (very much like \#ifdef, but
* with syntax checking).
*
* \returns The number of components (i.e. \VSize{T}) objects of this vector type
* store and manipulate.
*/
static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
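// A sketch of the compile-time branching mentioned above (illustration only):
//   if (Vc::float_v::size() >= 4) { /* path that assumes at least 4 entries */ }
//   else                          { /* fallback for narrower vectors */ }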
/**
* Specifies the alignment requirement for aligned load and store calls for objects of
* this vector type.
*/
static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::memoryAlignment();
/// The ABI tag type of the current template instantiation.
using abi = Abi;
/// The type of the entries in the vector.
using EntryType = typename VectorTraits<T, Abi>::EntryType;
/// \copydoc EntryType
using value_type = EntryType;
using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
/**\internal
* This type reveals the implementation-specific type used for the data member.
*/
using VectorType = typename VectorTraits<T, Abi>::VectorType;
/**\internal
* \copydoc VectorType
*/
using vector_type = VectorType;
/// The type of the mask used for masked operations and returned from comparisons.
using MaskType = Vc::Mask<T, Abi>;
/// \copydoc MaskType
using mask_type = MaskType;
using MaskArgument = MaskType;
using VectorArgument = Vector;
/// The type of the vector used for indexes in gather and scatter operations.
using IndexType = Vc::fixed_size_simd<int, VectorTraits<T, Abi>::size()>;
/// \copydoc IndexType
using index_type = IndexType;
using reference = Detail::ElementReference<Vector>;
/// \name Generators
///@{
/**
* Returns a vector with the entries initialized to zero.
*/
static inline Vector Zero();
/**
* Returns a vector with the entries initialized to one.
*/
static inline Vector One();
/**
* Returns a vector with the entries initialized to 0, 1, 2, 3, 4, 5, ...
*/
static inline Vector IndexesFromZero();
/**
* Returns a vector with pseudo-random entries.
*
* Currently the state of the random number generator cannot be modified and starts
* off with the same state. Thus you will get the same sequence of numbers for the
* same sequence of calls.
*
* \return a new random vector. Floating-point values will be in the 0-1 range.
* Integers will use the full range the integer representation allows.
*
* \note This function may use a very small amount of state and thus will be a weak
* random number generator.
*/
static inline Vector Random();
/// Generate a vector object from return values of \p gen (static variant of \ref fill).
template <typename G> static inline Vector generate(G gen);
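// A minimal sketch (illustration only): like fill, generate builds the vector from the
// functor's return values, the argument being the entry index, e.g.
//   float_v ramp = float_v::generate([](int i) { return 0.5f * i; }); // [0, 0.5, 1, ...]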
///@}
/// \name Compile-Time Constant Initialization
///@{
/**
* Construct a zero-initialized vector object.
*
* This constructor follows the behavior of the underlying arithmetic type \p T in
* that the expression `T()` zero-initializes the object. On the other hand the
* variable \c x in `T x;` is uninitialized.
* Since, for class types, both expressions call the default constructor, `Vector<T> x`
* must zero-initialize \c x as well.
*/
inline Vector() = default;
/**
* Construct a vector with the entries initialized to zero.
*
* \see Vc::Zero, Zero()
*/
explicit inline Vector(VectorSpecialInitializerZero);
/**
* Construct a vector with the entries initialized to one.
*
* \see Vc::One, One()
*/
explicit inline Vector(VectorSpecialInitializerOne);
/**
* Construct a vector with the entries initialized to 0, 1, 2, 3, 4, 5, ...
*
* \see Vc::IndexesFromZero, IndexesFromZero()
*/
explicit inline Vector(VectorSpecialInitializerIndexesFromZero);
///@}
/// \name Conversion/Broadcast Constructors
///@{
/**
* Implicit conversion from compatible Vector<U, Abi> types.
*/
template <typename U>
inline Vector(Vector<U, abi> x,
enable_if<Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
#if Vc_IS_VERSION_1
/**
* Explicit conversion (i.e. `static_cast`) from the remaining Vector<U, Abi> types.
*
* \param x A vector object to use for initialization of the new vector object. If \p
* x contains more entries than the new object the high components will be
* ignored. If \p x contains fewer entries than the new object the high
* components of the new object will be zero-initialized. Type conversion is
* done according to the standard conversion rules for the underlying
* fundamental arithmetic types.
*/
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") inline explicit Vector(
Vector<U, abi> x,
enable_if<!Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
#endif
/**
* Broadcast Constructor.
*
* Constructs a vector with all entries of the vector filled with the given value.
*
* \param a The scalar value to broadcast to all entries of the constructed vector.
*/
inline Vector(EntryType a);
template <typename U>
inline Vector(U a, enable_if<std::is_same<U, int>::value &&
!std::is_same<U, EntryType>::value> = nullarg);
inline explicit Vector(reference a);
///@}
/**
* \name Loads & Stores
*/
///@{
#include "../common/loadinterface.h"
#include "../common/storeinterface.h"
///@}
/**
* Set all entries to zero.
*/
inline void setZero();
/**
* Set all entries to zero where the mask is set.
*
* A 4-vector with a mask of `[0111]` therefore would set the last three entries to 0.
*
* \param mask Selects the entries to be set to zero.
*/
inline void setZero(MaskType mask);
/**
* Set all entries to zero where the mask is not set.
*
* A 4-vector with a mask of `[0111]` therefore would set only the first entry to 0.
*
* \param mask Selects the entries to not be set to zero.
*/
inline void setZeroInverted(MaskType mask);
/**
* Set all entries to the bit representation of a QNaN.
*/
inline void setQnan();
/**
* Set all entries to the bit representation of a QNaN where the mask is set.
*
* \param mask Selects the entries to be set to QNaN.
*/
inline void setQnan(MaskType mask);
#define Vc_CURRENT_CLASS_NAME Vector
#include "../common/gatherinterface.h"
#include "../common/scatterinterface.h"
#undef Vc_CURRENT_CLASS_NAME
/// \name Scalar Subscript Operators
///@{
/**
* This operator can be used to modify scalar entries of the vector.
*
* \param index A value between 0 and Size. This value is not checked internally, so
* you must ensure it is in range.
*
* \return a reference to the vector entry at the given \p index.
*
* \warning The use of this function may result in suboptimal performance. Please
* check whether you can find a more vector-friendly way to do what you
* intended.
* \note the returned object models the concept of a reference and
* as such it can exist longer than the data it is referencing.
* \note to avoid lifetime issues, we strongly advise not to store
* any reference objects.
*/
inline reference operator[](size_t index) noexcept;
/**
* This operator can be used to read scalar entries of the vector.
*
* \param index A value between 0 and Size. This value is not checked internally, so
* you must ensure it is in range.
*
* \return a copy of the vector entry at the given \p index.
*/
inline EntryType operator[](size_t index) const noexcept;
///@}
/// \name Unary Operators
///@{
/**
* Determine where the vector is null.
*
* \returns a mask which denotes the zero entries of this vector object.
*/
inline MaskType operator!() const;
/**
* Inverts all bits.
*
* \returns a new vector which has all bits inverted. I.e. `(v & ~v) == 0`.
*
* \note This operator is only defined for integral types \p T.
*/
inline Vector operator~() const;
/// Returns a new vector object with all entries negated.
inline Vector operator-() const;
/// Returns a copy of the vector object.
inline Vector operator+() const;
///@}
/**
* \name Increment and Decrement Operators
* The increment and decrement operators apply the increment/decrement operation per
* component.
*
* The semantics are equal to the semantics of the fundamental arithmetic type \p T.
*
* \note Over-/Underflow of signed integral types is undefined behavior and may
* actually break your code.
*/
///@{
inline Vector &operator++(); // prefix
inline Vector operator++(int); // postfix
inline Vector &operator--(); // prefix
inline Vector operator--(int); // postfix
///@}
#define Vc_OP(symbol) \
inline Vc_PURE Vector operator symbol(const Vector &x) const;
/**
* \name Arithmetic Operations
*
* The arithmetic operations are implemented as component-wise
* application of the operator on the two vector objects.
*
* Example:
* \code
* void foo(float_v a, float_v b) {
* const float_v product = a * b;
* const float_v difference = a - b;
* a += b;
* auto quotient = a / b;
* auto modulo = static_cast<int_v>(a) % static_cast<int_v>(b);
* }
* \endcode
*
* \param x The vector to add, subtract, multiply, or divide by.
* \returns A vector object of the same type with the components filled according to a
* component-wise application of the operator.
*
* \note If a signed integral vector operation overflows, the result is undefined
* (which is in agreement with the behavior of the fundamental signed integral types
* in C++).
*/
///@{
Vc_ALL_ARITHMETICS(Vc_OP);
///@}
/**
* \name Binary Operations
*
* The binary operations are implemented as component-wise
* application of the operator on the two vector objects.
*
* Example:
* \code
* void foo(int_v a, int_v b) {
* const int_v combined_bits = a | b;
* const int_v masked_bits = a & b;
* a ^= b; // flipped bits
* }
* \endcode
*
* \returns A vector object of the same type with the components filled according to a
* component-wise application of the operator.
*/
///@{
Vc_ALL_BINARY(Vc_OP);
///@}
/**
* \name Shift Operations
*
* The shift operations are implemented as component-wise
* application of the operator on the two vector objects.
*
* Example:
* \code
* void foo(int_v a, int_v b) {
* const int_v right = a >> b;
* a <<= b;
* }
* \endcode
*
* \returns A vector object of the same type with the components filled according to a
* component-wise application of the operator.
*/
///@{
Vc_ALL_SHIFTS(Vc_OP);
///@}
#undef Vc_OP
/**
* \name Comparisons
*
* All comparison operators return a mask object.
*
* Example:
* \code
* void foo(const float_v &a, const float_v &b) {
* const float_m mask = a < b;
* ...
* }
* \endcode
*
* \param x The vector to compare against.
* \returns A mask object. Its components contain the boolean results of the
* component-wise compare operation.
*/
///@{
#define Vc_CMP_OP(symbol) inline Vc_PURE MaskType operator symbol(const Vector &x) const;
Vc_ALL_COMPARES(Vc_CMP_OP);
#undef Vc_CMP_OP
///@}
/**
* Writemask the vector before an assignment.
*
* \param mask The writemask to be used.
*
* \return an object that can be used for any kind of masked assignment.
*
* The returned object is only to be used for assignments and should not be assigned
* to a variable.
*
* Examples:
* \code
* float_v v = float_v::Zero(); // v = [0, 0, 0, 0]
* int_v v2 = int_v::IndexesFromZero(); // v2 = [0, 1, 2, 3]
* v(v2 < 2) = 1.f; // v = [1, 1, 0, 0]
* v(v2 < 3) += 1.f; // v = [2, 2, 1, 0]
* ++v2(v < 1.f); // v2 = [0, 1, 2, 4]
* \endcode
*/
inline Common::WriteMaskedVector<Vector, MaskType> operator()(MaskType mask);
/**
* \name Horizontal Reduction Operations
*
* Horizontal operations can be used to reduce the values of a vector to a scalar
* value.
*
* Example:
* \code
* void foo(const float_v &v) {
* float min = v.min(); // smallest value in v
* float sum = v.sum(); // sum of all values in v
* }
* \endcode
*/
///@{
/// Returns the smallest entry in the vector.
inline EntryType min() const;
/// Returns the largest entry in the vector.
inline EntryType max() const;
/// Returns the product of all entries in the vector.
inline EntryType product() const;
/// Returns the sum of all entries in the vector.
inline EntryType sum() const;
/// Returns a vector containing the sum of all entries with smaller index.
inline Vector partialSum() const;
/// Returns the smallest entry of the vector components selected by \p mask.
inline EntryType min(MaskType mask) const;
/// Returns the largest entry of the vector components selected by \p mask.
inline EntryType max(MaskType mask) const;
/// Returns the product of the vector components selected by \p mask.
inline EntryType product(MaskType mask) const;
/// Returns the sum of the vector components selected by \p mask.
inline EntryType sum(MaskType mask) const;
///@}
/**
* \name Shift and Rotate
*
* These functions allow shifting or rotating the entries in a vector.
*
* All functions with an \p amount parameter support positive and negative numbers for
* the shift/rotate value.
*
* Example:
* \code
* using namespace Vc;
* int_v foo = int_v::IndexesFromZero() + 1; // e.g. [1, 2, 3, 4] with SSE
* int_v x;
* x = foo.shifted( 1); // [2, 3, 4, 0]
* x = foo.shifted( 2); // [3, 4, 0, 0]
* x = foo.shifted( 3); // [4, 0, 0, 0]
* x = foo.shifted( 4); // [0, 0, 0, 0]
* x = foo.shifted(-1); // [0, 1, 2, 3]
* x = foo.shifted(-2); // [0, 0, 1, 2]
* x = foo.shifted(-3); // [0, 0, 0, 1]
* x = foo.shifted(-4); // [0, 0, 0, 0]
*
* x = foo.rotated( 1); // [2, 3, 4, 1]
* x = foo.rotated( 2); // [3, 4, 1, 2]
* x = foo.rotated( 3); // [4, 1, 2, 3]
* x = foo.rotated( 4); // [1, 2, 3, 4]
* x = foo.rotated(-1); // [4, 1, 2, 3]
* x = foo.rotated(-2); // [3, 4, 1, 2]
* x = foo.rotated(-3); // [2, 3, 4, 1]
* x = foo.rotated(-4); // [1, 2, 3, 4]
* \endcode
*
* These functions are slightly related to the above swizzles. In any case, they are
* often useful for communication between SIMD lanes or binary decoding operations.
*
* \warning Use of these functions leads to less portable code. Consider the scalar
* implementation where every vector has only one entry. The shift and rotate
* functions have no useful task to fulfil there and you will almost certainly not get
* any useful results. It is recommended to add a static_assert for the assumed
* minimum vector size.
*/
///@{
/// Shift vector entries to the left by \p amount; shifting in zeros.
inline Vector shifted(int amount) const;
/**
* Shift vector entries to the left by \p amount; shifting in values from shiftIn
* (instead of zeros).
*
* This function can be used to create vectors from unaligned memory locations.
*
* Example:
* \code
* Vc::Memory<int_v, 256> mem;
* for (int i = 0; i < 256; ++i) { mem[i] = i + 1; }
* int_v a = mem.vectorAt(0);
* int_v b = mem.vectorAt(int_v::Size);
* int_v x = a.shifted(1, b);
* // now x == mem.vectorAt(1, Vc::Unaligned)
* \endcode
*
* \param amount The number of entries to shift by. \p amount must be between \c
* -Size and \c Size, otherwise the result is undefined.
* \param shiftIn The vector of values to shift in.
* \return A new vector with values from \p this and \p shiftIn concatenated
* and then shifted by \p amount.
*/
inline Vector shifted(int amount, Vector shiftIn) const;
/// Rotate vector entries to the left by \p amount.
inline Vector rotated(int amount) const;
/// Returns a vector with all components reversed.
inline Vector reversed() const;
///@}
/**
* Return a sorted copy of the vector.
*
* \returns a sorted vector. The returned values are in ascending order:
\verbatim
v[0] <= v[1] <= v[2] <= v[3] ...
\endverbatim
*
* \note If the vector contains NaNs the result is undefined.
*
* Example:
* \code
* int_v v = int_v::Random();
* int_v s = v.sorted();
* std::cout << v << '\n' << s << '\n';
* \endcode
*
* With SSE the output would be:
*
\verbatim
[1513634383, -963914658, 1763536262, -1285037745]
[-1285037745, -963914658, 1513634383, 1763536262]
\endverbatim
*
* With the Scalar implementation:
\verbatim
[1513634383]
[1513634383]
\endverbatim
*/
inline Vector sorted() const;
/*!
* \name Apply/Call/Fill Functions
*
* There are still many situations where the code needs to switch from SIMD operations
* to scalar execution. In this case you can, of course, rely on operator[]. But there
* are also a number of functions that can help with common patterns.
*
* The apply functions expect a function that returns a scalar value, i.e. a function
* of the form "T f(T)". The call functions do not return a value and thus the
* function passed does not need a return value. The fill functions are used to
* serially set the entries of the vector from the return values of a function.
*
* Example:
* \code
* void foo(float_v v) {
* float_v logarithm = v.apply(std::log);
* float_v exponential = v.apply(std::exp);
* }
* \endcode
*
* Of course, you can also use lambdas here:
* \code
* float_v power = v.apply([](float f) { return std::pow(f, 0.6f); });
* \endcode
*
* \param f A functor: this can either be a function or an object that implements
* operator().
*/
///@{
/// Call \p f sequentially with the vector's entries, from the minimum up to the maximum value.
template <typename F> void callWithValuesSorted(F &&f);
/// Call \p f with the scalar entries of the vector.
template <typename F> inline void call(F &&f) const;
/// As above, but skip the entries where \p mask is not set.
template <typename F> inline void call(F &&f, MaskType mask) const;
/// Call \p f on every entry of the vector and return the results as a new vector.
template <typename F> inline Vector apply(F &&f) const;
/// As above, but skip the entries where \p mask is not set.
template <typename F> inline Vector apply(F &&f, MaskType mask) const;
/// Fill the vector with the values [f(0), f(1), f(2), ...].
template <typename IndexT> inline void fill(EntryType(&f)(IndexT));
/// Fill the vector with the values [f(), f(), f(), ...].
inline void fill(EntryType(&f)());
///@}
/**\internal
* Interleaves this vector and \p x and returns the resulting low vector.
* Used to implement Vc::interleave.
*/
inline Vector interleaveLow(Vector x) const;
/**\internal
* Interleaves this vector and \p x and returns the resulting high vector.
* Used to implement Vc::interleave.
*/
inline Vector interleaveHigh(Vector x) const;
/**\internal
* Assigns the components of \p v where \p m is \c true.
*/
inline void assign(const Vector &v, const MaskType &m);
/**
* \internal
* \name Internal Data Access
* Returns a (const) reference to the internal data member storing the vector data.
*/
///@{
inline VectorType &data();
inline const VectorType &data() const;
///@}
/// \name Deprecated Members
///@{
/**
* Returns the exponents of the floating-point values in the vector.
*
* \return A new vector object of the same type containing the exponents.
*
* \deprecated use Vc::exponent instead.
*/
Vc_DEPRECATED("use exponent(x) instead") inline Vector exponent() const;
/**
* Returns whether a value is negative.
*
* \return A new mask object indicating the sign of each vector element.
*
* \deprecated use Vc::isnegative instead.
*/
Vc_DEPRECATED("use isnegative(x) instead") inline MaskType isNegative() const;
///\copydoc size
///\deprecated Use Vc::Vector::size instead.
static constexpr size_t Size = VectorTraits<T, Abi>::size();
/**
* Casts the current object to \p V2.
*
* \returns a converted object of type \p V2.
*
* \deprecated Use Vc::simd_cast instead.
*/
template <typename V2> inline V2 staticCast() const;
/**
* reinterpret_cast the vector components to construct a vector of type \p V2.
*
* \returns An object of type \p V2 with the same bit-representation.
*
* \deprecated use Vc::reinterpret_components_cast instead.
*/
template <typename V2>
Vc_DEPRECATED("use reinterpret_components_cast instead") inline V2
reinterpretCast() const;
/**
* Copies the signs of the components of \p reference to the components of the current
* vector, returning the result.
*
* \param reference A vector object that determines the sign of the result.
* \returns A new vector with sign taken from \p reference and absolute value taken
* from the current vector object.
*
* \deprecated Use Vc::copysign instead.
*/
Vc_DEPRECATED("use copysign(x, y) instead") inline Vector
copySign(Vector reference) const;
///@}
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Vector));
private:
VectorType d;
};
/**
* \ingroup Utilities
* Constructs a new Vector object of type \p V from the Vector \p x, reinterpreting the
* bits of \p x for the new type \p V.
*
* This function is only applicable if:
* - the \c sizeof of the input and output types is equal
* - the Vector::size() of the input and output types is equal
* - the \c VectorEntryTypes of input and output have equal \c sizeof
*
* \tparam V The requested type to change \p x into.
* \param x The Vector to reinterpret as an object of type \p V.
* \returns A new object (rvalue) of type \p V.
*
* \warning This cast is non-portable since the applicability (see above) may change
* depending on the default vector types of the target platform. The function is perfectly
* safe to use with fully specified \p Abi, though.
*/
template <typename V, typename T, typename Abi>
Vc_ALWAYS_INLINE Vc_CONST enable_if<
(V::size() == Vector<T, Abi>::size() &&
sizeof(typename V::VectorEntryType) ==
sizeof(typename Vector<T, Abi>::VectorEntryType) &&
sizeof(V) == sizeof(Vector<T, Abi>) && alignof(V) <= alignof(Vector<T, Abi>)),
V>
reinterpret_components_cast(const Vector<T, Abi> &x)
{
return reinterpret_cast<const V &>(x);
}
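/* A minimal usage sketch (illustration only; assumes Vc::uint_v and Vc::float_v have the
 * same number of components and the same sizeof on the target, which holds for the usual
 * ABIs since both entry types are 32 bits wide):
 * \code
 * Vc::float_v x(1.f);
 * auto bits = Vc::reinterpret_components_cast<Vc::uint_v>(x);  // 0x3f800000 in every entry
 * \endcode
 */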
#define Vc_OP(symbol) \
template <typename T, typename Abi> \
inline Vector<T, Abi> &operator symbol##=(Vector<T, Abi> &, \
const Vector<T, Abi> &x);
//Vc_ALL_ARITHMETICS(Vc_OP);
//Vc_ALL_BINARY(Vc_OP);
//Vc_ALL_SHIFTS(Vc_OP);
#undef Vc_OP
} // namespace Vc
#endif // VC_COMMON_VECTOR_H_
// vim: foldmethod=marker

Some files were not shown because too many files have changed in this diff.