Compare commits

...

No commits in common. "pristine-tar" and "openkylin/yangtze" have entirely different histories.

313 changed files with 92292 additions and 1 deletion

.clang-format (new file, 130 lines)
@@ -0,0 +1,130 @@
BasedOnStyle: Google
# The extra indent or outdent of access modifiers, e.g. public:.
AccessModifierOffset: -4
# If true, aligns escaped newlines as far left as possible. Otherwise puts them into the right-most column.
AlignEscapedNewlinesLeft: false
# If true, aligns trailing comments.
AlignTrailingComments: true
# Allow putting all parameters of a function declaration onto the next line even if BinPackParameters is false.
AllowAllParametersOfDeclarationOnNextLine: false
# If true, if (a) return; can be put on a single line.
AllowShortIfStatementsOnASingleLine: false
# If true, while (true) continue; can be put on a single line.
AllowShortLoopsOnASingleLine: false
AllowShortFunctionsOnASingleLine: true
# If true, always break before multiline string literals.
AlwaysBreakBeforeMultilineStrings: false
# If true, always break after the template<...> of a template declaration.
AlwaysBreakTemplateDeclarations: false
# If false, a function call's or function definition's parameters will either all be on the same line or will have one line each.
BinPackParameters: true
# If true, binary operators will be placed after line breaks.
BreakBeforeBinaryOperators: false
# The brace breaking style to use.
# Possible values:
# BS_Attach (in configuration: Attach) Always attach braces to surrounding context.
# BS_Linux (in configuration: Linux) Like Attach, but break before braces on function, namespace and class definitions.
# BS_Stroustrup (in configuration: Stroustrup) Like Attach, but break before function definitions.
# BS_Allman (in configuration: Allman) Always break before braces.
BreakBeforeBraces: Linux
# Always break constructor initializers before commas and align the commas with the colon.
BreakConstructorInitializersBeforeComma: true
# The column limit.
# A column limit of 0 means that there is no column limit. In this case, clang-format will respect the input's line breaking decisions within statements.
ColumnLimit: 90
# If the constructor initializers don't fit on a line, put each initializer on its own line.
#ConstructorInitializerAllOnOneLineOrOnePerLine (bool)
# The number of characters to use for indentation of constructor initializer lists.
#ConstructorInitializerIndentWidth (unsigned)
# If true, format braced lists as best suited for C++11 braced lists.
# Important differences: - No spaces inside the braced list. - No line break before the closing brace. - Indentation with the continuation indent, not with the block indent.
# Fundamentally, C++11 braced lists are formatted exactly like function calls would be formatted in their place. If the braced list follows a name (e.g. a type or variable name), clang-format formats as if the {} were the parentheses of a function call with that name. If there is no name, a zero-length name is assumed.
Cpp11BracedListStyle: true
# If true, analyze the formatted file for the most common binding.
#DerivePointerBinding (bool)
# If true, clang-format detects whether function calls and definitions are formatted with one parameter per line.
# Each call can be bin-packed, one-per-line or inconclusive. If it is inconclusive, e.g. completely on one line, but a decision needs to be made, clang-format analyzes whether there are other bin-packed cases in the input file and acts accordingly.
# NOTE: This is an experimental flag, that might go away or be renamed. Do not use this in config files, etc. Use at your own risk.
#ExperimentalAutoDetectBinPacking (bool)
# Indent case labels one level from the switch statement.
# When false, use the same indentation level as for the switch statement. Switch statement body is always indented one level more than case labels.
IndentCaseLabels: false
# If true, indent when breaking function declarations which are not also definitions after the type.
#IndentFunctionDeclarationAfterType (bool)
# The number of characters to use for indentation.
IndentWidth: 4
# The maximum number of consecutive empty lines to keep.
MaxEmptyLinesToKeep: 1
# The indentation used for namespaces.
# Possible values:
# NI_None (in configuration: None) Don't indent in namespaces.
# NI_Inner (in configuration: Inner) Indent only in inner namespaces (nested in other namespaces).
# NI_All (in configuration: All) Indent in all namespaces.
NamespaceIndentation: None
# Add a space in front of an Objective-C protocol list, i.e. use Foo <Protocol> instead of Foo<Protocol>.
#ObjCSpaceBeforeProtocolList (bool)
# The penalty for each line break introduced inside a comment.
#PenaltyBreakComment (unsigned)
# The penalty for breaking before the first <<.
#PenaltyBreakFirstLessLess (unsigned)
# The penalty for each line break introduced inside a string literal.
#PenaltyBreakString (unsigned)
# The penalty for each character outside of the column limit.
#PenaltyExcessCharacter (unsigned)
# Penalty for putting the return type of a function onto its own line.
#PenaltyReturnTypeOnItsOwnLine (unsigned)
# Set whether & and * bind to the type as opposed to the variable.
#PointerBindsToType: false
# If true, spaces will be inserted between for/if/while/... and (.
#SpaceAfterControlStatementKeyword: true
# If false, spaces will be removed before =, +=, etc.
#SpaceBeforeAssignmentOperators: true
# If false, spaces may be inserted into ().
#SpaceInEmptyParentheses: false
# The number of spaces before trailing line comments.
#SpacesBeforeTrailingComments (unsigned)
# If false, spaces may be inserted into C style casts.
#SpacesInCStyleCastParentheses (bool)
# If true, spaces will be inserted after every ( and before every ).
SpacesInParentheses: false
# Format compatible with this standard, e.g. use A<A<int> > instead of A<A<int>> for LS_Cpp03.
# Possible values:
# LS_Cpp03 (in configuration: Cpp03) Use C++03-compatible syntax.
# LS_Cpp11 (in configuration: Cpp11) Use features of C++11 (e.g. A<A<int>> instead of A<A<int> >).
# LS_Auto (in configuration: Auto) Automatic detection based on the input.
Standard: Cpp11
# If true, IndentWidth consecutive spaces will be replaced with tab characters.
UseTab: false
# vim: ft=yaml

.github/CONTRIBUTING.md (new vendored file, 77 lines)
@@ -0,0 +1,77 @@
## Copyright and License
Vc is licensed with the [3-clause BSD license](http://opensource.org/licenses/BSD-3-Clause).
Your contributions to Vc must be released under the same license. You must add
your copyright information to the files you modified/added.
## Code Formatting & Style
The recommended way is to format the code according to `clang-format` using the
`.clang-format` file in the repository.
In addition to the `clang-format` style, `if`, `else`, `for`, `while`, and `do`
*must* use braces.
If, for some reason, you cannot use `clang-format`, here's a quick overview of
the style rules:
* Constrain the code to no more than 90 characters per line.
* Use four spaces for indent. No tabs.
* Opening braces attach to the preceding expression, except for functions,
namespaces, and classes/structs/unions/enums.
* Namespaces introduce no additional indent
* `case` labels are aligned with the `switch` statement
* No more than one empty line.
* No spaces in parentheses, but spaces between keywords and opening paren, i.e.
`if (foo) { bar(); }`
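For illustration only (this snippet is not part of the repository), code following these rules looks roughly like this:
```cpp
namespace Example  // namespaces introduce no additional indent
{
class AngleHelper  // braces of classes, functions, and namespaces go on their own line
{
public:
    float wrapAngle(float angle) const
    {
        while (angle > 360.0f) {  // braces attach to control statements and are mandatory
            angle -= 360.0f;
        }
        switch (static_cast<int>(angle)) {
        case 0:  // case labels are aligned with the switch
            return 0.0f;
        default:
            return angle;
        }
    }
};
}  // namespace Example
```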
### Naming Rules
* Naming is very important. Take time to choose a name that clearly explains the
intended functionality & usage of the entity.
* Type names typically use `CamelCase`. No underscores.
* Function and variable names use `camelCase`. No underscores.
* Acronyms that appear in camel case names must use lowercase letters for all
characters after the first character. (e.g. `SimdArray`, `simdFunction`)
* Traits use `lower_case_with_underscores`.
* Macros are prefixed with `Vc_` and use `Vc_ALL_CAPITALS_WITH_UNDERSCORES`.
Macro arguments use a single underscore suffix.
Include guards are prefixed with `VC_` instead.
* File names use `alllowercasewithoutunderscores`. Basically, it is the type name
declared/defined in the file with all letters in lower case.
* There are exceptions and inconsistencies in the code. Don't bother.
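A hypothetical snippet (all names invented for illustration) applying these naming rules:
```cpp
#include <type_traits>

// Type names use CamelCase; functions and variables use camelCase.
class DataBuffer
{
public:
    int entryCount() const { return bufferSize; }

private:
    int bufferSize = 0;
};

// Traits use lower_case_with_underscores.
template <typename T> struct is_data_buffer : public std::is_same<T, DataBuffer> {
};

// Macros use the Vc_ prefix; macro arguments get a single underscore suffix.
#define Vc_IS_BUFFER(Type_) is_data_buffer<Type_>::value
```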
### Design Guidelines
* *Avoid out parameters.* Use the return value instead. Use `std::tuple` if you
need to return multiple values.
* *Look for alternatives to in-out parameters.* An obvious exception (and thus
design alternative) is the implicit `this` parameter to non-static member
functions.
* Consequently, *pass function parameters by const-ref or by value.*
Use const-ref for types that (potentially) require more than two CPU
registers. (Consider fundamental types and the fundamental `Vector<T>` types
to require one register, each.)
By value otherwise.
* *Ensure const-correctness.* Member functions use the `const` qualifier if they
do not modify observable state. Use `mutable` members for unobservable state.
* *Avoid macros.* Possible alternatives are constexpr variables and template
code.
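A minimal sketch of these guidelines in code (the function and class names are made up for illustration; `Vc::float_v`, `Vc::min`, and `Vc::max` are part of Vc):
```cpp
#include <tuple>
#include <Vc/Vc>

// Return multiple values via std::tuple instead of using out parameters.
std::tuple<Vc::float_v, Vc::float_v> minAndMax(Vc::float_v a, Vc::float_v b)
{
    return std::make_tuple(Vc::min(a, b), Vc::max(a, b));
}

// Pass cheap types (fundamental types, Vector<T>) by value; mark member functions
// that do not modify observable state as const.
class Accumulator
{
public:
    void add(Vc::float_v x) { total += x; }
    Vc::float_v sum() const { return total; }

private:
    Vc::float_v total = Vc::float_v::Zero();
};

// Prefer constexpr variables (or templates) over macros.
constexpr int prefetchDistance = 512;
```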
## Git History
Git history should be flat, if feasible. Feel free to use merges on your private
branch. However, once you submit a pull request, the history should apply
cleanly on top of master. Use `git rebase [-i]` to straighten the history.
Use different branches for different issues.
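For example (illustrative commands, assuming the remote is called `origin`):
```sh
git fetch origin
git rebase -i origin/master    # flatten and clean up the branch history
git push --force-with-lease    # update your pull-request branch
```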
## Git Commit Logs
1. Write meaningful summaries and strive to use no more than 50 characters
1. Use imperative mood in the subject line (and possibly in bullet points in the
summary)
1. Wrap the body at 72 characters
1. Use the body to explain *what* and *why* (normally it is irrelevant *how* you
did it)
See also [Chris Beams' article](http://chris.beams.io/posts/git-commit/).
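An invented example message that follows these rules:
```
Add missing braces around single-statement ifs

The coding guidelines require braces for all if/else/for/while/do
bodies. Add them where they were missing so that clang-format output
stays stable and reviews stay consistent.
```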

.github/ISSUE_TEMPLATE.md (new vendored file, 18 lines)
@@ -0,0 +1,18 @@
<!--
Vc is now in maintenance mode and no longer actively developed.
However, we continue to review pull requests with bugfixes from the community.
If your issue is trivial to fix, we might be able to address it.
Otherwise, please provide a pull request in addition to your issue.
-->
Vc version / revision | Operating System | Compiler & Version | Compiler Flags | Assembler & Version | CPU
----------------------|------------------|--------------------|----------------|---------------------|----
| | | | |
## Testcase
```cpp
```
## Actual Results
## Expected Results

.github/workflows/ci.yaml (new vendored file, 85 lines)
@@ -0,0 +1,85 @@
name: CI
on:
push:
pull_request:
schedule:
- cron: '0 3 * * *'
jobs:
#clang-format:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v2
# - uses: DoozyX/clang-format-lint-action@v0.12
# with:
# exclude: './thirdparty'
# clangFormatVersion: 12
build-ubuntu:
runs-on: ubuntu-latest
env:
dashboard_model: Experimental
build_type: ${{ matrix.build_type }}
NUMBER_OF_PROCESSORS: 2
CXX: ${{ matrix.cxx }}
strategy:
fail-fast: false
matrix:
build_type: [Debug, Release]
cxx: [g++-9, g++-10, g++-11, clang++-10, clang++-11, clang++-12, icpc]
include:
- cxx: g++-11
INSTALL_EXTRA: g++-11
- cxx: clang++-11
INSTALL_EXTRA: clang-11
- cxx: clang++-12
INSTALL_EXTRA: clang-12
- cxx: icpc
INSTALL_ONEAPI: true
exclude:
# icpc in debug mode runs out of memory in CI
- cxx: icpc
build_type: Debug
steps:
- uses: actions/checkout@v2
with:
submodules: true
- name: install OneAPI
if: ${{ matrix.INSTALL_ONEAPI }}
run: |
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic
- name: install extras
if: ${{ matrix.INSTALL_EXTRA }}
run: |
sudo apt update
sudo apt install ${{ matrix.INSTALL_EXTRA }}
- name: ctest
run: |
if [ ${{ matrix.INSTALL_ONEAPI }} ]
then
source /opt/intel/oneapi/setvars.sh
export LC_ALL=en_US.utf8
fi
$CXX --version
ctest -VV -S test.cmake
build-windows:
runs-on: ${{ matrix.os }}
env:
build_type: ${{ matrix.build_type }}
strategy:
fail-fast: false
matrix:
build_type: [Debug, Release]
os: [windows-2019]
steps:
- uses: actions/checkout@v2
with:
submodules: true
- uses: egor-tensin/vs-shell@v2
- name: ctest
run: |
ctest -VV -S test.cmake

.gitignore (new vendored file, 11 lines)
@@ -0,0 +1,11 @@
doc/html
doc/latex
doc/man
vc-benchmarks
*.swp
*~
.makeApidox.stamp
.makeApidox.stamp.new
build-*
.vs
out

.gitmodules (new vendored file, 6 lines)
@@ -0,0 +1,6 @@
[submodule "tests/testdata"]
path = tests/testdata
url = https://github.com/VcDevel/vc-testdata
[submodule "tests/virtest"]
path = tests/virtest
url = https://github.com/mattkretz/virtest

CMakeLists.txt (new file, 275 lines)
@@ -0,0 +1,275 @@
cmake_minimum_required(VERSION 3.0)
cmake_policy(SET CMP0028 NEW) # Double colon in target name means ALIAS or IMPORTED target.
cmake_policy(SET CMP0048 NEW) # The ``project()`` command manages VERSION variables.
if(PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
message(FATAL_ERROR "You don't want to configure in the source directory!")
endif()
if(NOT DEFINED CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING
"Choose the type of build, options are: None Debug Release RelWithDebug RelWithDebInfo MinSizeRel."
FORCE)
endif()
# read version parts from version.h
file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/Vc/version.h _version_lines REGEX "^#define Vc_VERSION_STRING ")
string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" _version_matches "${_version_lines}")
project(Vc VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}" LANGUAGES C CXX)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
set(disabled_targets)
include (VcMacros)
include (AddTargetProperty)
include (OptimizeForArchitecture)
vc_determine_compiler()
if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(i686|x86|AMD64|amd64)")
set(Vc_X86 TRUE)
elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(arm|aarch32|aarch64)")
message(WARNING "No optimized implementation of the Vc types available for ${CMAKE_SYSTEM_PROCESSOR}")
set(Vc_ARM TRUE)
else()
message(WARNING "No optimized implementation of the Vc types available for ${CMAKE_SYSTEM_PROCESSOR}")
endif()
option(USE_CCACHE "If enabled, ccache will be used (if it exists on the system) to speed up recompiles." OFF)
if(USE_CCACHE)
find_program(CCACHE_COMMAND ccache)
if(CCACHE_COMMAND)
mark_as_advanced(CCACHE_COMMAND)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_COMMAND}")
endif()
endif()
if(NOT Vc_COMPILER_IS_MSVC)
AddCompilerFlag("-std=c++14" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
if(NOT _ok)
AddCompilerFlag("-std=c++1y" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
if(NOT _ok)
AddCompilerFlag("-std=c++11" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
if(NOT _ok)
AddCompilerFlag("-std=c++0x" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
if(NOT _ok)
message(FATAL_ERROR "Vc 1.x requires C++11, better even C++14. It seems this is not available. If this was incorrectly determined please notify vc-devel@compeng.uni-frankfurt.de")
endif()
endif()
endif()
endif()
elseif(MSVC_VERSION LESS 1920)
message(FATAL_ERROR "Vc 1.x requires at least Visual Studio 2019.")
AddCompilerFlag("/std:c++14" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
endif()
if(MSVC AND (NOT DEFINED Vc_USE_MSVC_SSA_OPTIMIZER_DESPITE_BUGGY_EXP OR NOT Vc_USE_MSVC_SSA_OPTIMIZER_DESPITE_BUGGY_EXP))
# bug report: https://developercommunity.visualstudio.com/t/AVX-codegen-bug-on-Vc-with-MSVC-2019/1470844#T-N1521672
message(STATUS "WARNING! MSVC starting with 19.20 uses a new optimizer that has a bug causing Vc::exp() to return slighly wrong results.\
You can set Vc_USE_MSVC_SSA_OPTIMIZER_DESPITE_BUGGY_EXP=ON to still use the new optimizer on the affected MSVC versions.")
AddCompilerFlag("/d2SSAOptimizer-" CXX_RESULT _ok CXX_FLAGS CMAKE_CXX_FLAGS)
endif()
if(Vc_COMPILER_IS_GCC)
if(Vc_GCC_VERSION VERSION_GREATER "5.0.0" AND Vc_GCC_VERSION VERSION_LESS "6.0.0")
UserWarning("GCC 5 goes into an endless loop comiling example_scaling_scalar. Therefore, this target is disabled.")
list(APPEND disabled_targets
example_scaling_scalar
)
endif()
elseif(Vc_COMPILER_IS_MSVC)
# Disable warning "C++ exception specification ignored except to indicate a function is not __declspec(nothrow)"
# MSVC emits the warning for the _UnitTest_Compare destructor which needs the throw declaration so that it doesn't std::terminate
AddCompilerFlag("/wd4290")
endif()
vc_set_preferred_compiler_flags(WARNING_FLAGS BUILDTYPE_FLAGS)
add_definitions(${Vc_DEFINITIONS})
add_compile_options(${Vc_COMPILE_FLAGS})
if(Vc_COMPILER_IS_INTEL)
# per default icc is not IEEE compliant, but we need that for verification
AddCompilerFlag("-fp-model source")
endif()
if(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]")
message(STATUS "WARNING! It seems you are compiling without optimization. Please set CMAKE_BUILD_TYPE.")
endif(CMAKE_BUILD_TYPE STREQUAL "" AND NOT CMAKE_CXX_FLAGS MATCHES "-O[123]")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # ${CMAKE_CURRENT_SOURCE_DIR}/include)
add_custom_target(other VERBATIM)
add_custom_target(Scalar COMMENT "build Scalar code" VERBATIM)
add_custom_target(SSE COMMENT "build SSE code" VERBATIM)
add_custom_target(AVX COMMENT "build AVX code" VERBATIM)
add_custom_target(AVX2 COMMENT "build AVX2 code" VERBATIM)
AddCompilerFlag(-ftemplate-depth=128 CXX_FLAGS CMAKE_CXX_FLAGS)
set(libvc_compile_flags "-DVc_COMPILE_LIB")
AddCompilerFlag("-fPIC" CXX_FLAGS libvc_compile_flags)
# -fstack-protector is the default of GCC, but at least Ubuntu changes the default to -fstack-protector-strong, which is crazy
AddCompilerFlag("-fstack-protector" CXX_FLAGS libvc_compile_flags)
set(_srcs src/const.cpp)
if(Vc_X86)
list(APPEND _srcs src/cpuid.cpp src/support_x86.cpp)
vc_compile_for_all_implementations(_srcs src/trigonometric.cpp ONLY SSE2 SSE3 SSSE3 SSE4_1 AVX AVX+FMA AVX2+FMA+BMI2)
if(NOT Vc_XOP_INTRINSICS_BROKEN)
vc_compile_for_all_implementations(_srcs src/trigonometric.cpp ONLY AVX+XOP+FMA)
if(NOT Vc_FMA4_INTRINSICS_BROKEN)
vc_compile_for_all_implementations(_srcs src/trigonometric.cpp ONLY SSE+XOP+FMA4 AVX+XOP+FMA4)
endif()
endif()
vc_compile_for_all_implementations(_srcs src/sse_sorthelper.cpp ONLY SSE2 SSE4_1 AVX AVX2+FMA+BMI2)
vc_compile_for_all_implementations(_srcs src/avx_sorthelper.cpp ONLY AVX AVX2+FMA+BMI2)
elseif(Vc_ARM)
list(APPEND _srcs src/support_dummy.cpp)
else()
list(APPEND _srcs src/support_dummy.cpp)
endif()
add_library(Vc STATIC ${_srcs})
set_property(TARGET Vc APPEND PROPERTY COMPILE_OPTIONS ${libvc_compile_flags})
add_target_property(Vc LABELS "other")
if(XCODE)
# TODO: document what this does and why it has no counterpart in the non-XCODE logic
set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_GCC_INLINES_ARE_PRIVATE_EXTERN "NO")
set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "YES")
set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++0x")
set_target_properties(Vc PROPERTIES XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++")
elseif(UNIX AND Vc_COMPILER_IS_CLANG)
# On UNIX (Linux) the standard library used by default typically is libstdc++ (GCC).
# To get the full clang experience we would rather build against libc++. This additionally
# requires the libc++abi and libsupc++ libraries in all linker invocations.
option(USE_LIBC++ "Use libc++ instead of the system default C++ standard library." OFF)
if(USE_LIBC++)
AddCompilerFlag(-stdlib=libc++ CXX_FLAGS CMAKE_CXX_FLAGS CXX_RESULT _use_libcxx)
if(_use_libcxx)
find_library(LIBC++ABI c++abi)
mark_as_advanced(LIBC++ABI)
if(LIBC++ABI)
set(CMAKE_REQUIRED_LIBRARIES "${LIBC++ABI};supc++")
CHECK_CXX_SOURCE_COMPILES("#include <stdexcept>
#include <iostream>
void foo() {
std::cout << 'h' << std::flush << std::endl;
throw std::exception();
}
int main() {
try { foo(); }
catch (int) { return 0; }
return 1;
}" libcxx_compiles)
unset(CMAKE_REQUIRED_LIBRARIES)
if(libcxx_compiles)
link_libraries(${LIBC++ABI} supc++)
endif()
endif()
endif()
else()
CHECK_CXX_SOURCE_COMPILES("#include <tuple>
std::tuple<int> f() { std::tuple<int> r; return r; }
int main() { return 0; }
" tuple_sanity)
if (NOT tuple_sanity)
message(FATAL_ERROR "Clang and std::tuple brokenness detected. Please update your compiler.")
endif()
endif()
endif()
add_dependencies(other Vc)
target_include_directories(Vc
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>
$<INSTALL_INTERFACE:include>
)
option(Vc_ENABLE_INSTALL "Whether to install the library." ON)
if (Vc_ENABLE_INSTALL)
install(TARGETS Vc EXPORT VcTargets DESTINATION lib${LIB_SUFFIX})
install(DIRECTORY Vc/ DESTINATION include/Vc FILES_MATCHING REGEX "/*.(h|tcc|def)$")
install(FILES
Vc/Allocator
Vc/IO
Vc/Memory
Vc/SimdArray
Vc/Utils
Vc/Vc
Vc/algorithm
Vc/array
Vc/iterators
Vc/limits
Vc/simdize
Vc/span
Vc/type_traits
Vc/vector
DESTINATION include/Vc)
# Generate and install CMake package and modules
include(CMakePackageConfigHelpers)
set(PACKAGE_INSTALL_DESTINATION
lib${LIB_SUFFIX}/cmake/${PROJECT_NAME}
)
install(EXPORT ${PROJECT_NAME}Targets
NAMESPACE ${PROJECT_NAME}::
DESTINATION ${PACKAGE_INSTALL_DESTINATION}
EXPORT_LINK_INTERFACE_LIBRARIES
)
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}ConfigVersion.cmake
VERSION ${PROJECT_VERSION}
COMPATIBILITY AnyNewerVersion
)
configure_package_config_file(
${PROJECT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}Config.cmake
INSTALL_DESTINATION ${PACKAGE_INSTALL_DESTINATION}
PATH_VARS CMAKE_INSTALL_PREFIX
)
install(FILES
cmake/UserWarning.cmake
cmake/VcMacros.cmake
cmake/AddCompilerFlag.cmake
cmake/CheckCCompilerFlag.cmake
cmake/CheckCXXCompilerFlag.cmake
cmake/OptimizeForArchitecture.cmake
cmake/FindVc.cmake
${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}Config.cmake
${CMAKE_CURRENT_BINARY_DIR}/cmake/${PROJECT_NAME}ConfigVersion.cmake
DESTINATION ${PACKAGE_INSTALL_DESTINATION}
)
endif()
option(BUILD_TESTING "Build the testing tree." OFF)
include (CTest)
configure_file(${PROJECT_SOURCE_DIR}/CTestCustom.cmake ${PROJECT_BINARY_DIR}/CTestCustom.cmake COPYONLY)
if(BUILD_TESTING)
add_custom_target(build_tests ALL VERBATIM)
add_subdirectory(tests)
endif()
set(BUILD_EXAMPLES FALSE CACHE BOOL "Build examples.")
if(BUILD_EXAMPLES)
add_subdirectory(examples)
endif(BUILD_EXAMPLES)
# Hide Vc_IMPL as it is only meant for users of Vc
mark_as_advanced(Vc_IMPL)
find_program(BIN_CAT cat)
mark_as_advanced(BIN_CAT)
if(BIN_CAT)
file(REMOVE ${PROJECT_BINARY_DIR}/help.txt)
add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/help.txt
COMMAND ${CMAKE_MAKE_PROGRAM} help > ${PROJECT_BINARY_DIR}/help.txt
VERBATIM
)
add_custom_target(cached_help
${BIN_CAT} ${PROJECT_BINARY_DIR}/help.txt
DEPENDS ${PROJECT_BINARY_DIR}/help.txt
VERBATIM
)
endif()

CTestConfig.cmake (new file, 15 lines)
@@ -0,0 +1,15 @@
set(CTEST_PROJECT_NAME "Vc")
set(CTEST_NIGHTLY_START_TIME "00:00:00 CEST")
set(CTEST_DROP_METHOD "http")
set(CTEST_DROP_SITE "cdash.cern.ch")
set(CTEST_DROP_LOCATION "/submit.php?project=Vc")
set(CTEST_DROP_SITE_CDASH TRUE)
set(CTEST_UPDATE_TYPE "git")
find_program(GITCOMMAND git)
set(CTEST_UPDATE_COMMAND "${GITCOMMAND}")
mark_as_advanced(GITCOMMAND)

CTestCustom.cmake (new file, 21 lines)
@@ -0,0 +1,21 @@
set(CTEST_CUSTOM_WARNING_EXCEPTION ${CTEST_CUSTOM_WARNING_EXCEPTION}
" C4723: " # MSVC 2012 can't suppress this warning
" C4756: " # MSVC 2012 can't suppress this warning
"used uninitialized in this function"
"Skipping compilation of tests gatherStruct and gather2dim because of clang bug" # Not a helpful warning for the dashboard
"warning is a GCC extension"
"^-- " # Ignore output from cmake
"AVX disabled per default because of old/broken compiler" # This warning is meant for users not the dashboard
"WARNING non-zero return value in ctest from: make" # Ignore output from ctest
"ipo: warning #11010:" # Ignore warning about incompatible libraries with ICC -m32 on 64-bit system
"include/qt4" # -Wuninitialized in QWeakPointer(X *ptr)
" note: " # Notes are additional lines from errors (or warnings) that we don't want to count as additional warnings
"clang: warning: argument unused during compilation: '-stdlib=libc"
"clang 3.6.x miscompiles AVX code" # a preprocessor warning for users of Vc, irrelevant for the dashboard
)
set(CTEST_CUSTOM_ERROR_EXCEPTION ${CTEST_CUSTOM_ERROR_EXCEPTION}
"^ICECC"
"^make\\[[1-9]\\]: "
"^collect2: ld returned . exit status"
"^make: \\*\\*\\* \\[.*\\] Error ")

INSTALL (new file, 1 line)
@@ -0,0 +1 @@
See README.md.

LICENSE (new file, 23 lines)
@@ -0,0 +1,23 @@
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Makefile (new file, 27 lines)
@@ -0,0 +1,27 @@
CXX ?= c++
build_dir := $(shell which $(CXX))
tmp := "case $$(readlink -f $(build_dir)) in *icecc) which $${ICECC_CXX:-g++};; *) echo $(build_dir);; esac"
build_dir := $(shell sh -c $(tmp))
build_dir := $(realpath $(build_dir))
build_dir := build-$(subst /,-,$(build_dir:/%=%)$(CXXFLAGS))
all:
%:: $(build_dir)/CMakeCache.txt
$(MAKE) --no-print-directory -C "$(build_dir)" $(MAKECMDGOALS)
$(build_dir)/CMakeCache.txt:
@test -n "$(build_dir)"
@mkdir -p "$(build_dir)"
@test -e "$(build_dir)/CMakeCache.txt" || cmake -H. -B"$(build_dir)"
print_build_dir:
@echo "$(PWD)/$(build_dir)"
clean_builddir:
rm -rf "$(build_dir)"
# the following rule works around %:: grabbing the Makefile rule and thus stops it from running every time
Makefile:
@true
.PHONY: print_build_dir clean_builddir

README.md (new file, 194 lines)
@@ -0,0 +1,194 @@
**Vc is now in maintenance mode and no longer actively developed.
However, we continue to review pull requests with bugfixes from the community.**
**You may be interested in switching to [std-simd](https://github.com/VcDevel/std-simd).**
GCC 11 includes an experimental version of `std::simd` as part of libstdc++, which also works with clang.
Features present in Vc 1.4 and not present in *std-simd* will eventually turn into Vc 2.0, which then depends on *std-simd*.
# Vc: portable, zero-overhead C++ types for explicitly data-parallel programming
Recent generations of CPUs, and GPUs in particular, require data-parallel codes
for full efficiency. Data parallelism requires that the same sequence of
operations is applied to different input data. CPUs and GPUs can thus reduce
the necessary hardware for instruction decoding and scheduling in favor of more
arithmetic and logic units, which execute the same instructions synchronously.
On CPU architectures this is implemented via SIMD registers and instructions.
A single SIMD register can store N values and a single SIMD instruction can
execute N operations on those values. On GPU architectures N threads run in
perfect sync, fed by a single instruction decoder/scheduler. Each thread has
local memory and a given index to calculate the offsets in memory for loads and
stores.
Current C++ compilers can do automatic transformation of scalar codes to SIMD
instructions (auto-vectorization). However, the compiler must reconstruct an
intrinsic property of the algorithm that was lost when the developer wrote a
purely scalar implementation in C++. Consequently, C++ compilers cannot
vectorize any given code to its most efficient data-parallel variant.
Especially larger data-parallel loops, spanning over multiple functions or even
translation units, will often not be transformed into efficient SIMD code.
The Vc library provides the missing link. Its types enable explicitly stating
data-parallel operations on multiple values. The parallelism is therefore added
via the type system. Competing approaches state the parallelism via new control
structures and consequently new semantics inside the body of these control
structures.
Vc is a free software library to ease explicit vectorization of C++ code. It
has an intuitive API and provides portability between different compilers and
compiler versions as well as portability between different vector instruction
sets. Thus an application written with Vc can be compiled for:
* AVX and AVX2
* SSE2 up to SSE4.2 or SSE4a
* Scalar
* ~~AVX-512 (Vc 2 development)~~
* ~~NEON (in development)~~
* ~~NVIDIA GPUs / CUDA (research)~~
After Intel dropped MIC support with ICC 18, Vc 1.4 also removed support for it.
## Examples
### Usage on Compiler Explorer
* [Simdize Example](https://godbolt.org/z/JVEM2j)
* [Total momentum and time stepping of `std::vector<Particle>`](https://godbolt.org/z/JNdkL9)
* [Matrix Example](https://godbolt.org/z/fFEkuX): This uses vertical
vectorization which does not scale to different vector sizes. However, the
example is instructive to compare it with similar solutions of other languages
or libraries.
* [N-vortex solver](https://godbolt.org/z/4o1cg_) showing `simdize`d iteration
over many `std::vector<float>`. Note how [important the `-march` flag is, compared
to plain `-mavx2 -mfma`](https://godbolt.org/z/hKiOjr).
### Scalar Product
Let's start from the code for calculating a 3D scalar product using builtin floats:
```cpp
using Vec3D = std::array<float, 3>;
float scalar_product(Vec3D a, Vec3D b) {
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}
```
Using Vc, we can easily vectorize the code using the `float_v` type:
```cpp
using Vc::float_v;
using Vec3D = std::array<float_v, 3>;
float_v scalar_product(Vec3D a, Vec3D b) {
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}
```
The above will scale to 1, 4, 8, 16, etc. scalar products calculated in parallel, depending
on the target hardware's capabilities.
For comparison, the same vectorization using Intel SSE intrinsics is more verbose and uses
prefix notation (i.e. function calls):
```cpp
using Vec3D = std::array<__m128, 3>;
__m128 scalar_product(Vec3D a, Vec3D b) {
return _mm_add_ps(_mm_add_ps(_mm_mul_ps(a[0], b[0]), _mm_mul_ps(a[1], b[1])),
_mm_mul_ps(a[2], b[2]));
}
```
The above will neither scale to AVX, AVX-512, etc. nor is it portable to other SIMD ISAs.
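As a rough idea of how the Vc version is used in practice, here is a self-contained sketch (not taken from the library's examples):
```cpp
#include <Vc/Vc>
#include <Vc/IO>   // stream output for Vc vectors
#include <array>
#include <iostream>

using Vc::float_v;
using Vec3D = std::array<float_v, 3>;

float_v scalar_product(Vec3D a, Vec3D b) {
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}

int main() {
    // Each lane of a float_v holds one point, so a single call computes
    // float_v::size() scalar products in parallel.
    Vec3D a = {float_v(1.f), float_v(2.f), float_v(3.f)};
    Vec3D b = {float_v(4.f), float_v(5.f), float_v(6.f)};
    std::cout << scalar_product(a, b) << '\n';  // prints e.g. [32, 32, 32, 32] with SSE
    return 0;
}
```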
## Build Requirements
cmake >= 3.0
C++11 Compiler:
* GCC >= 4.8.1
* clang >= 3.4
* ICC >= 18.0.5
* Visual Studio 2019 (64-bit target)
## Building and Installing Vc
* Clone Vc and initialize Vc's git submodules:
```sh
git clone https://github.com/VcDevel/Vc.git
cd Vc
git submodule update --init
```
* Create a build directory:
```sh
$ mkdir build
$ cd build
```
* Configure with cmake and add relevant options:
```sh
$ cmake ..
```
Optionally, specify an installation directory:
```sh
$ cmake -DCMAKE_INSTALL_PREFIX=/opt/Vc ..
```
Optionally, include building the unit tests:
```sh
$ cmake -DBUILD_TESTING=ON ..
```
On Windows, if you have multiple versions of Visual Studio installed, you can select one:
```sh
$ cmake -G "Visual Studio 16 2019" ..
```
See `cmake --help` for a list of possible generators.
* Build and install:
```sh
$ cmake --build . -j 16
$ cmake --install . # may require permissions
```
On Windows, you can also open `Vc.sln` in Visual Studio and build/install from the IDE.
## Documentation
The documentation is generated via [doxygen](http://doxygen.org). You can build
the documentation by running `doxygen` in the `doc` subdirectory.
Alternatively, you can find nightly builds of the documentation at:
* [1.4 branch](https://vcdevel.github.io/Vc-1.4/)
* [1.4.3 release](https://vcdevel.github.io/Vc-1.4.3/)
* [1.4.2 release](https://vcdevel.github.io/Vc-1.4.2/)
* [1.4.1 release](https://vcdevel.github.io/Vc-1.4.1/)
* [1.4.0 release](https://vcdevel.github.io/Vc-1.4.0/)
* [1.3 branch](https://vcdevel.github.io/Vc-1.3/)
* [1.3.0 release](https://vcdevel.github.io/Vc-1.3.0/)
* [1.2.0 release](https://vcdevel.github.io/Vc-1.2.0/)
* [1.1.0 release](https://vcdevel.github.io/Vc-1.1.0/)
* [0.7 branch](https://vcdevel.github.io/Vc-0.7/)
## Publications
* [M. Kretz, "Extending C++ for Explicit Data-Parallel Programming via SIMD
Vector Types", Goethe University Frankfurt, Dissertation,
2015.](http://publikationen.ub.uni-frankfurt.de/frontdoor/index/index/docId/38415)
* [M. Kretz and V. Lindenstruth, "Vc: A C++ library for explicit
vectorization", Software: Practice and Experience,
2011.](http://dx.doi.org/10.1002/spe.1149)
* [M. Kretz, "Efficient Use of Multi- and Many-Core Systems with Vectorization
and Multithreading", University of Heidelberg,
2009.](http://code.compeng.uni-frankfurt.de/attachments/13/Diplomarbeit.pdf)
[Work on integrating the functionality of Vc in the C++ standard library.](
https://github.com/VcDevel/Vc/wiki/ISO-Standardization-of-the-Vector-classes)
## License
Vc is released under the terms of the [3-clause BSD license](http://opensource.org/licenses/BSD-3-Clause).

Test_all_compilers.sh (new executable file, 140 lines)
@@ -0,0 +1,140 @@
#!/bin/sh -e
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games"
export LANG="en_US.UTF-8"
export LANGUAGE="en_US.UTF-8"
export LC_CTYPE="en_US.UTF-8"
export LC_NUMERIC="en_US.UTF-8"
export LC_TIME="en_US.UTF-8"
export LC_MESSAGES="en_US.UTF-8"
unset CFLAGS CXXFLAGS
cd "`dirname "$0"`"
test -z "dashboard_model" && export dashboard_model=Experimental
runTest() {
libpath="$LD_LIBRARY_PATH"
test -n "$1" && libpath="$(dirname $(realpath $($CXX $1 -print-file-name=libstdc++.so)))${libpath:+:}${libpath}"
LD_LIBRARY_PATH="$libpath" CFLAGS="$1" CXXFLAGS="$1" ctest -S test.cmake || true
}
tested_compilers="lsakdfjwowleqirjodfisj"
runAllTests() {
# first make sure we don't test a compiler a second time
id="`which $CXX`"
id="`readlink -f $id`"
echo "$id"|grep -qF "$tested_compilers" && return
tested_compilers="$tested_compilers
$id"
# alright run the ctest script
runTest
supports32Bit && runTest -m32 || true
supportsx32 && runTest -mx32 || true
}
supports32Bit() {
test `uname -m` = "x86_64" || return 1
CXX=${CXX:-c++}
cat > /tmp/m32test.cpp <<END
#include <algorithm>
#include <string>
#include <iostream>
#include <cerrno>
void foo(int x) { switch (x) { case 0x0A: break; case 0x0B: break; case 0x0C: break; case 0x0D: break; case 0x0E: break; } }
int main() { std::cout << "Hello World!\n"; return 0; }
END
$CXX -m32 -o /tmp/m32test /tmp/m32test.cpp >/dev/null 2>&1 || return 1
rm /tmp/m32test*
return 0
}
supportsx32() {
test `uname -m` = "x86_64" || return 1
CXX=${CXX:-c++}
cat > /tmp/mx32test.cpp <<END
#include <algorithm>
#include <string>
#include <iostream>
#include <cerrno>
void foo(int x) { switch (x) { case 0x0A: break; case 0x0B: break; case 0x0C: break; case 0x0D: break; case 0x0E: break; } }
int main() { std::cout << "Hello World!\n"; return 0; }
END
$CXX -mx32 -o /tmp/mx32test /tmp/mx32test.cpp >/dev/null 2>&1 || return 1
rm /tmp/mx32test*
return 0
}
system_compilers() {
cxxlist="`find /usr/bin/ /usr/local/bin/ -name '*++-[0-9]*'|grep -v -- -linux-gnu`"
if test -z "$cxxlist"; then
cxxlist="`find /usr/bin/ /usr/local/bin/ -name '*++'|grep -v -- -linux-gnu`"
fi
if test -z "$cxxlist"; then
# default compiler
runAllTests
else
for CXX in $cxxlist; do
CC=`echo "$CXX"|sed 's/clang++/clang/;s/g++/gcc/'`
if test -x "$CC" -a -x "$CXX"; then
export CC
export CXX
runAllTests
fi
done
fi
}
modules_compilers() {
if test -r /etc/profile.d/modules.sh; then
source /etc/profile.d/modules.sh
for mod in `module avail -t 2>&1`; do
case `echo $mod|tr '[:upper:]' '[:lower:]'` in
*intel*|*icc*) export CC=icc CXX=icpc;;
*gnu*|*gcc*) export CC=gcc CXX=g++;;
*llvm*|*clang*) export CC=clang CXX=clang++;;
*) continue;;
esac
module load $mod
runAllTests
module unload $mod
done
fi
}
gccbuild_compilers() {
for VcEnv in `find /opt/ -mindepth 2 -maxdepth 2 -name Vc.env`; do (
. "$VcEnv"
case "$VcEnv" in
*-snapshot/Vc.env)
( cd $HOME/src/gcc-build && ./update.sh "`dirname "$VcEnv"`" )
;;
esac
runAllTests
) done
}
icc_compilers() {
test -d /opt/intel || return
export CC=icc
export CXX=icpc
icclist="`find /opt/intel/compiler* -name 'iccvars.sh' | xargs readlink -e | sort -ur`"
case `uname -m` in
x86_64)
COMPILERVARS_ARCHITECTURE=intel64
;;
i[345678]86)
COMPILERVARS_ARCHITECTURE=ia32
;;
esac
export COMPILERVARS_ARCHITECTURE
test -n "$icclist" && for IccEnv in $icclist; do (
. $IccEnv $COMPILERVARS_ARCHITECTURE
runAllTests
) done
}
system_compilers
modules_compilers
gccbuild_compilers
icc_compilers

Test_vc.sh (new executable file, 22 lines)
@@ -0,0 +1,22 @@
#!/bin/bash
case "$1" in
Experimental|Nightly|Continuous)
export dashboard_model=$1
case "$2" in
None|Debug|Release|RelWithDebug|RelWithDebInfo|MinSizeRel)
export build_type=$2
;;
esac
;;
*)
echo "Usage: $0 <model> [<build type>]"
echo
echo "Possible arguments for model are Nightly, Continuous, or Experimental."
echo "Build type may be one of: None Debug Release RelWithDebug RelWithDebInfo MinSizeRel."
echo
exit 1
;;
esac
ctest -S "`dirname $0`/test.cmake"

Vc/Allocator (new file, 284 lines)
@@ -0,0 +1,284 @@
/* This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_ALLOCATOR_H_
#define VC_ALLOCATOR_H_
#include <new>
#include <cstddef>
#include <cstdlib>
#include <utility>
#include "global.h"
#include "common/macros.h"
/**
* \ingroup Utilities
*
* Convenience macro to set the default allocator for a given \p Type to
* Vc::Allocator.
*
* \param Type Your type that you want to use with STL containers.
*
* \note You have to use this macro in the global namespace.
*/
#ifdef Vc_MSVC
#define Vc_DECLARE_ALLOCATOR(Type) \
namespace std \
{ \
template <> class allocator<Type> : public ::Vc::Allocator<Type> \
{ \
public: \
template <typename U> struct rebind { \
typedef ::std::allocator<U> other; \
}; \
/* MSVC brokenness: the following function is optional - just doesn't compile \
* without it */ \
const allocator &select_on_container_copy_construction() const { return *this; } \
}; \
}
#else
#define Vc_DECLARE_ALLOCATOR(Type) \
namespace std \
{ \
template <> class allocator<Type> : public ::Vc::Allocator<Type> \
{ \
public: \
template <typename U> struct rebind { \
typedef ::std::allocator<U> other; \
}; \
}; \
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
using std::size_t;
using std::ptrdiff_t;
/**
* \headerfile Allocator <Vc/Allocator>
* An allocator that uses global new and supports over-aligned types, as per [C++11 20.6.9].
*
* Meant as a simple replacement for the allocator defined in the C++ Standard.
* Allocation is done using the global new/delete operators. But if the alignment property of \p
* T is larger than the size of a pointer, the allocate function allocates slightly more memory
* to adjust the pointer for correct alignment.
*
* If \p T does not require over-alignment, no additional memory will be allocated.
*
* \tparam T The type of objects to allocate.
*
* Example:
* \code
* struct Data {
* Vc::float_v x, y, z;
* };
*
* void fun()
* {
* std::vector<Data> dat0; // this will use std::allocator<Data>, which probably ignores the
* // alignment requirements for Data. Thus any access to dat0 may
* // crash your program.
*
* std::vector<Data, Vc::Allocator<Data> > dat1; // now std::vector will get correctly aligned
* // memory. Accesses to dat1 are safe.
* ...
* \endcode
*
* %Vc ships a macro to conveniently tell STL to use Vc::Allocator per default for a given type:
* \code
* struct Data {
* Vc::float_v x, y, z;
* };
* Vc_DECLARE_ALLOCATOR(Data)
*
* void fun()
* {
* std::vector<Data> dat0; // good now
* ...
* \endcode
*
* \ingroup Utilities
*/
template<typename T> class Allocator
{
private:
enum Constants {
#ifdef Vc_HAVE_STD_MAX_ALIGN_T
NaturalAlignment = alignof(std::max_align_t),
#elif defined(Vc_HAVE_MAX_ALIGN_T)
NaturalAlignment = alignof(::max_align_t),
#else
NaturalAlignment = sizeof(void *) > alignof(long double) ? sizeof(void *) :
(alignof(long double) > alignof(long long) ? alignof(long double) : alignof(long long)),
#endif
#if defined Vc_IMPL_AVX
SimdAlignment = 32,
#elif defined Vc_IMPL_SSE
SimdAlignment = 16,
#else
SimdAlignment = 1,
#endif
Alignment = alignof(T) > SimdAlignment ? alignof(T) : SimdAlignment,
/* The number of extra bytes allocated must be large enough to put a pointer right
* before the adjusted address. This pointer stores the original address, which is
* required to call ::operator delete in deallocate.
*
* The address we get from ::operator new is a multiple of NaturalAlignment:
* p = N * NaturalAlignment
*
* Since all alignments are powers of two, Alignment is a multiple of NaturalAlignment:
* Alignment = k * NaturalAlignment
*
* two cases:
* 1. If p is already aligned to Alignment then allocate will return p + Alignment. In
* this case there are Alignment Bytes available to store a pointer.
* 2. If p is not aligned then p + (k - (N modulo k)) * NaturalAlignment will be
* returned. Since NaturalAlignment >= sizeof(void*) the pointer fits.
*/
ExtraBytes = Alignment > NaturalAlignment ? Alignment : 0,
AlignmentMask = Alignment - 1
};
public:
typedef size_t size_type;
typedef ptrdiff_t difference_type;
typedef T* pointer;
typedef const T* const_pointer;
typedef T& reference;
typedef const T& const_reference;
typedef T value_type;
template<typename U> struct rebind { typedef Allocator<U> other; };
Allocator() throw() { }
Allocator(const Allocator&) throw() { }
template<typename U> Allocator(const Allocator<U>&) throw() { }
pointer address(reference x) const { return &x; }
const_pointer address(const_reference x) const { return &x; }
pointer allocate(size_type n, const void* = 0)
{
if (n > this->max_size()) {
throw std::bad_alloc();
}
char *p = static_cast<char *>(::operator new(n * sizeof(T) + ExtraBytes));
if (ExtraBytes > 0) {
char *const pp = p;
p += ExtraBytes;
const char *null = 0;
p -= ((p - null) & AlignmentMask); // equivalent to p &= ~AlignmentMask;
reinterpret_cast<char **>(p)[-1] = pp;
}
return reinterpret_cast<pointer>(p);
}
void deallocate(pointer p, size_type)
{
if (ExtraBytes > 0) {
p = reinterpret_cast<pointer *>(p)[-1];
}
::operator delete(p);
}
size_type max_size() const throw() { return size_t(-1) / sizeof(T); }
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const Allocator &select_on_container_copy_construction() const { return *this; }
// MSVC also requires a function that neither C++98 nor C++11 mentions
// but it doesn't support variadic templates... otherwise the Vc_CXX11 clause would be nice
void construct(pointer p) { ::new(p) T(); }
// we still need the C++98 version:
void construct(pointer p, const T& val) { ::new(p) T(val); }
void destroy(pointer p) { p->~T(); }
#else
template<typename U, typename... Args> void construct(U* p, Args&&... args)
{
::new(p) U(std::forward<Args>(args)...);
}
template<typename U> void destroy(U* p) { p->~U(); }
#endif
};
template<typename T> inline bool operator==(const Allocator<T>&, const Allocator<T>&) { return true; }
template<typename T> inline bool operator!=(const Allocator<T>&, const Allocator<T>&) { return false; }
}
#include "vector.h"
namespace std
{
template<typename T, typename Abi>
class allocator<Vc::Vector<T, Abi> > : public ::Vc::Allocator<Vc::Vector<T, Abi> >
{
public:
template<typename U> struct rebind { typedef ::std::allocator<U> other; };
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const allocator &select_on_container_copy_construction() const { return *this; }
#endif
};
template <typename T, typename Abi>
class allocator<Vc::Mask<T, Abi>> : public ::Vc::Allocator<Vc::Mask<T, Abi>>
{
public:
template<typename U> struct rebind { typedef ::std::allocator<U> other; };
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const allocator &select_on_container_copy_construction() const { return *this; }
#endif
};
template <typename T, std::size_t N, typename V, std::size_t M>
class allocator<Vc::SimdArray<T, N, V, M>> : public ::Vc::Allocator<Vc::SimdArray<T, N, V, M>>
{
public:
template<typename U> struct rebind { typedef ::std::allocator<U> other; };
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const allocator &select_on_container_copy_construction() const { return *this; }
#endif
};
template <typename T, std::size_t N, typename V, std::size_t M>
class allocator<Vc::SimdMaskArray<T, N, V, M>> : public ::Vc::Allocator<Vc::SimdMaskArray<T, N, V, M>>
{
public:
template<typename U> struct rebind { typedef ::std::allocator<U> other; };
#ifdef Vc_MSVC
// MSVC brokenness: the following function is optional - just doesn't compile without it
const allocator &select_on_container_copy_construction() const { return *this; }
#endif
};
}
#endif // VC_ALLOCATOR_H_
// vim: ft=cpp et sw=4 sts=4

Vc/IO (new file, 268 lines)
@@ -0,0 +1,268 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_IO_
#define VC_IO_
#include "common/types.h"
#include "common/simdarrayfwd.h"
#include "common/memoryfwd.h"
#include <iostream>
#if defined(__GNUC__) && !defined(_WIN32) && defined(_GLIBCXX_OSTREAM)
#define Vc_HACK_OSTREAM_FOR_TTY 1
#endif
#ifdef Vc_HACK_OSTREAM_FOR_TTY
#include <unistd.h>
#include <ext/stdio_sync_filebuf.h>
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
#ifdef Vc_HACK_OSTREAM_FOR_TTY
class hacked_ostream : public std::ostream
{
public:
using std::ostream::_M_streambuf;
};
bool mayUseColor(const std::ostream &os) __attribute__((__const__));
bool mayUseColor(const std::ostream &os)
{
std::basic_streambuf<char> *hack1 =
const_cast<std::basic_streambuf<char> *>(os.*(&hacked_ostream::_M_streambuf));
__gnu_cxx::stdio_sync_filebuf<char> *hack =
dynamic_cast<__gnu_cxx::stdio_sync_filebuf<char> *>(hack1);
if (!hack) {
return false;
}
FILE *file = hack->file();
return 1 == isatty(fileno(file));
}
#else
bool mayUseColor(const std::ostream &) { return false; }
#endif
} // anonymous namespace
namespace AnsiColor
{
struct Type
{
const char *data;
};
static const Type green = {"\033[1;40;32m"};
static const Type yellow = {"\033[1;40;33m"};
static const Type blue = {"\033[1;40;34m"};
static const Type normal = {"\033[0m"};
inline std::ostream &operator<<(std::ostream &out, const Type &c)
{
if (mayUseColor(out)) {
out << c.data;
}
return out;
}
} // namespace AnsiColor
/**
* \ingroup Vectors
* \headerfile IO <Vc/IO>
*
* Prints the contents of a vector into a stream object.
*
* \code
* const Vc::int_v v(Vc::IndexesFromZero);
* std::cout << v << std::endl;
* \endcode
* will output (with SSE):
\verbatim
[0, 1, 2, 3]
\endverbatim
*
* \param out Any standard C++ ostream object. For example std::cout or a
* std::stringstream object.
* \param v Any Vc::Vector object.
* \return The ostream object: to chain multiple stream operations.
*
* \note With the GNU standard library this function will check whether the
* output stream is a tty in which case it colorizes the output.
*/
template <typename T, typename Abi>
inline std::ostream &operator<<(std::ostream &out, const Vc::Vector<T, Abi> &v)
{
using TT = typename std::conditional<std::is_same<T, char>::value ||
std::is_same<T, unsigned char>::value ||
std::is_same<T, signed char>::value,
int,
T>::type;
out << AnsiColor::green << '[';
out << TT(v[0]);
for (size_t i = 1; i < v.Size; ++i) {
out << ", " << TT(v[i]);
}
out << ']' << AnsiColor::normal;
return out;
}
/**
* \ingroup Masks
* \headerfile IO <Vc/IO>
*
* Prints the contents of a mask into a stream object.
*
* \code
* const Vc::short_m m = Vc::short_v::IndexesFromZero() < 3;
* std::cout << m << std::endl;
* \endcode
* will output (with SSE):
\verbatim
m[1110 0000]
\endverbatim
*
* \param out Any standard C++ ostream object. For example std::cout or a
* std::stringstream object.
* \param m Any Vc::Mask object.
* \return The ostream object: to chain multiple stream operations.
*
* \note With the GNU standard library this function will check whether the
* output stream is a tty in which case it colorizes the output.
*/
template <typename T, typename Abi>
inline std::ostream &operator<<(std::ostream &out, const Vc::Mask<T, Abi> &m)
{
out << AnsiColor::blue << "m[";
for (unsigned int i = 0; i < m.Size; ++i) {
if (i > 0 && (i % 4) == 0) {
out << ' ';
}
if (m[i]) {
out << AnsiColor::yellow << '1';
} else {
out << AnsiColor::blue << '0';
}
}
out << AnsiColor::blue << ']' << AnsiColor::normal;
return out;
}
namespace Common
{
#ifdef DOXYGEN
/**
* \ingroup Utilities
* \headerfile dox.h <Vc/IO>
*
* Prints the contents of a Memory object into a stream object.
*
* \code
* Vc::Memory<int_v, 10> m;
* for (int i = 0; i < m.entriesCount(); ++i) {
* m[i] = i;
* }
* std::cout << m << std::endl;
* \endcode
* will output (with SSE):
\verbatim
{[0, 1, 2, 3] [4, 5, 6, 7] [8, 9, 0, 0]}
\endverbatim
*
* \param s Any standard C++ ostream object. For example std::cout or a std::stringstream object.
* \param m Any Vc::Memory object.
* \return The ostream object: to chain multiple stream operations.
*
* \note With the GNU standard library this function will check whether the
* output stream is a tty in which case it colorizes the output.
*
* \warning Please do not forget that printing a large memory object can take a long time.
*/
template<typename V, typename Parent, typename Dimension, typename RM>
inline std::ostream &operator<<(std::ostream &s, const Vc::MemoryBase<V, Parent, Dimension, RM> &m);
#endif
template<typename V, typename Parent, typename RM>
inline std::ostream &operator<<(std::ostream &out, const MemoryBase<V, Parent, 1, RM> &m )
{
out << AnsiColor::blue << '{' << AnsiColor::normal;
for (unsigned int i = 0; i < m.vectorsCount(); ++i) {
out << V(m.vector(i));
}
out << AnsiColor::blue << '}' << AnsiColor::normal;
return out;
}
template<typename V, typename Parent, typename RM>
inline std::ostream &operator<<(std::ostream &out, const MemoryBase<V, Parent, 2, RM> &m )
{
out << AnsiColor::blue << '{' << AnsiColor::normal;
for (size_t i = 0; i < m.rowsCount(); ++i) {
if (i > 0) {
out << "\n ";
}
const size_t vcount = m[i].vectorsCount();
for (size_t j = 0; j < vcount; ++j) {
out << V(m[i].vector(j));
}
}
out << AnsiColor::blue << '}' << AnsiColor::normal;
return out;
}
} // namespace Common
template<typename T, std::size_t N>
inline std::ostream &operator<<(std::ostream &out, const SimdArray<T, N> &v)
{
out << AnsiColor::green << '<' << v[0];
for (size_t i = 1; i < N; ++i) {
if (i % 4 == 0) out << " |";
out << ' ' << v[i];
}
return out << '>' << AnsiColor::normal;
}
template<typename T, std::size_t N>
inline std::ostream &operator<<(std::ostream &out, const SimdMaskArray<T, N> &m)
{
out << AnsiColor::blue << "«";
for (size_t i = 0; i < N; ++i) {
if (i > 0 && (i % 4) == 0) {
out << ' ';
}
if ( m[i] ) {
out << AnsiColor::yellow << '1';
} else {
out << AnsiColor::blue << '0';
}
}
return out << AnsiColor::blue << "»" << AnsiColor::normal;
}
}
#endif // VC_IO_
// vim: ft=cpp foldmethod=marker

Vc/Memory (new file, 43 lines)
@@ -0,0 +1,43 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_MEMORY_
#define VC_MEMORY_
#include "vector.h"
#include "common/memory.h"
#include "common/interleavedmemory.h"
#include "common/make_unique.h"
namespace Vc_VERSIONED_NAMESPACE
{
using Common::make_unique;
}
#endif // VC_MEMORY_
// vim: ft=cpp foldmethod=marker

Vc/SimdArray (new file, 35 lines)
@@ -0,0 +1,35 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_SIMDARRAY_
#define VC_SIMDARRAY_
#include "common/simdarray.h"
#endif // VC_SIMDARRAY_
// vim: ft=cpp foldmethod=marker

44
Vc/Utils Normal file
View File

@ -0,0 +1,44 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_UTILS_
#define VC_UTILS_
#include "global.h"
#ifdef Vc_IMPL_Scalar
# define VECTOR_NAMESPACE Scalar
#else
# define VECTOR_NAMESPACE SSE
#endif
#include "common/deinterleave.h"
#include "common/makeContainer.h"
#endif // VC_UTILS_
// vim: ft=cpp foldmethod=marker

43
Vc/Vc Normal file
View File

@ -0,0 +1,43 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_VC_
#define VC_VC_
#include "vector.h"
#include "IO"
#include "Memory"
#include "Utils"
#include "Allocator"
#include "algorithm"
#include "iterators"
#include "simdize"
#include "array"
#include "span"
#include "vector"
#endif // VC_VC_
// vim: ft=cpp foldmethod=marker

1
Vc/algorithm Normal file
View File

@ -0,0 +1 @@
#include "common/algorithms.h"

315
Vc/array Normal file
View File

@ -0,0 +1,315 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
//===---------------------------- array -----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef VC_INCLUDE_VC_ARRAY_
#define VC_INCLUDE_VC_ARRAY_
#include <type_traits>
#include <utility>
#include <iterator>
#include <algorithm>
#include <stdexcept>
#include "common/subscript.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \ingroup Containers
* This is `std::array` with additional subscript operators supporting gather and scatter operations.
*
* The [std::array](https://en.cppreference.com/w/cpp/container/array) documentation applies.
*
* Gathers from structured data (AoS: arrays of struct) are possible via a special
* subscript operator.
* Example:
* \code
* Vc::array<float, 100> data;
* std::iota(data.begin(), data.end(), 0.f); // fill with values 0, 1, 2, ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v gathered = data[indexes]; // gathered == [0, 1, 2, ...]
* \endcode
*
* This also works for gathers into arrays of structures:
* \code
* struct Point { float x, y, z; };
* Vc::array<Point, 100> points;
* // fill points ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v xs = points[indexes][&Point::x]; // [points[0].x, points[1].x, points[2].x, ...]
* float_v ys = points[indexes][&Point::y]; // [points[0].y, points[1].y, points[2].y, ...]
* float_v zs = points[indexes][&Point::z]; // [points[0].z, points[1].z, points[2].z, ...]
* \endcode
*
* Arrays may also be nested:
* \code
* Vc::array<Vc::array<float, 3>, 100> points;
* // fill points ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v xs = points[indexes][0]; // [points[0][0], points[1][0], points[2][0], ...]
* float_v ys = points[indexes][1]; // [points[0][1], points[1][1], points[2][1], ...]
* float_v zs = points[indexes][2]; // [points[0][2], points[1][2], points[2][2], ...]
* \endcode
*/
template <class T, size_t Size> struct array {
// types:
typedef array self_;
typedef T value_type;
typedef value_type& reference;
typedef const value_type& const_reference;
typedef value_type* iterator;
typedef const value_type* const_iterator;
typedef value_type* pointer;
typedef const value_type* const_pointer;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
typedef std::reverse_iterator<iterator> reverse_iterator;
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
value_type elems_[Size > 0 ? Size : 1];
// No explicit construct/copy/destroy for aggregate type
void fill(const value_type& u_) { std::fill_n(elems_, Size, u_); }
void swap(array& a_) noexcept(noexcept(std::swap(std::declval<T &>(), std::declval<T &>())))
{
std::swap_ranges(elems_, elems_ + Size, a_.elems_);
}
// iterators:
iterator begin() noexcept { return iterator(elems_); }
const_iterator begin() const noexcept { return const_iterator(elems_); }
iterator end() noexcept { return iterator(elems_ + Size); }
const_iterator end() const noexcept { return const_iterator(elems_ + Size); }
reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
const_reverse_iterator rbegin() const noexcept
{
return const_reverse_iterator(end());
}
reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
const_reverse_iterator rend() const noexcept
{
return const_reverse_iterator(begin());
}
const_iterator cbegin() const noexcept { return begin(); }
const_iterator cend() const noexcept { return end(); }
const_reverse_iterator crbegin() const noexcept { return rbegin(); }
const_reverse_iterator crend() const noexcept { return rend(); }
// capacity:
constexpr size_type size() const noexcept { return Size; }
constexpr size_type max_size() const noexcept { return Size; }
constexpr bool empty() const noexcept { return Size == 0; }
// element access:
reference operator[](size_type n_) { return elems_[n_]; }
constexpr const_reference operator[](size_type n_) const { return elems_[n_]; }
/**
* \name Data-Parallel Subscripting for Gather & Scatter
*/
///@{
template <typename I>
Vc_ALWAYS_INLINE auto operator[](I&& arg_)
-> decltype(subscript_operator(*this, std::forward<I>(arg_)))
{
return subscript_operator(*this, std::forward<I>(arg_));
}
template <typename I>
Vc_ALWAYS_INLINE auto operator[](I&& arg_) const
-> decltype(subscript_operator(*this, std::forward<I>(arg_)))
{
return subscript_operator(*this, std::forward<I>(arg_));
}
///@}
reference at(size_type n_);
constexpr const_reference at(size_type n_) const;
reference front() { return elems_[0]; }
constexpr const_reference front() const { return elems_[0]; }
reference back() { return elems_[Size > 0 ? Size - 1 : 0]; }
constexpr const_reference back() const { return elems_[Size > 0 ? Size - 1 : 0]; }
value_type* data() noexcept { return elems_; }
const value_type* data() const noexcept { return elems_; }
};
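// Illustrative usage (not part of the original header): array has no user-provided
// constructors, so it is an aggregate and can be initialized like std::array, e.g.
//   Vc::array<float, 4> a = {{1.f, 2.f, 3.f, 4.f}};
//   a.fill(0.f);          // assign 0 to every element
//   float *p = a.data();  // contiguous storage, as with std::array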
template <class T, size_t Size>
typename array<T, Size>::reference array<T, Size>::at(size_type n_)
{
if (n_ >= Size) {
throw std::out_of_range("array::at");
}
return elems_[n_];
}
template <class T, size_t Size>
constexpr typename array<T, Size>::const_reference array<T, Size>::at(size_type n_) const
{
return n_ >= Size ? (throw std::out_of_range("array::at"), elems_[0]) : elems_[n_];
}
template <class T, size_t Size>
inline bool operator==(const array<T, Size>& x_, const array<T, Size>& y_)
{
return std::equal(x_.elems_, x_.elems_ + Size, y_.elems_);
}
template <class T, size_t Size>
inline bool operator!=(const array<T, Size>& x_, const array<T, Size>& y_)
{
return !(x_ == y_);
}
template <class T, size_t Size>
inline bool operator<(const array<T, Size>& x_, const array<T, Size>& y_)
{
return std::lexicographical_compare(x_.elems_, x_.elems_ + Size, y_.elems_,
y_.elems_ + Size);
}
template <class T, size_t Size>
inline bool operator>(const array<T, Size>& x_, const array<T, Size>& y_)
{
return y_ < x_;
}
template <class T, size_t Size>
inline bool operator<=(const array<T, Size>& x_, const array<T, Size>& y_)
{
return !(y_ < x_);
}
template <class T, size_t Size>
inline bool operator>=(const array<T, Size>& x_, const array<T, Size>& y_)
{
return !(x_ < y_);
}
/**\name non-member begin & end
* Implement the non-member begin & end functions in the %Vc namespace so that ADL works
* and `begin(some_vc_array)` always works.
*/
///@{
template <typename T, std::size_t N>
inline auto begin(array<T, N>& arr) -> decltype(arr.begin())
{
return arr.begin();
}
template <typename T, std::size_t N>
inline auto begin(const array<T, N>& arr) -> decltype(arr.begin())
{
return arr.begin();
}
template <typename T, std::size_t N>
inline auto end(array<T, N>& arr) -> decltype(arr.end())
{
return arr.end();
}
template <typename T, std::size_t N>
inline auto end(const array<T, N>& arr) -> decltype(arr.end())
{
return arr.end();
}
///@}
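// Illustrative usage (not part of the original header): since these overloads live in
// namespace Vc, argument-dependent lookup finds them for unqualified calls, e.g.
//   Vc::array<int, 8> a = {{7, 6, 5, 4, 3, 2, 1, 0}};
//   std::sort(begin(a), end(a));  // unqualified begin/end found via ADL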
namespace Traits
{
template <typename T, std::size_t N>
struct has_no_allocated_data_impl<Vc::array<T, N>> : public std::true_type
{
};
template <typename T, std::size_t N>
struct has_contiguous_storage_impl<Vc::array<T, N>> : public std::true_type
{
};
} // namespace Traits
} // namespace Vc
namespace std
{
template <class T, size_t Size>
inline
#ifdef Vc_MSVC
// MSVC fails to do SFINAE correctly and gets totally confused:
// error C2433: 'type': 'inline' not permitted on data declarations
// error C4430: missing type specifier - int assumed. Note: C++ does not support default-int
// error C2061: syntax error: identifier 'swap'
void
#else
typename enable_if<is_same<void, decltype(swap(declval<T&>(), declval<T&>()))>::value,
void>::type
#endif
swap(Vc::array<T, Size>& x_,
Vc::array<T, Size>& y_) noexcept(noexcept(x_.swap(y_)))
{
x_.swap(y_);
}
template <class T, size_t Size>
class tuple_size<Vc::array<T, Size>> : public integral_constant<size_t, Size>
{
};
template <size_t I, class T, size_t Size> class tuple_element<I, Vc::array<T, Size>>
{
public:
typedef T type;
};
template <size_t I, class T, size_t Size>
inline constexpr typename std::enable_if<(I < Size), T&>::type get(
Vc::array<T, Size>& a_) noexcept
{
return a_.elems_[I];
}
template <size_t I, class T, size_t Size>
inline constexpr typename std::enable_if<(I < Size), const T&>::type get(
const Vc::array<T, Size>& a_) noexcept
{
return a_.elems_[I];
}
template <size_t I, class T, size_t Size>
inline constexpr typename std::enable_if<(I < Size), T&&>::type get(
Vc::array<T, Size>&& a_) noexcept
{
return std::move(a_.elems_[I]);
}
} // namespace std
#endif // VC_INCLUDE_VC_ARRAY_
// vim: ft=cpp foldmethod=marker

58
Vc/avx/README Normal file
View File

@ -0,0 +1,58 @@
###########################################
################# AVX #################
###########################################
1. Floating Point
===========================================
Uses full 256bit vectors for all operations. 128bit vectors are never used.
2. Integer
===========================================
Integer support in AVX is minimal.
The 256bit integer vectors are only intended as a supporting type for float operations.
Any arithmetic, logical, or comparison operation must be implemented using 128bit operations.
int_v/uint_v could be implemented either as 128bit or as 256bit types, i.e. either int_v::Size == 4 or int_v::Size == 8.
2.1. 256bit int vectors
===========================================
2.1.1. Implementation Details:
This requires the SSE operations not to zero the high bits of the registers. Since the YMM registers
are aliased onto the XMM registers, you need to use SSE ops that do not use the VEX prefix (IIUC),
or you have to use two XMM registers most of the time.
Perfect would be the use of
union M256I {
__m256i ymm;
__m128i xmm[2];
};
But as far as I know, with GCC this will result in lots of unnecessary loads and stores. (It seems this is
due to GCC expecting aliasing and thus making sure the modified values are always up to date in memory
- as if the union were declared volatile.)
2.1.2. Upsides:
int_v::Size == float_v::Size
2.1.3. Downsides:
Register pressure is increased.
2.2. 128bit int vectors
===========================================
2.2.1. Implementation Details:
2.2.2. Upsides:
2.2.3. Downsides:
- Use of int_v for float_v operations involving __m256i arguments requires an extra type. This will
be hard to generalize.
2.3. Mixed approach
===========================================
int_v/uint_v are implemented as 256bit while short_v/ushort_v are implemented as 128bit. Thus
int_v::Size == short_v::Size (which is the case on LRBni, too).
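For illustration, a minimal sketch of the "two XMM registers" approach mentioned in 2.1.1,
assuming plain AVX intrinsics only (no AVX2); this snippet is not part of the Vc sources
and the function name is made up (requires <immintrin.h>):
static inline __m256i add_epi32_avx_only(__m256i a, __m256i b)
{
    // operate on the two 128bit halves separately and re-combine them
    const __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(a),
                                     _mm256_castsi256_si128(b));
    const __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1),
                                     _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}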

305
Vc/avx/casts.h Normal file
View File

@ -0,0 +1,305 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_CASTS_H_
#define VC_AVX_CASTS_H_
#include "intrinsics.h"
#include "types.h"
#include "../sse/casts.h"
#include "shuffle.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
namespace Casts
{
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128 v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256 v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R;
template<typename T> Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R;
// 128 -> 128
template<> Vc_INTRINSIC __m128 avx_cast(__m128 v) { return v; }
template<> Vc_INTRINSIC __m128 avx_cast(__m128i v) { return _mm_castsi128_ps(v); }
template<> Vc_INTRINSIC __m128 avx_cast(__m128d v) { return _mm_castpd_ps(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m128 v) { return _mm_castps_si128(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m128i v) { return v; }
template<> Vc_INTRINSIC __m128i avx_cast(__m128d v) { return _mm_castpd_si128(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128 v) { return _mm_castps_pd(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128i v) { return _mm_castsi128_pd(v); }
template<> Vc_INTRINSIC __m128d avx_cast(__m128d v) { return v; }
// 128 -> 256
// FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never
// seen the cast not do what I want though: after a VEX-coded SSE instruction the register's
// upper 128bits are zero. Thus using the same register as AVX register will have the upper
// 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory
// + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do
// what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck,
// do we really want to rely on specific compiler behavior here?
template<> Vc_INTRINSIC __m256 avx_cast(__m128 v) { return _mm256_castps128_ps256(v); }
template<> Vc_INTRINSIC __m256 avx_cast(__m128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); }
template<> Vc_INTRINSIC __m256 avx_cast(__m128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128 v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128i v) { return _mm256_castsi128_si256(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128 v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); }
template<> Vc_INTRINSIC __m256d avx_cast(__m128d v) { return _mm256_castpd128_pd256(v); }
#if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG
static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_permute2f128_ps (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); }
static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); }
static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); }
#else
static Vc_INTRINSIC Vc_CONST __m256 zeroExtend(__m128 v) { return _mm256_castps128_ps256(v); }
static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); }
static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); }
#endif
// 256 -> 128
template<> Vc_INTRINSIC __m128 avx_cast(__m256 v) { return _mm256_castps256_ps128(v); }
template<> Vc_INTRINSIC __m128 avx_cast(__m256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); }
template<> Vc_INTRINSIC __m128 avx_cast(__m256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256 v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256i v) { return _mm256_castsi256_si128(v); }
template<> Vc_INTRINSIC __m128i avx_cast(__m256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256 v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); }
template<> Vc_INTRINSIC __m128d avx_cast(__m256d v) { return _mm256_castpd256_pd128(v); }
// 256 -> 256
template<> Vc_INTRINSIC __m256 avx_cast(__m256 v) { return v; }
template<> Vc_INTRINSIC __m256 avx_cast(__m256i v) { return _mm256_castsi256_ps(v); }
template<> Vc_INTRINSIC __m256 avx_cast(__m256d v) { return _mm256_castpd_ps(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m256 v) { return _mm256_castps_si256(v); }
template<> Vc_INTRINSIC __m256i avx_cast(__m256i v) { return v; }
template<> Vc_INTRINSIC __m256i avx_cast(__m256d v) { return _mm256_castpd_si256(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256 v) { return _mm256_castps_pd(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256i v) { return _mm256_castsi256_pd(v); }
template<> Vc_INTRINSIC __m256d avx_cast(__m256d v) { return v; }
// simplify splitting 256-bit registers in 128-bit registers
Vc_INTRINSIC Vc_CONST __m128 lo128(__m256 v) { return avx_cast<__m128>(v); }
Vc_INTRINSIC Vc_CONST __m128d lo128(__m256d v) { return avx_cast<__m128d>(v); }
Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v) { return avx_cast<__m128i>(v); }
Vc_INTRINSIC Vc_CONST __m128 hi128(__m256 v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v) { return extract128<1>(v); }
Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v) { return extract128<1>(v); }
// simplify combining 128-bit registers in 256-bit registers
Vc_INTRINSIC Vc_CONST __m256 concat(__m128 a, __m128 b) { return insert128<1>(avx_cast<__m256 >(a), b); }
Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b) { return insert128<1>(avx_cast<__m256d>(a), b); }
Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b) { return insert128<1>(avx_cast<__m256i>(a), b); }
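// Illustrative round trip (not part of the original header): splitting a 256-bit value
// into its 128-bit halves and concatenating them again reproduces the original value:
//   __m256 v = _mm256_set1_ps(1.f);
//   __m256 w = concat(lo128(v), hi128(v));  // w holds the same lanes as v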
} // namespace Casts
using namespace Casts;
} // namespace AVX
namespace AVX2
{
using namespace AVX::Casts;
} // namespace AVX2
namespace AVX
{
template <typename From, typename To> struct ConvertTag {};
Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , int>) { return _mm256_cvttps_epi32(v); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, int>) { return _mm256_cvttpd_epi32(v); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , int>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepi16_epi32(v);
#else
return AVX::srai_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, int>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepu16_epi32(v);
#else
return AVX::srli_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m256 v, ConvertTag<float , uint>) {
using namespace AVX;
return _mm256_castps_si256(_mm256_blendv_ps(
_mm256_castsi256_ps(_mm256_cvttps_epi32(v)),
_mm256_castsi256_ps(add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, set2power31_ps())),
set2power31_epu32())),
cmpge_ps(v, set2power31_ps())));
}
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, uint>) {
using namespace AVX;
return _mm_xor_si128(
_mm256_cvttpd_epi32(_mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u))),
_mm_set2power31_epu32());
}
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int , uint>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint , uint>) { return v; }
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , uint>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepi16_epi32(v);
#else
return AVX::srai_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, uint>) {
#ifdef Vc_IMPL_AVX2
return _mm256_cvtepu16_epi32(v);
#else
return AVX::srli_epi32<16>(
concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256 convert(__m256 v, ConvertTag<float , float>) { return v; }
Vc_INTRINSIC __m128 convert(__m256d v, ConvertTag<double, float>) { return _mm256_cvtpd_ps(v); }
Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<int , float>) { return _mm256_cvtepi32_ps(v); }
Vc_INTRINSIC __m256 convert(__m256i v, ConvertTag<uint , float>) {
// this is complicated because cvtepi32_ps only supports signed input. Thus, all
// input values with the MSB set would produce a negative result. We can reuse the
// cvtepi32_ps instruction if we unset the MSB. But then the rounding results can be
// different. Since float uses 24 bits for the mantissa (effectively), the 9-bit LSB
// determines the rounding direction. (Consider the bits ...8'7654'3210. The bits [0:7]
// need to be dropped and if > 0x80 round up, if < 0x80 round down. If [0:7] == 0x80
// then the rounding direction is determined by bit [8] for round to even. That's why
// the 9th bit is relevant for the rounding decision.)
// If the MSB of the input is set to 0, the cvtepi32_ps instruction makes its rounding
// decision on the lowest 8 bits instead. A second rounding decision is made when
// float(0x8000'0000) is added. This will rarely fix the rounding issue.
//
// Here's what the standard rounding mode expects:
// 0xc0000080 should cvt to 0xc0000000
// 0xc0000081 should cvt to 0xc0000100
// -- should cvt to 0xc0000100
// 0xc000017f should cvt to 0xc0000100
// 0xc0000180 should cvt to 0xc0000200
//
// However: using float(input ^ 0x8000'0000) + float(0x8000'0000) we get:
// 0xc0000081 would cvt to 0xc0000000
// 0xc00000c0 would cvt to 0xc0000000
// 0xc00000c1 would cvt to 0xc0000100
// 0xc000013f would cvt to 0xc0000100
// 0xc0000140 would cvt to 0xc0000200
//
// Solution: float(input & 0x7fff'fe00) + (float(0x8000'0000) + float(input & 0x1ff))
// This ensures the rounding decision is made on the 9-bit LSB when 0x8000'0000 is
// added to the float value of the low 8 bits of the input.
using namespace AVX;
return _mm256_blendv_ps(
_mm256_cvtepi32_ps(v),
_mm256_add_ps(_mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00))),
_mm256_add_ps(set2power31_ps(), _mm256_cvtepi32_ps(and_si256(
v, set1_epi32(0x000001ff))))),
_mm256_castsi256_ps(cmplt_epi32(v, _mm256_setzero_si256())));
}
Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<short , float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag< short, int>())); }
Vc_INTRINSIC __m256 convert(__m128i v, ConvertTag<ushort, float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag<ushort, int>())); }
Vc_INTRINSIC __m256d convert(__m128 v, ConvertTag<float , double>) { return _mm256_cvtps_pd(v); }
Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag<double, double>) { return v; }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<int , double>) { return _mm256_cvtepi32_pd(v); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<uint , double>) {
using namespace AVX;
return _mm256_add_pd(
_mm256_cvtepi32_pd(_mm_xor_si128(v, _mm_setmin_epi32())),
set1_pd(1u << 31)); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, SSE::ConvertTag< short, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, SSE::ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , short>) {
#ifdef Vc_IMPL_AVX2
auto a = _mm256_shuffle_epi8(
v, _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
-0x80, -0x80, -0x80, 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
-0x80, -0x80, -0x80, -0x80, -0x80, -0x80));
return lo128(_mm256_permute4x64_epi64(a, 0xf8)); // a[0] a[2] | a[3] a[3]
#else
const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
#endif
}
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , short>) { return convert(v, ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , short>) { return convert(convert(v, ConvertTag<float, int>()), ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, short>) { return convert(convert(v, ConvertTag<double, int>()), SSE::ConvertTag<int, short>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , short>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, short>) { return v; }
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint , ushort>) {
auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
return _mm_unpacklo_epi16(tmp2, tmp3);
}
Vc_INTRINSIC __m128i convert(__m256 v, ConvertTag<float , ushort>) { return convert(convert(v, ConvertTag<float, uint>()), ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, uint>()), SSE::ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , ushort>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, ushort>) { return v; }
template <typename From, typename To>
Vc_INTRINSIC auto convert(
typename std::conditional<(sizeof(From) < sizeof(To)),
typename SSE::VectorTraits<From>::VectorType,
typename AVX::VectorTypeHelper<From>::Type>::type v)
-> decltype(convert(v, ConvertTag<From, To>()))
{
return convert(v, ConvertTag<From, To>());
}
template <typename From, typename To, typename = enable_if<(sizeof(From) < sizeof(To))>>
Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper<From>::Type v)
-> decltype(convert(lo128(v), ConvertTag<From, To>()))
{
return convert(lo128(v), ConvertTag<From, To>());
}
} // namespace AVX
} // namespace Vc
#endif // VC_AVX_CASTS_H_

155
Vc/avx/const.h Normal file
View File

@ -0,0 +1,155 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_CONST_H_
#define VC_AVX_CONST_H_
#include <cstddef>
#include "types.h"
#include "const_data.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template<typename T> struct IndexesFromZeroData;
template<> struct IndexesFromZeroData<int> {
static Vc_ALWAYS_INLINE Vc_CONST const int *address() { return reinterpret_cast<const int *>(&_IndexesFromZero32[0]); }
};
template<> struct IndexesFromZeroData<unsigned int> {
static Vc_ALWAYS_INLINE Vc_CONST const unsigned int *address() { return &_IndexesFromZero32[0]; }
};
template<> struct IndexesFromZeroData<short> {
static Vc_ALWAYS_INLINE Vc_CONST const short *address() { return reinterpret_cast<const short *>(&_IndexesFromZero16[0]); }
};
template<> struct IndexesFromZeroData<unsigned short> {
static Vc_ALWAYS_INLINE Vc_CONST const unsigned short *address() { return &_IndexesFromZero16[0]; }
};
template<> struct IndexesFromZeroData<signed char> {
static Vc_ALWAYS_INLINE Vc_CONST const signed char *address() { return reinterpret_cast<const signed char *>(&_IndexesFromZero8[0]); }
};
template<> struct IndexesFromZeroData<char> {
static Vc_ALWAYS_INLINE Vc_CONST const char *address() { return reinterpret_cast<const char *>(&_IndexesFromZero8[0]); }
};
template<> struct IndexesFromZeroData<unsigned char> {
static Vc_ALWAYS_INLINE Vc_CONST const unsigned char *address() { return &_IndexesFromZero8[0]; }
};
template<typename _T> struct Const
{
typedef Vector<_T> V;
typedef typename V::EntryType T;
typedef typename V::Mask M;
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4() { return V(c_trig<T>::data[0]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_hi() { return V(c_trig<T>::data[1]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem1() { return V(c_trig<T>::data[2]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_4_rem2() { return V(c_trig<T>::data[3]); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_16() { return V(c_trig<T>::data[4]); }
static Vc_ALWAYS_INLINE Vc_CONST V _16() { return V(c_trig<T>::data[5]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanP(int i) { return V(c_trig<T>::data[(12 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanQ(int i) { return V(c_trig<T>::data[(17 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanThrsHi() { return V(c_trig<T>::data[22]); }
static Vc_ALWAYS_INLINE Vc_CONST V atanThrsLo() { return V(c_trig<T>::data[23]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_2_rem() { return V(c_trig<T>::data[24]); }
static Vc_ALWAYS_INLINE Vc_CONST V lossThreshold() { return V(c_trig<T>::data[8]); }
static Vc_ALWAYS_INLINE Vc_CONST V _4_pi() { return V(c_trig<T>::data[9]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi_2() { return V(c_trig<T>::data[10]); }
static Vc_ALWAYS_INLINE Vc_CONST V _pi() { return V(c_trig<T>::data[11]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff0(int i) { return V(c_trig<T>::data[(28 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff1(int i) { return V(c_trig<T>::data[(33 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff2(int i) { return V(c_trig<T>::data[(37 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V asinCoeff3(int i) { return V(c_trig<T>::data[(43 + i)]); }
static Vc_ALWAYS_INLINE Vc_CONST V smallAsinInput() { return V(c_trig<T>::data[25]); }
static Vc_ALWAYS_INLINE Vc_CONST V largeAsinInput() { return V(c_trig<T>::data[26]); }
static Vc_ALWAYS_INLINE Vc_CONST M exponentMask() { return M(V(c_log<T>::d(1)).data()); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_2() { return V(c_log<T>::d(18)); }
static Vc_ALWAYS_INLINE Vc_CONST V _1_sqrt2() { return V(c_log<T>::d(15)); }
static Vc_ALWAYS_INLINE Vc_CONST V P(int i) { return V(c_log<T>::d(2 + i)); }
static Vc_ALWAYS_INLINE Vc_CONST V Q(int i) { return V(c_log<T>::d(8 + i)); }
static Vc_ALWAYS_INLINE Vc_CONST V min() { return V(c_log<T>::d(14)); }
static Vc_ALWAYS_INLINE Vc_CONST V ln2_small() { return V(c_log<T>::d(17)); }
static Vc_ALWAYS_INLINE Vc_CONST V ln2_large() { return V(c_log<T>::d(16)); }
static Vc_ALWAYS_INLINE Vc_CONST V neginf() { return V(c_log<T>::d(13)); }
static Vc_ALWAYS_INLINE Vc_CONST V log10_e() { return V(c_log<T>::d(19)); }
static Vc_ALWAYS_INLINE Vc_CONST V log2_e() { return V(c_log<T>::d(20)); }
static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask() Vc_ALWAYS_INLINE_R Vc_CONST_R;
static Vc_ALWAYS_INLINE_L Vc_CONST_L V highMask(int bits) Vc_ALWAYS_INLINE_R Vc_CONST_R;
};
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask()
{
return _mm256_broadcast_ss(
reinterpret_cast<const float *>(&c_general::highMaskFloat));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask()
{
return _mm256_broadcast_sd(
reinterpret_cast<const double *>(&c_general::highMaskDouble));
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<float> Const<float>::highMask(int bits)
{
#ifdef Vc_IMPL_AVX2
#if defined Vc_ICC || defined Vc_MSVC
__m256i allone = _mm256_set1_epi64x(~0);
#else
auto allone = ~__m256i();
#endif
return _mm256_castsi256_ps(_mm256_slli_epi32(allone, bits));
#else
__m128 tmp = _mm_castsi128_ps(_mm_slli_epi32(_mm_setallone_si128(), bits));
return concat(tmp, tmp);
#endif
}
template <> Vc_ALWAYS_INLINE Vc_CONST Vector<double> Const<double>::highMask(int bits)
{
#ifdef Vc_IMPL_AVX2
#if defined Vc_ICC || defined Vc_MSVC
__m256i allone = _mm256_set1_epi64x(~0);
#else
auto allone = ~__m256i();
#endif
return _mm256_castsi256_pd(_mm256_slli_epi64(allone, bits));
#else
__m128d tmp = _mm_castsi128_pd(_mm_slli_epi64(_mm_setallone_si128(), bits));
return concat(tmp, tmp);
#endif
}
} // namespace AVX
namespace AVX2
{
using AVX::IndexesFromZeroData;
using AVX::Const;
} // namespace AVX2
} // namespace Vc
#endif // VC_AVX_CONST_H_

100
Vc/avx/const_data.h Normal file
View File

@ -0,0 +1,100 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_CONST_DATA_H_
#define VC_AVX_CONST_DATA_H_
#include "../common/data.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
alignas(64) extern const unsigned int _IndexesFromZero32[ 8];
alignas(16) extern const unsigned short _IndexesFromZero16[16];
alignas(16) extern const unsigned char _IndexesFromZero8 [32];
struct alignas(64) c_general
{
static const float oneFloat;
static const unsigned int absMaskFloat[2];
static const unsigned int signMaskFloat[2];
static const unsigned int highMaskFloat;
static const unsigned short minShort[2];
static const unsigned short one16[2];
static const float _2power31;
static const double oneDouble;
static const unsigned long long frexpMask;
static const unsigned long long highMaskDouble;
};
template<typename T> struct c_trig
{
alignas(64) static const T data[];
};
#ifndef Vc_MSVC
template <> alignas(64) const float c_trig<float>::data[];
template <> alignas(64) const double c_trig<double>::data[];
#endif
template<typename T> struct c_log
{
typedef float floatAlias Vc_MAY_ALIAS;
static Vc_ALWAYS_INLINE float d(int i) { return *reinterpret_cast<const floatAlias *>(&data[i]); }
alignas(64) static const unsigned int data[21];
};
#ifndef Vc_MSVC
template<> alignas(64) const unsigned int c_log<float>::data[21];
#endif
template<> struct c_log<double>
{
enum VectorSize { Size = 16 / sizeof(double) };
typedef double doubleAlias Vc_MAY_ALIAS;
static Vc_ALWAYS_INLINE double d(int i) { return *reinterpret_cast<const doubleAlias *>(&data[i]); }
alignas(64) static const unsigned long long data[21];
};
} // namespace AVX
} // namespace Vc
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX2
{
using AVX::_IndexesFromZero8;
using AVX::_IndexesFromZero16;
using AVX::_IndexesFromZero32;
using AVX::c_general;
using AVX::c_trig;
using AVX::c_log;
} // namespace AVX2
} // namespace Vc
#endif // VC_AVX_CONST_DATA_H_

124
Vc/avx/debug.h Normal file
View File

@ -0,0 +1,124 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_DEBUG_H_
#define VC_AVX_DEBUG_H_
#ifndef NDEBUG
#include "vector.h"
#include <iostream>
#include <iomanip>
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template <typename T, typename U> struct AddType {
const U &d;
};
template <typename T, typename U> AddType<T, U> addType(const U &x) { return {x}; }
#ifdef NDEBUG
class DebugStream
{
public:
DebugStream(const char *, const char *, int) {}
template<typename T> inline DebugStream &operator<<(const T &) { return *this; }
};
#else
class DebugStream
{
private:
template<typename T, typename V> static void printVector(V _x)
{
enum { Size = sizeof(V) / sizeof(T) };
union { V v; T m[Size]; } x = { _x };
std::cerr << '[' << std::setprecision(24) << x.m[0];
for (int i = 1; i < Size; ++i) {
std::cerr << ", " << std::setprecision(24) << x.m[i];
}
std::cerr << ']';
}
public:
DebugStream(const char *func, const char *file, int line)
{
std::cerr << "\033[1;40;33mDEBUG: " << file << ':' << line << ' ' << func << ' ';
}
template<typename T> DebugStream &operator<<(const T &x) { std::cerr << x; return *this; }
template <typename T, typename U> DebugStream &operator<<(AddType<T, U> &&x)
{
printVector<T, U>(x.d);
return *this;
}
DebugStream &operator<<(__m128 x) {
printVector<float, __m128>(x);
return *this;
}
DebugStream &operator<<(__m256 x) {
printVector<float, __m256>(x);
return *this;
}
DebugStream &operator<<(__m128d x) {
printVector<double, __m128d>(x);
return *this;
}
DebugStream &operator<<(__m256d x) {
printVector<double, __m256d>(x);
return *this;
}
DebugStream &operator<<(__m128i x) {
printVector<unsigned int, __m128i>(x);
return *this;
}
DebugStream &operator<<(__m256i x) {
printVector<unsigned int, __m256i>(x);
return *this;
}
~DebugStream()
{
std::cerr << "\033[0m" << std::endl;
}
};
#endif
#ifdef Vc_DEBUG
#undef Vc_DEBUG
#endif
#ifdef Vc_MSVC
#define Vc_DEBUG Vc::AVX::DebugStream(__FUNCSIG__, __FILE__, __LINE__)
#else
#define Vc_DEBUG Vc::AVX::DebugStream(__PRETTY_FUNCTION__, __FILE__, __LINE__)
#endif
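// Illustrative usage (not part of the original header, assuming NDEBUG is not defined):
// Vc_DEBUG streams its arguments to std::cerr and expands SIMD register arguments
// element-wise via printVector, e.g.
//   Vc_DEBUG << "v = " << v;          // v: any streamable value or a __m128/__m256 type
//   Vc_DEBUG << addType<short>(vi);   // print a __m256i interpreted as shorts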
} // namespace AVX
} // namespace Vc
#endif // VC_AVX_DEBUG_H_

290
Vc/avx/deinterleave.tcc Normal file
View File

@ -0,0 +1,290 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX2
{
inline void deinterleave(double_v &Vc_RESTRICT a, double_v &Vc_RESTRICT b, double_v &Vc_RESTRICT c)
{ // estimated latency (AVX): 4.5 cycles
const m256d tmp0 = Mem::shuffle128<X0, Y1>(a.data(), b.data());
const m256d tmp1 = Mem::shuffle128<X1, Y0>(a.data(), c.data());
const m256d tmp2 = Mem::shuffle128<X0, Y1>(b.data(), c.data());
a.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp0, tmp1);
b.data() = Mem::shuffle<X1, Y0, X3, Y2>(tmp0, tmp2);
c.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp1, tmp2);
}
inline void deinterleave(float_v &Vc_RESTRICT a, float_v &Vc_RESTRICT b, float_v &Vc_RESTRICT c)
{
// abc abc abc
// a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121
// b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211
// c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112
const m256 ac0 = Mem::shuffle128<X0, Y0>(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6
const m256 ac1 = Mem::shuffle128<X1, Y1>(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7
m256 tmp0 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>( ac0, b.data());
tmp0 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5
m256 tmp1 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>( ac0, b.data());
tmp1 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6
m256 tmp2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>( ac0, b.data());
tmp2 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7
a.data() = Mem::permute<X0, X3, X2, X1>(tmp0);
b.data() = Mem::permute<X1, X0, X3, X2>(tmp1);
c.data() = Mem::permute<X2, X1, X0, X3>(tmp2);
}
inline void deinterleave(int_v &Vc_RESTRICT a, int_v &Vc_RESTRICT b, int_v &Vc_RESTRICT c)
{
deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
reinterpret_cast<float_v &>(c));
}
inline void deinterleave(uint_v &Vc_RESTRICT a, uint_v &Vc_RESTRICT b, uint_v &Vc_RESTRICT c)
{
deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
reinterpret_cast<float_v &>(c));
}
inline void deinterleave(Vector<short> &Vc_RESTRICT , Vector<short> &Vc_RESTRICT ,
Vector<short> &Vc_RESTRICT )
{
return;
/* TODO:
// abc abc abc
// a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121
// b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211
// c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112
m128i ac0 = _mm_unpacklo_epi64(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6
m128i ac1 = _mm_unpackhi_epi64(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7
m128i tmp0 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>( ac0, b.data());
tmp0 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5
m128i tmp1 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>( ac0, b.data());
tmp1 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6
m128i tmp2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>( ac0, b.data());
tmp2 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7
a.data() = Mem::permuteHi<X4, X7, X6, X5>(Mem::permuteLo<X0, X3, X2, X1>(tmp0));
b.data() = Mem::permuteHi<X5, X4, X7, X6>(Mem::permuteLo<X1, X0, X3, X2>(tmp1));
c.data() = Mem::permuteHi<X6, X5, X4, X7>(Mem::permuteLo<X2, X1, X0, X3>(tmp2));
*/
}
inline void deinterleave(Vector<unsigned short> &Vc_RESTRICT a, Vector<unsigned short> &Vc_RESTRICT b,
Vector<unsigned short> &Vc_RESTRICT c)
{
deinterleave(reinterpret_cast<Vector<short> &>(a), reinterpret_cast<Vector<short> &>(b),
reinterpret_cast<Vector<short> &>(c));
}
inline void deinterleave(Vector<float> &a, Vector<float> &b)
{
// a7 a6 a5 a4 a3 a2 a1 a0
// b7 b6 b5 b4 b3 b2 b1 b0
const m256 tmp0 = Reg::permute128<Y0, X0>(a.data(), b.data()); // b3 b2 b1 b0 a3 a2 a1 a0
const m256 tmp1 = Reg::permute128<Y1, X1>(a.data(), b.data()); // b7 b6 b5 b4 a7 a6 a5 a4
const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0
const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2
a.data() = _mm256_unpacklo_ps(tmp2, tmp3); // b6 b4 b2 b0 a6 a4 a2 a0
b.data() = _mm256_unpackhi_ps(tmp2, tmp3); // b7 b5 b3 b1 a7 a5 a3 a1
}
inline void deinterleave(Vector<short> &a, // a0 b0 a1 b1 a2 b2 a3 b3 | a4 b4 a5 ...
Vector<short> &b) // a8 b8 a9 ...
{
auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ...
auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ...
v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ...
v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ...
a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ...
b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ...
}
inline void deinterleave(Vector<ushort> &a, Vector<ushort> &b)
{
auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ...
auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ...
v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ...
v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ...
a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ...
b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ...
}
} // namespace AVX2
namespace Detail
{
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const float *m, Flags align)
{
a.load(m, align);
b.load(m + AVX2::float_v::Size, align);
Vc::AVX2::deinterleave(a, b);
}
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const short *m, Flags f)
{
using namespace Vc::AVX2;
const auto tmp = Detail::load32(m, f);
a.data() =
_mm256_cvtepi32_ps(concat(_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
_mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)));
b.data() = _mm256_cvtepi32_ps(
concat(_mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)));
}
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const unsigned short *m, Flags f)
{
using namespace Vc::AVX2;
const auto tmp = Detail::load32(m, f);
a.data() = _mm256_cvtepi32_ps(
concat(_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa),
_mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa)));
b.data() = _mm256_cvtepi32_ps(
concat(_mm_srli_epi32(lo128(tmp), 16), _mm_srli_epi32(hi128(tmp), 16)));
}
template <typename Flags>
inline void deinterleave(AVX2::double_v &a, AVX2::double_v &b, const double *m, Flags align)
{
using namespace Vc::AVX2;
a.load(m, align);
b.load(m + AVX2::double_v::Size, align);
m256d tmp0 = Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()); // b1 b0 a1 a0
m256d tmp1 = Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()); // b3 b2 a3 a2
a.data() = _mm256_unpacklo_pd(tmp0, tmp1); // b2 b0 a2 a0
b.data() = _mm256_unpackhi_pd(tmp0, tmp1); // b3 b1 a3 a1
}
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const int *m, Flags align)
{
using namespace AVX;
a.load(m, align);
b.load(m + AVX2::int_v::Size, align);
const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0
const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2
a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0
b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1
}
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const short *m, Flags f)
{
using namespace Vc::AVX;
const AVX2::short_v tmp0(m, f);
const m256i tmp = tmp0.data();
a.data() = concat(
_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
_mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
b.data() = concat(
_mm_srai_epi32(lo128(tmp), 16),
_mm_srai_epi32(hi128(tmp), 16));
}
template <typename Flags>
inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned int *m, Flags align)
{
using namespace AVX;
a.load(m, align);
b.load(m + AVX2::uint_v::Size, align);
const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));
const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0
const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2
a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0
b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1
}
template <typename Flags>
inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned short *m, Flags f)
{
using namespace Vc::AVX;
const AVX2::ushort_v tmp0(m, f);
const m256i tmp = tmp0.data();
a.data() = concat(
_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa),  // even (a) elements, zero-extended
_mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa));
b.data() = concat(
_mm_srli_epi32(lo128(tmp), 16),                           // odd (b) elements, zero-extended via logical shift
_mm_srli_epi32(hi128(tmp), 16));
}
template <typename Flags>
inline void deinterleave(AVX2::short_v &a, AVX2::short_v &b, const short *m, Flags align)
{
a.load(m, align);
b.load(m + AVX2::short_v::Size, align);
Vc::AVX2::deinterleave(a, b);
}
template <typename Flags>
inline void deinterleave(AVX2::ushort_v &a, AVX2::ushort_v &b, const unsigned short *m, Flags align)
{
a.load(m, align);
b.load(m + AVX2::ushort_v::Size, align);
Vc::AVX2::deinterleave(a, b);
}
// only support M == V::EntryType -> no specialization
template <typename T, typename M, typename Flags>
Vc_ALWAYS_INLINE void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
const M *Vc_RESTRICT memory, Flags align)
{
using V = AVX2::Vector<T>;
a.load(&memory[0 * V::Size], align);
b.load(&memory[1 * V::Size], align);
c.load(&memory[2 * V::Size], align);
Vc::AVX2::deinterleave(a, b, c);
}
} // namespace Detail
} // namespace Vc
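As a standalone illustration of the float deinterleave pattern used above (a sketch, not part of this diff; the function name and the unaligned loads are assumptions), the same 128-bit lane shuffle followed by two unpack rounds can be written directly with AVX intrinsics:

#include <immintrin.h>

// Deinterleave 8 interleaved {a, b} float pairs (a0 b0 a1 b1 ...) into two
// __m256 registers, mirroring the shuffle128 + unpacklo/unpackhi steps above.
static void deinterleave_pairs_ps(const float *m, __m256 &a, __m256 &b)
{
    const __m256 lo = _mm256_loadu_ps(m);     // a0 b0 a1 b1 a2 b2 a3 b3
    const __m256 hi = _mm256_loadu_ps(m + 8); // a4 b4 a5 b5 a6 b6 a7 b7
    const __m256 t0 = _mm256_permute2f128_ps(lo, hi, 0x20); // a0 b0 a1 b1 | a4 b4 a5 b5
    const __m256 t1 = _mm256_permute2f128_ps(lo, hi, 0x31); // a2 b2 a3 b3 | a6 b6 a7 b7
    const __m256 t2 = _mm256_unpacklo_ps(t0, t1); // a0 a2 b0 b2 | a4 a6 b4 b6
    const __m256 t3 = _mm256_unpackhi_ps(t0, t1); // a1 a3 b1 b3 | a5 a7 b5 b7
    a = _mm256_unpacklo_ps(t2, t3); // a0 a1 a2 a3 | a4 a5 a6 a7
    b = _mm256_unpackhi_ps(t2, t3); // b0 b1 b2 b3 | b4 b5 b6 b7
}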

2303
Vc/avx/detail.h Normal file

File diff suppressed because it is too large.

119
Vc/avx/helperimpl.h Normal file

@ -0,0 +1,119 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_HELPERIMPL_H_
#define VC_AVX_HELPERIMPL_H_
#include "../sse/helperimpl.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const float *, A);
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::float_v &, AVX2::float_v &, const ushort *, A);
template <typename A>
inline void deinterleave(AVX2::double_v &, AVX2::double_v &, const double *, A);
template <typename A>
inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const int *, A);
template <typename A>
inline void deinterleave(AVX2::int_v &, AVX2::int_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const uint *, A);
template <typename A>
inline void deinterleave(AVX2::uint_v &, AVX2::uint_v &, const ushort *, A);
template <typename A>
inline void deinterleave(AVX2::short_v &, AVX2::short_v &, const short *, A);
template <typename A>
inline void deinterleave(AVX2::ushort_v &, AVX2::ushort_v &, const ushort *, A);
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
const M *Vc_RESTRICT memory,
A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
AVX2::Vector<T> &Vc_RESTRICT d,
const M *Vc_RESTRICT memory,
A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c,
AVX2::Vector<T> &Vc_RESTRICT d,
AVX2::Vector<T> &Vc_RESTRICT e,
const M *Vc_RESTRICT memory,
A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(
AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
template <typename T, typename M, typename A>
Vc_ALWAYS_INLINE_L void deinterleave(
AVX2::Vector<T> &Vc_RESTRICT a, AVX2::Vector<T> &Vc_RESTRICT b,
AVX2::Vector<T> &Vc_RESTRICT c, AVX2::Vector<T> &Vc_RESTRICT d,
AVX2::Vector<T> &Vc_RESTRICT e, AVX2::Vector<T> &Vc_RESTRICT f,
AVX2::Vector<T> &Vc_RESTRICT g, AVX2::Vector<T> &Vc_RESTRICT h,
const M *Vc_RESTRICT memory, A align) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr, VectorAbi::Avx)
{
prefetchForOneRead(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchForModify(const void *addr, VectorAbi::Avx)
{
prefetchForModify(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchClose(const void *addr, VectorAbi::Avx)
{
prefetchClose(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchMid(const void *addr, VectorAbi::Avx)
{
prefetchMid(addr, VectorAbi::Sse());
}
Vc_ALWAYS_INLINE void prefetchFar(const void *addr, VectorAbi::Avx)
{
prefetchFar(addr, VectorAbi::Sse());
}
} // namespace Detail
} // namespace Vc
#include "deinterleave.tcc"
#endif // VC_AVX_HELPERIMPL_H_
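The prefetch overloads above simply forward the AVX ABI tag to the SSE implementation. As a rough standalone sketch of what such locality hints typically reduce to (the exact hint mapping used by Vc's SSE layer is not shown in this diff, so the choices below are assumptions):

#include <xmmintrin.h>

// Assumed mapping of the close/mid/far/one-read hints onto _mm_prefetch locality hints.
static inline void prefetch_close(const void *addr)    { _mm_prefetch(static_cast<const char *>(addr), _MM_HINT_T0); }
static inline void prefetch_mid(const void *addr)      { _mm_prefetch(static_cast<const char *>(addr), _MM_HINT_T1); }
static inline void prefetch_far(const void *addr)      { _mm_prefetch(static_cast<const char *>(addr), _MM_HINT_T2); }
static inline void prefetch_one_read(const void *addr) { _mm_prefetch(static_cast<const char *>(addr), _MM_HINT_NTA); }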

670
Vc/avx/intrinsics.h Normal file

@ -0,0 +1,670 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_INTRINSICS_H_
#define VC_AVX_INTRINSICS_H_
#include "../global.h"
#include "../traits/type_traits.h"
// see comment in sse/intrinsics.h
extern "C" {
// AVX
#include <immintrin.h>
#if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC)
#include <x86intrin.h>
#endif
}
#include "../common/fix_clang_emmintrin.h"
#include "const_data.h"
#include "../common/types.h"
#include "macros.h"
#include <cstdlib>
#if (defined Vc_CLANG && Vc_CLANG >= 0x30900 && Vc_CLANG < 0x70000)
#ifdef _mm256_permute2f128_si256
#undef _mm256_permute2f128_si256
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (char)(M)); })
#endif
#ifdef _mm256_permute2f128_ps
#undef _mm256_permute2f128_ps
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (char)(M)); })
#endif
#ifdef _mm256_permute2x128_si256
#undef _mm256_permute2x128_si256
#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); })
#endif
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace AvxIntrinsics
{
using AVX::c_general;
using AVX::_IndexesFromZero32;
using AVX::_IndexesFromZero16;
using AVX::_IndexesFromZero8;
typedef __m128 m128 ;
typedef __m128d m128d;
typedef __m128i m128i;
typedef __m256 m256 ;
typedef __m256d m256d;
typedef __m256i m256i;
#ifdef Vc_GCC
// Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
// functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) * static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) + static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) - static_cast<__v4df>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); }
static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); }
#endif
static Vc_INTRINSIC m256d Vc_CONST set1_pd (double a) { return _mm256_set1_pd (a); }
static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int a) { return _mm256_set1_epi32(a); }
static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128 _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet))); }
static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST m256 setallone_ps() { return _mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi8 () { return _mm256_set1_epi8(1); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu8 () { return setone_epi8(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::one16))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu16() { return setone_epi16(); }
static Vc_INTRINSIC m256i Vc_CONST setone_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setone_epu32() { return setone_epi32(); }
static Vc_INTRINSIC m256 Vc_CONST setone_ps() { return _mm256_broadcast_ss(&c_general::oneFloat); }
static Vc_INTRINSIC m256d Vc_CONST setone_pd() { return _mm256_broadcast_sd(&c_general::oneDouble); }
static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::absMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::absMaskFloat[1])); }
static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::signMaskFloat[0])); }
static Vc_INTRINSIC m256 Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1])); }
static Vc_INTRINSIC m256 Vc_CONST set2power31_ps() { return _mm256_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m128 Vc_CONST _mm_set2power31_ps() { return _mm_broadcast_ss(&c_general::_2power31); }
static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
template <int i>
static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x)
{
return _mm_extract_epi32(x, i);
}
template <int offset> Vc_INTRINSIC __m256 insert128(__m256 a, __m128 b) { return _mm256_insertf128_ps(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); }
template <int offset> Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) {
#ifdef Vc_IMPL_AVX2
return _mm256_inserti128_si256(a, b, offset);
#else
return _mm256_insertf128_si256(a, b, offset);
#endif
}
template <int offset> Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); }
template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
#ifdef Vc_IMPL_AVX2
return _mm256_extracti128_si256(a, offset);
#else
return _mm256_extractf128_si256(a, offset);
#endif
}
/////////////////////// COMPARE OPS ///////////////////////
#ifdef Vc_GCC
// GCC needs builtin compare operators to enable constant folding
Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a == b); }
Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a != b); }
Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a < b); }
Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a >= b); }
Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a <= b); }
Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a > b); }
Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a == b); }
Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a != b); }
Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a < b); }
Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a >= b); }
Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a <= b); }
Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return reinterpret_cast<__m256 >(a > b); }
#else
Vc_INTRINSIC __m256d cmpeq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
Vc_INTRINSIC __m256d cmpneq_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
Vc_INTRINSIC __m256d cmplt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
Vc_INTRINSIC __m256d cmpge_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
Vc_INTRINSIC __m256d cmple_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
Vc_INTRINSIC __m256d cmpgt_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
Vc_INTRINSIC __m256 cmpeq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
Vc_INTRINSIC __m256 cmpneq_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
Vc_INTRINSIC __m256 cmplt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
Vc_INTRINSIC __m256 cmpge_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
Vc_INTRINSIC __m256 cmple_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
Vc_INTRINSIC __m256 cmpgt_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
#endif
Vc_INTRINSIC __m256d cmpnlt_pd (__m256d a, __m256d b) { return cmpge_pd(a, b); }
Vc_INTRINSIC __m256d cmpnle_pd (__m256d a, __m256d b) { return cmpgt_pd(a, b); }
Vc_INTRINSIC __m256 cmpnlt_ps (__m256 a, __m256 b) { return cmpge_ps(a, b); }
Vc_INTRINSIC __m256 cmpnle_ps (__m256 a, __m256 b) { return cmpgt_ps(a, b); }
Vc_INTRINSIC __m256d cmpord_pd (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); }
Vc_INTRINSIC __m256d cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); }
Vc_INTRINSIC __m256 cmpord_ps (__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); }
Vc_INTRINSIC __m256 cmpunord_ps(__m256 a, __m256 b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); }
#if defined(Vc_IMPL_XOP)
static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
return _mm_comlt_epu16(a, b);
}
static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
return _mm_comgt_epu16(a, b);
}
#else
static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
}
static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
}
#endif
#ifdef Vc_IMPL_AVX2
template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
{
return _mm256_alignr_epi8(s1, s2, shift);
}
#else
template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
{
return insert128<1>(
_mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1),
_mm256_castsi256_si128(s2), shift)),
_mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
}
#endif
#ifdef Vc_IMPL_AVX2
#define Vc_AVX_TO_SSE_2_NEW(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
{ \
return _mm256_##name(a0, b0); \
}
#define Vc_AVX_TO_SSE_256_128(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
{ \
return _mm256_##name(a0, b0); \
}
#define Vc_AVX_TO_SSE_1i(name) \
template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
{ \
return _mm256_##name(a0, i); \
}
#define Vc_AVX_TO_SSE_1(name) \
Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); }
#define Vc_AVX_TO_SSE_1_128(name, shift__) \
Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); }
#else
/**\internal
 * Defines the function \p name, which takes two __m256i arguments and calls `_mm_##name` on the
 * low and high 128-bit halves of the arguments.
*
* In case the AVX2 intrinsics are enabled, the arguments are directly passed to a single
* `_mm256_##name` call.
*/
#define Vc_AVX_TO_SSE_1(name) \
Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \
{ \
__m128i a1 = extract128<1>(a0); \
__m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \
__m128i r1 = _mm_##name(a1); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_1_128(name, shift__) \
Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) \
{ \
__m128i r0 = _mm_##name(a0); \
__m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__)); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_2_NEW(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i b1 = extract128<1>(b0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
m128i r1 = _mm_##name(a1, b1); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_256_128(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \
m128i r1 = _mm_##name(a1, b0); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_1i(name) \
template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \
m128i r1 = _mm_##name(a1, i); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#endif
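// Illustration (not part of this header): what one expansion of
// Vc_AVX_TO_SSE_2_NEW produces on the AVX-only path, e.g. for add_epi16. The
// 256-bit operands are split into 128-bit halves, processed with the SSE
// intrinsic, and reassembled with insert128. The "_illustration" name is made
// up to avoid clashing with the real expansion below.
Vc_INTRINSIC Vc_CONST m256i add_epi16_illustration(__m256i a0, __m256i b0)
{
    m128i a1 = extract128<1>(a0);
    m128i b1 = extract128<1>(b0);
    m128i r0 = _mm_add_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0));
    m128i r1 = _mm_add_epi16(a1, b1);
    return insert128<1>(_mm256_castsi128_si256(r0), r1);
}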
Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); }
Vc_AVX_TO_SSE_1i(slli_epi16)
Vc_AVX_TO_SSE_1i(slli_epi32)
Vc_AVX_TO_SSE_1i(slli_epi64)
Vc_AVX_TO_SSE_1i(srai_epi16)
Vc_AVX_TO_SSE_1i(srai_epi32)
Vc_AVX_TO_SSE_1i(srli_epi16)
Vc_AVX_TO_SSE_1i(srli_epi32)
Vc_AVX_TO_SSE_1i(srli_epi64)
Vc_AVX_TO_SSE_256_128(sll_epi16)
Vc_AVX_TO_SSE_256_128(sll_epi32)
Vc_AVX_TO_SSE_256_128(sll_epi64)
Vc_AVX_TO_SSE_256_128(srl_epi16)
Vc_AVX_TO_SSE_256_128(srl_epi32)
Vc_AVX_TO_SSE_256_128(srl_epi64)
Vc_AVX_TO_SSE_256_128(sra_epi16)
Vc_AVX_TO_SSE_256_128(sra_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32)
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi32)
Vc_AVX_TO_SSE_2_NEW(add_epi64)
Vc_AVX_TO_SSE_2_NEW(sub_epi16)
Vc_AVX_TO_SSE_2_NEW(sub_epi32)
Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi32)
Vc_AVX_TO_SSE_2_NEW(min_epi8)
Vc_AVX_TO_SSE_2_NEW(max_epi8)
Vc_AVX_TO_SSE_2_NEW(min_epu16)
Vc_AVX_TO_SSE_2_NEW(max_epu16)
Vc_AVX_TO_SSE_2_NEW(min_epi32)
Vc_AVX_TO_SSE_2_NEW(max_epi32)
Vc_AVX_TO_SSE_2_NEW(min_epu32)
Vc_AVX_TO_SSE_2_NEW(max_epu32)
Vc_AVX_TO_SSE_2_NEW(mullo_epi32)
Vc_AVX_TO_SSE_1(abs_epi8)
Vc_AVX_TO_SSE_1(abs_epi16)
Vc_AVX_TO_SSE_1(abs_epi32)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4)
Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2)
Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8)
Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4)
Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4)
Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2)
Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8)
Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4)
Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8)
#ifndef Vc_IMPL_AVX2
/////////////////////////////////////////////////////////////////////////
// implementation of the intrinsics missing in AVX
/////////////////////////////////////////////////////////////////////////
static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) {
return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
m128i a1 = extract128<1>(a0);
return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
}
template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0)
{
m128i a1 = extract128<1>(a0);
m128i b1 = extract128<1>(b0);
m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
m128i r1 = _mm_blend_epi16(a1, b1, m >> 8);
return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) {
m128i a1 = extract128<1>(a0);
m128i b1 = extract128<1>(b0);
m128i m1 = extract128<1>(m0);
m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
m128i r1 = _mm_blendv_epi8(a1, b1, m1);
return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
// mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
#else // Vc_IMPL_AVX2
static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); }
/////////////////////////////////////////////////////////////////////////
// implementation of the intrinsics missing in AVX2
/////////////////////////////////////////////////////////////////////////
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0)
{
return _mm256_blendv_epi8(a0, b0, m0);
}
Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
return _mm256_movemask_epi8(a0);
}
#endif // Vc_IMPL_AVX2
/////////////////////////////////////////////////////////////////////////
// implementation of intrinsics missing in AVX and AVX2
/////////////////////////////////////////////////////////////////////////
static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) {
return cmpgt_epi64(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) {
return cmpgt_epi32(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) {
return cmpgt_epi16(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) {
return cmpgt_epi8(b, a);
}
static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) {
return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8()));
}
#if defined(Vc_IMPL_XOP)
Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
Vc_AVX_TO_SSE_2_NEW(comlt_epu16)
Vc_AVX_TO_SSE_2_NEW(comgt_epu16)
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); }
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); }
#else
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
return cmplt_epi32(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
return cmpgt_epi32(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
return cmplt_epi16(a, b);
}
static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) {
m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
return cmpgt_epi16(a, b);
}
#endif
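// Illustration (not part of this header): the unsigned comparisons above use
// the standard bias trick. XOR-ing both operands with the most negative value
// flips the sign bit, which maps unsigned order onto signed order, so the
// signed compare instruction can be reused: a <u b  <=>  (a ^ 0x8000) <s (b ^ 0x8000).
// Minimal 128-bit sketch of the same idea (name is illustrative):
static Vc_INTRINSIC m128i cmplt_epu16_sketch(__m128i a, __m128i b)
{
    const __m128i bias = _mm_set1_epi16(-0x8000); // 0x8000 in every 16-bit lane
    return _mm_cmplt_epi16(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
}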
static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) {
_mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) {
_mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) {
#ifdef Vc_IMPL_AVX2
_mm256_maskstore_epi32(mem, mask, v);
#else
_mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
#endif
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) {
_mm256_maskstore(reinterpret_cast<int *>(mem), mask, v);
}
static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) {
using namespace AVX;
_mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast<char *>(&mem[0]));
_mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast<char *>(&mem[8]));
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) {
_mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
}
#undef Vc_AVX_TO_SSE_1
#undef Vc_AVX_TO_SSE_1_128
#undef Vc_AVX_TO_SSE_2_NEW
#undef Vc_AVX_TO_SSE_256_128
#undef Vc_AVX_TO_SSE_1i
template<typename R> Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128 stream_load<m128>(const float *mem)
{
return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
}
template<> Vc_INTRINSIC m256 stream_load<m256>(const float *mem)
{
return insert128<1>(_mm256_castps128_ps256(stream_load<m128>(mem)),
stream_load<m128>(mem + 4));
}
template<typename R> Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128d stream_load<m128d>(const double *mem)
{
return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
}
template<> Vc_INTRINSIC m256d stream_load<m256d>(const double *mem)
{
return insert128<1>(_mm256_castpd128_pd256(stream_load<m128d>(mem)),
stream_load<m128d>(mem + 2));
}
template<typename R> Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128i stream_load<m128i>(const void *mem)
{
return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<void *>(mem)));
}
template<> Vc_INTRINSIC m256i stream_load<m256i>(const void *mem)
{
return insert128<1>(_mm256_castsi128_si256(stream_load<m128i>(mem)),
stream_load<m128i>(static_cast<const __m128i *>(mem) + 1));
}
Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
{
_mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
{
stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask));
stream_store(mem + 4, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
{
_mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
{
stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask));
stream_store(mem + 2, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
{
_mm_maskmoveu_si128(value, mask, reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
{
stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask));
stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask));
}
#ifndef __x86_64__
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
}
#endif
#ifdef Vc_IMPL_AVX2
template <int Scale> __m256 gather(const float *addr, __m256i idx)
{
return _mm256_i32gather_ps(addr, idx, Scale);
}
template <int Scale> __m256d gather(const double *addr, __m128i idx)
{
return _mm256_i32gather_pd(addr, idx, Scale);
}
template <int Scale> __m256i gather(const int *addr, __m256i idx)
{
return _mm256_i32gather_epi32(addr, idx, Scale);
}
template <int Scale> __m256i gather(const unsigned *addr, __m256i idx)
{
return _mm256_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
}
template <int Scale> __m256 gather(__m256 src, __m256 k, const float *addr, __m256i idx)
{
return _mm256_mask_i32gather_ps(src, addr, idx, k, Scale);
}
template <int Scale>
__m256d gather(__m256d src, __m256d k, const double *addr, __m128i idx)
{
return _mm256_mask_i32gather_pd(src, addr, idx, k, Scale);
}
template <int Scale> __m256i gather(__m256i src, __m256i k, const int *addr, __m256i idx)
{
return _mm256_mask_i32gather_epi32(src, addr, idx, k, Scale);
}
template <int Scale>
__m256i gather(__m256i src, __m256i k, const unsigned *addr, __m256i idx)
{
return _mm256_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
}
#endif
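#ifdef Vc_IMPL_AVX2
// Illustration (not part of this header): with AVX2 the gather wrappers above
// map directly onto the hardware gather instruction. The Scale template
// parameter is the byte stride applied to each 32-bit index (4 for float).
// The function name is assumed for this sketch.
static Vc_INTRINSIC __m256 gather8_floats(const float *base, __m256i idx)
{
    return gather<4>(base, idx); // equivalent to _mm256_i32gather_ps(base, idx, 4)
}
#endif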
} // namespace AvxIntrinsics
} // namespace Vc
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
using namespace AvxIntrinsics;
} // namespace AVX
namespace AVX2
{
using namespace AvxIntrinsics;
} // namespace AVX2
namespace AVX
{
template<typename T> struct VectorTypeHelper;
template<> struct VectorTypeHelper< char > { typedef __m256i Type; };
template<> struct VectorTypeHelper< signed char > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned char > { typedef __m256i Type; };
template<> struct VectorTypeHelper< short> { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned short> { typedef __m256i Type; };
template<> struct VectorTypeHelper< int > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned int > { typedef __m256i Type; };
template<> struct VectorTypeHelper< long > { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned long > { typedef __m256i Type; };
template<> struct VectorTypeHelper< long long> { typedef __m256i Type; };
template<> struct VectorTypeHelper<unsigned long long> { typedef __m256i Type; };
template<> struct VectorTypeHelper< float> { typedef __m256 Type; };
template<> struct VectorTypeHelper< double> { typedef __m256d Type; };
template <typename T>
using IntegerVectorType =
typename std::conditional<sizeof(T) == 16, __m128i, __m256i>::type;
template <typename T>
using DoubleVectorType =
typename std::conditional<sizeof(T) == 16, __m128d, __m256d>::type;
template <typename T>
using FloatVectorType =
typename std::conditional<sizeof(T) == 16, __m128, __m256>::type;
template<typename T> struct VectorHelper {};
template<typename T> struct VectorHelperSize;
} // namespace AVX
} // namespace Vc
#endif // VC_AVX_INTRINSICS_H_

87
Vc/avx/limits.h Normal file

@ -0,0 +1,87 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_LIMITS_H_
#define VC_AVX_LIMITS_H_
#include "intrinsics.h"
#include "types.h"
#include "macros.h"
namespace std
{
#define Vc_NUM_LIM(T, _max, _min) \
template <> struct numeric_limits<Vc::AVX2::Vector<T>> : public numeric_limits<T> { \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> max() Vc_NOEXCEPT \
{ \
return _max; \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> min() Vc_NOEXCEPT \
{ \
return _min; \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> lowest() Vc_NOEXCEPT \
{ \
return min(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> epsilon() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> round_error() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> infinity() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> quiet_NaN() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> signaling_NaN() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
static Vc_INTRINSIC Vc_CONST Vc::AVX2::Vector<T> denorm_min() Vc_NOEXCEPT \
{ \
return Vc::AVX2::Vector<T>::Zero(); \
} \
}
#ifdef Vc_IMPL_AVX2
Vc_NUM_LIM(unsigned short, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM( short, _mm256_srli_epi16(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi16());
Vc_NUM_LIM( unsigned int, Vc::Detail::allone<__m256i>(), Vc::Detail::zero<__m256i>());
Vc_NUM_LIM( int, _mm256_srli_epi32(Vc::Detail::allone<__m256i>(), 1), Vc::AVX::setmin_epi32());
#endif
#undef Vc_NUM_LIM
} // namespace std
#endif // VC_AVX_LIMITS_H_
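A hypothetical usage sketch for the specializations above (assuming an AVX2 build in which Vc::int_v is the AVX2 int vector; the function name is made up for the example):

#include <limits>
#include <Vc/Vc>

// Each specialization broadcasts the scalar limit into every lane.
inline Vc::int_v int_limits_demo()
{
    const Vc::int_v hi = std::numeric_limits<Vc::int_v>::max(); // 0x7fffffff in every lane
    const Vc::int_v lo = std::numeric_limits<Vc::int_v>::min(); // 0x80000000 in every lane
    return hi + lo; // wraps to -1 in every lane
}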

33
Vc/avx/macros.h Normal file

@ -0,0 +1,33 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#include "../common/macros.h"
#ifndef VC_AVX_MACROS_H_
#define VC_AVX_MACROS_H_
#endif // VC_AVX_MACROS_H_

235
Vc/avx/mask.h Normal file

@ -0,0 +1,235 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_MASK_H_
#define VC_AVX_MASK_H_
#include <array>
#include "intrinsics.h"
#include "../common/storage.h"
#include "../common/bitscanintrinsics.h"
#include "../common/maskbool.h"
#include "detail.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
template <typename T> class Mask<T, VectorAbi::Avx>
{
public:
using abi = VectorAbi::Avx;
/**
* The \c EntryType of masks is always bool, independent of \c T.
*/
typedef bool EntryType;
using value_type = EntryType;
using MaskBool = Common::MaskBool<sizeof(T)>;
/**
* The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD
* implementation. This type is useful for the \c sizeof operator in generic functions.
*/
using VectorEntryType = MaskBool;
/**
* The associated Vector<T> type.
*/
using Vector = AVX2::Vector<T>;
///\internal
using VectorTypeF = AVX::FloatVectorType<typename AVX::VectorTypeHelper<T>::Type>;
///\internal
using VectorTypeD = AVX::DoubleVectorType<VectorTypeF>;
///\internal
using VectorTypeI = AVX::IntegerVectorType<VectorTypeF>;
private:
typedef const VectorTypeF VArg;
typedef const VectorTypeD VdArg;
typedef const VectorTypeI ViArg;
public:
static constexpr size_t Size = sizeof(VectorTypeF) / sizeof(T);
static constexpr size_t MemoryAlignment = Size;
static constexpr std::size_t size() { return Size; }
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
private:
typedef Common::Storage<T, Size> Storage;
public:
/**
* The \c VectorType reveals the implementation-specific internal type used for the
* SIMD type.
*/
using VectorType = typename Storage::VectorType;
using EntryReference = Vc::Detail::ElementReference<Mask>;
using reference = EntryReference;
// abstracts the way Masks are passed to functions; it can easily be changed to a const ref here
#if defined Vc_MSVC && defined _WIN32
typedef const Mask &AsArg;
#else
typedef const Mask AsArg;
#endif
Vc_INTRINSIC Mask() {}
Vc_INTRINSIC Mask(VArg x) : d(AVX::avx_cast<VectorType>(x)) {}
Vc_INTRINSIC Mask(VdArg x) : d(AVX::avx_cast<VectorType>(x)) {}
Vc_INTRINSIC Mask(ViArg x) : d(AVX::avx_cast<VectorType>(x)) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero) : d(Detail::zero<VectorType>()) {}
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne) : d(Detail::allone<VectorType>()) {}
Vc_INTRINSIC explicit Mask(bool b)
: d(b ? Detail::allone<VectorType>() : Detail::zero<VectorType>())
{
}
Vc_INTRINSIC static Mask Zero() { return Mask{Vc::Zero}; }
Vc_INTRINSIC static Mask One() { return Mask{Vc::One}; }
// implicit cast
template <typename U>
Vc_INTRINSIC Mask(
U &&rhs, Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg)
: d(AVX::avx_cast<VectorType>(
Detail::mask_cast<Traits::decay<U>::Size, Size, VectorTypeF>(
rhs.dataI())))
{
}
#if Vc_IS_VERSION_1
// explicit cast, implemented via simd_cast (in avx/simd_cast_caller.h)
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"mask types") Vc_INTRINSIC
explicit Mask(U &&rhs,
Common::enable_if_mask_converts_explicitly<T, U> = nullarg);
#endif
template<typename Flags = DefaultLoadTag> Vc_INTRINSIC explicit Mask(const bool *mem, Flags f = Flags()) { load(mem, f); }
template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void load(const bool *mem, Flags = Flags());
template<typename Flags = DefaultLoadTag> Vc_INTRINSIC void store(bool *mem, Flags = Flags()) const;
Vc_INTRINSIC Mask &operator=(const Mask &) = default;
Vc_INTRINSIC_L Mask &operator=(const std::array<bool, Size> &values) Vc_INTRINSIC_R;
Vc_INTRINSIC_L operator std::array<bool, Size>() const Vc_INTRINSIC_R;
// specializations in mask.tcc
Vc_INTRINSIC Vc_PURE bool operator==(const Mask &rhs) const
{ return Detail::movemask(d.v()) == Detail::movemask(rhs.d.v()); }
Vc_INTRINSIC Vc_PURE bool operator!=(const Mask &rhs) const
{ return !operator==(rhs); }
Vc_INTRINSIC Mask operator!() const
{
#ifdef Vc_GCC
return ~dataI();
#else
return Detail::andnot_(dataF(), Detail::allone<VectorTypeF>());
#endif
}
Vc_INTRINSIC Mask &operator&=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::and_(data(), rhs.data())); return *this; }
Vc_INTRINSIC Mask &operator|=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::or_ (data(), rhs.data())); return *this; }
Vc_INTRINSIC Mask &operator^=(const Mask &rhs) { d.v() = AVX::avx_cast<VectorType>(Detail::xor_(data(), rhs.data())); return *this; }
Vc_INTRINSIC Vc_PURE Mask operator&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator|(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator^(const Mask &rhs) const { return Detail::xor_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator&&(const Mask &rhs) const { return Detail::and_(data(), rhs.data()); }
Vc_INTRINSIC Vc_PURE Mask operator||(const Mask &rhs) const { return Detail::or_(data(), rhs.data()); }
// no need for expression template optimizations because cmp(n)eq for floats are not bitwise
// compares
Vc_INTRINSIC_L bool isNotEmpty() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L bool isEmpty() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L bool isFull() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L bool isMix() const Vc_INTRINSIC_R;
Vc_INTRINSIC Vc_PURE int shiftMask() const { return Detail::movemask(dataI()); }
Vc_INTRINSIC Vc_PURE int toInt() const { return Detail::mask_to_int<Size>(dataI()); }
Vc_INTRINSIC VectorType data () const { return d.v(); }
Vc_INTRINSIC VectorTypeF dataF() const { return AVX::avx_cast<VectorTypeF>(d.v()); }
Vc_INTRINSIC VectorTypeI dataI() const { return AVX::avx_cast<VectorTypeI>(d.v()); }
Vc_INTRINSIC VectorTypeD dataD() const { return AVX::avx_cast<VectorTypeD>(d.v()); }
private:
friend reference;
static Vc_INTRINSIC Vc_PURE value_type get(const Mask &m, int i) noexcept
{
return m.toInt() & (1 << i);
}
template <typename U>
static Vc_INTRINSIC void set(Mask &m, int i,
U &&v) noexcept(noexcept(MaskBool(std::declval<U>())))
{
m.d.set(i, MaskBool(std::forward<U>(v)));
}
public:
/**
* \note the returned object models the concept of a reference and
* as such it can exist longer than the data it is referencing.
* \note to avoid lifetime issues, we strongly advise not to store
* any reference objects.
*/
Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
{
return {*this, int(index)};
}
Vc_ALWAYS_INLINE Vc_PURE value_type operator[](size_t index) const noexcept
{
return get(*this, index);
}
Vc_INTRINSIC Vc_PURE int count() const { return Detail::popcnt16(toInt()); }
Vc_INTRINSIC Vc_PURE int firstOne() const { return _bit_scan_forward(toInt()); }
template <typename G> static Vc_INTRINSIC_L Mask generate(G &&gen) Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vc_PURE_L Mask shifted(int amount) const Vc_INTRINSIC_R Vc_PURE_R;
private:
#ifdef Vc_COMPILE_BENCHMARKS
public:
#endif
Storage d;
};
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Mask<T, VectorAbi::Avx>::MemoryAlignment;
} // namespace Vc
#include "mask.tcc"
#endif // VC_AVX_MASK_H_
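A hypothetical usage sketch for the mask interface declared above (assuming an AVX build in which Vc::float_v and Vc::float_m map to these types; the function name is made up for the example):

#include <Vc/Vc>

// Count how many lanes of v are positive: the comparison yields a mask, and
// isEmpty()/count() from the class above query it.
inline int count_positive(const Vc::float_v &v)
{
    const Vc::float_m m = v > Vc::float_v::Zero();
    if (m.isEmpty()) {
        return 0;
    }
    return m.count();
}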

292
Vc/avx/mask.tcc Normal file

@ -0,0 +1,292 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
namespace Vc_VERSIONED_NAMESPACE
{
// store {{{1
template <typename T>
template <typename Flags>
Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::store(bool *mem, Flags f) const
{
Detail::mask_store<Size>(dataI(), mem, f);
}
// load {{{1
template <typename T>
template <typename Flags>
Vc_INTRINSIC void Mask<T, VectorAbi::Avx>::load(const bool *mem, Flags f)
{
d.v() = AVX::avx_cast<VectorType>(Detail::mask_load<VectorTypeF, Size>(mem, f));
}
// operator[] {{{1
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE bool AVX2::Mask<int16_t>::get(const AVX2::Mask<int16_t> &m,
int index) noexcept
{
return m.shiftMask() & (1 << 2 * index);
}
template <>
Vc_INTRINSIC Vc_PURE bool AVX2::Mask<uint16_t>::get(const AVX2::Mask<uint16_t> &m,
int index) noexcept
{
return m.shiftMask() & (1 << 2 * index);
}
#endif
// operator== {{{1
template <> Vc_INTRINSIC Vc_PURE bool AVX2::double_m::operator==(const AVX2::double_m &rhs) const
{ return Detail::movemask(dataD()) == Detail::movemask(rhs.dataD()); }
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC Vc_PURE bool AVX2::short_m::operator==(const AVX2::short_m &rhs) const
{ return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
template <> Vc_INTRINSIC Vc_PURE bool AVX2::ushort_m::operator==(const AVX2::ushort_m &rhs) const
{ return Detail::movemask(dataI()) == Detail::movemask(rhs.dataI()); }
#endif
// isFull, isNotEmpty, isEmpty, isMix specializations{{{1
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isFull() const {
if (sizeof(T) == 8) {
return 0 != Detail::testc(dataD(), Detail::allone<VectorTypeD>());
} else if (sizeof(T) == 4) {
return 0 != Detail::testc(dataF(), Detail::allone<VectorTypeF>());
} else {
return 0 != Detail::testc(dataI(), Detail::allone<VectorTypeI>());
}
}
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isNotEmpty() const {
if (sizeof(T) == 8) {
return 0 == Detail::testz(dataD(), dataD());
} else if (sizeof(T) == 4) {
return 0 == Detail::testz(dataF(), dataF());
} else {
return 0 == Detail::testz(dataI(), dataI());
}
}
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isEmpty() const {
if (sizeof(T) == 8) {
return 0 != Detail::testz(dataD(), dataD());
} else if (sizeof(T) == 4) {
return 0 != Detail::testz(dataF(), dataF());
} else {
return 0 != Detail::testz(dataI(), dataI());
}
}
template <typename T> Vc_INTRINSIC bool Mask<T, VectorAbi::Avx>::isMix() const {
if (sizeof(T) == 8) {
return 0 != Detail::testnzc(dataD(), Detail::allone<VectorTypeD>());
} else if (sizeof(T) == 4) {
return 0 != Detail::testnzc(dataF(), Detail::allone<VectorTypeF>());
} else {
return 0 != Detail::testnzc(dataI(), Detail::allone<VectorTypeI>());
}
}
// generate {{{1
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4 + 32>)
{
return _mm256_setr_epi64x(
gen(0) ? 0xffffffffffffffffull : 0, gen(1) ? 0xffffffffffffffffull : 0,
gen(2) ? 0xffffffffffffffffull : 0, gen(3) ? 0xffffffffffffffffull : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8 + 32>)
{
return _mm256_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0,
gen(4) ? 0xfffffffful : 0, gen(5) ? 0xfffffffful : 0,
gen(6) ? 0xfffffffful : 0, gen(7) ? 0xfffffffful : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 16 + 32>)
{
return _mm256_setr_epi16(gen(0) ? 0xfffful : 0, gen(1) ? 0xfffful : 0,
gen(2) ? 0xfffful : 0, gen(3) ? 0xfffful : 0,
gen(4) ? 0xfffful : 0, gen(5) ? 0xfffful : 0,
gen(6) ? 0xfffful : 0, gen(7) ? 0xfffful : 0,
gen(8) ? 0xfffful : 0, gen(9) ? 0xfffful : 0,
gen(10) ? 0xfffful : 0, gen(11) ? 0xfffful : 0,
gen(12) ? 0xfffful : 0, gen(13) ? 0xfffful : 0,
gen(14) ? 0xfffful : 0, gen(15) ? 0xfffful : 0);
}
template <typename T>
template <typename G>
Vc_INTRINSIC AVX2::Mask<T> Mask<T, VectorAbi::Avx>::generate(G &&gen)
{
return generate_impl<AVX2::Mask<T>>(std::forward<G>(gen),
std::integral_constant<int, Size + sizeof(Storage)>());
}
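// Illustration (not part of this file): generate() above evaluates the
// callable for each lane index and builds the corresponding all-ones/all-zero
// lane pattern. For example, a mask that is true for the even lanes only
// (the function name is assumed for the sketch):
inline AVX2::float_m make_even_lanes_mask()
{
    return AVX2::float_m::generate([](int i) { return (i & 1) == 0; });
}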
// shifted {{{1
template <typename T> Vc_INTRINSIC Vc_PURE AVX2::Mask<T> Mask<T, VectorAbi::Avx>::shifted(int amount) const
{
switch (amount * int(sizeof(VectorEntryType))) {
case 0: return *this;
case 1: return Detail::shifted< 1>(dataI());
case 2: return Detail::shifted< 2>(dataI());
case 3: return Detail::shifted< 3>(dataI());
case 4: return Detail::shifted< 4>(dataI());
case 5: return Detail::shifted< 5>(dataI());
case 6: return Detail::shifted< 6>(dataI());
case 7: return Detail::shifted< 7>(dataI());
case 8: return Detail::shifted< 8>(dataI());
case 9: return Detail::shifted< 9>(dataI());
case 10: return Detail::shifted< 10>(dataI());
case 11: return Detail::shifted< 11>(dataI());
case 12: return Detail::shifted< 12>(dataI());
case 13: return Detail::shifted< 13>(dataI());
case 14: return Detail::shifted< 14>(dataI());
case 15: return Detail::shifted< 15>(dataI());
case 16: return Detail::shifted< 16>(dataI());
case 17: return Detail::shifted< 17>(dataI());
case 18: return Detail::shifted< 18>(dataI());
case 19: return Detail::shifted< 19>(dataI());
case 20: return Detail::shifted< 20>(dataI());
case 21: return Detail::shifted< 21>(dataI());
case 22: return Detail::shifted< 22>(dataI());
case 23: return Detail::shifted< 23>(dataI());
case 24: return Detail::shifted< 24>(dataI());
case 25: return Detail::shifted< 25>(dataI());
case 26: return Detail::shifted< 26>(dataI());
case 27: return Detail::shifted< 27>(dataI());
case 28: return Detail::shifted< 28>(dataI());
case 29: return Detail::shifted< 29>(dataI());
case 30: return Detail::shifted< 30>(dataI());
case 31: return Detail::shifted< 31>(dataI());
case -1: return Detail::shifted< -1>(dataI());
case -2: return Detail::shifted< -2>(dataI());
case -3: return Detail::shifted< -3>(dataI());
case -4: return Detail::shifted< -4>(dataI());
case -5: return Detail::shifted< -5>(dataI());
case -6: return Detail::shifted< -6>(dataI());
case -7: return Detail::shifted< -7>(dataI());
case -8: return Detail::shifted< -8>(dataI());
case -9: return Detail::shifted< -9>(dataI());
case -10: return Detail::shifted<-10>(dataI());
case -11: return Detail::shifted<-11>(dataI());
case -12: return Detail::shifted<-12>(dataI());
case -13: return Detail::shifted<-13>(dataI());
case -14: return Detail::shifted<-14>(dataI());
case -15: return Detail::shifted<-15>(dataI());
case -16: return Detail::shifted<-16>(dataI());
case -17: return Detail::shifted<-17>(dataI());
case -18: return Detail::shifted<-18>(dataI());
case -19: return Detail::shifted<-19>(dataI());
case -20: return Detail::shifted<-20>(dataI());
case -21: return Detail::shifted<-21>(dataI());
case -22: return Detail::shifted<-22>(dataI());
case -23: return Detail::shifted<-23>(dataI());
case -24: return Detail::shifted<-24>(dataI());
case -25: return Detail::shifted<-25>(dataI());
case -26: return Detail::shifted<-26>(dataI());
case -27: return Detail::shifted<-27>(dataI());
case -28: return Detail::shifted<-28>(dataI());
case -29: return Detail::shifted<-29>(dataI());
case -30: return Detail::shifted<-30>(dataI());
case -31: return Detail::shifted<-31>(dataI());
}
return Zero();
}
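// Note (added for exposition): shifted(amount) moves the mask entries by amount
// positions, internally by amount * sizeof(VectorEntryType) bytes, shifting in
// false entries; any byte offset outside [-31, 31] falls through the switch and
// yields Zero().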
// }}}1
/*
template<> Vc_INTRINSIC AVX2::Mask< 4, 32> &AVX2::Mask< 4, 32>::operator=(const std::array<bool, 4> &values) {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
unsigned int x = *reinterpret_cast<const unsigned int *>(values.data());
x *= 0xffu;
__m128i y = _mm_cvtsi32_si128(x); // 4 Bytes
y = _mm_unpacklo_epi8(y, y); // 8 Bytes
y = _mm_unpacklo_epi16(y, y); // 16 Bytes
d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(y, y), _mm_unpackhi_epi32(y, y)));
return *this;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 32> &AVX2::Mask< 8, 32>::operator=(const std::array<bool, 8> &values) {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
unsigned long long x = *reinterpret_cast<const unsigned long long *>(values.data());
x *= 0xffull;
__m128i y = _mm_cvtsi64_si128(x); // 8 Bytes
y = _mm_unpacklo_epi8(y, y); // 16 Bytes
d.v() = AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(y, y), _mm_unpackhi_epi16(y, y)));
return *this;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 16> &AVX2::Mask< 8, 16>::operator=(const std::array<bool, 8> &values) {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
unsigned long long x = *reinterpret_cast<const unsigned long long *>(values.data());
x *= 0xffull;
__m128i y = _mm_cvtsi64_si128(x); // 8 Bytes
d.v() = AVX::avx_cast<__m128>(_mm_unpacklo_epi8(y, y));
return *this;
}
template<> Vc_INTRINSIC AVX2::Mask<16, 16> &AVX2::Mask<16, 16>::operator=(const std::array<bool, 16> &values) {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128i x = _mm_loadu_si128(reinterpret_cast<const __m128i *>(values.data()));
d.v() = _mm_andnot_ps(AVX::_mm_setallone_ps(), AVX::avx_cast<__m128>(_mm_sub_epi8(x, _mm_set1_epi8(1))));
return *this;
}
template<> Vc_INTRINSIC AVX2::Mask< 4, 32>::operator std::array<bool, 4>() const {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 64bit -> 32bit
x = _mm_packs_epi32(x, x); // 32bit -> 16bit
x = _mm_srli_epi16(x, 15);
x = _mm_packs_epi16(x, x); // 16bit -> 8bit
std::array<bool, 4> r;
asm volatile("vmovd %1,%0" : "=m"(*r.data()) : "x"(x));
return r;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 32>::operator std::array<bool, 8>() const {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128i x = _mm_packs_epi32(AVX::lo128(dataI()), AVX::hi128(dataI())); // 32bit -> 16bit
x = _mm_srli_epi16(x, 15);
x = _mm_packs_epi16(x, x); // 16bit -> 8bit
std::array<bool, 8> r;
asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x));
return r;
}
template<> Vc_INTRINSIC AVX2::Mask< 8, 16>::operator std::array<bool, 8>() const {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128i x = _mm_srli_epi16(dataI(), 15);
x = _mm_packs_epi16(x, x); // 16bit -> 8bit
std::array<bool, 8> r;
asm volatile("vmovq %1,%0" : "=m"(*r.data()) : "x"(x));
return r;
}
template<> Vc_INTRINSIC AVX2::Mask<16, 16>::operator std::array<bool, 16>() const {
static_assert(sizeof(bool) == 1, "Vc expects bool to have a sizeof 1 Byte");
__m128 x = _mm_and_ps(d.v(), AVX::avx_cast<__m128>(_mm_set1_epi32(0x01010101)));
std::array<bool, 16> r;
asm volatile("vmovups %1,%0" : "=m"(*r.data()) : "x"(x));
return r;
}
*/
}
// vim: foldmethod=marker

321
Vc/avx/math.h Normal file
View File

@ -0,0 +1,321 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_MATH_H_
#define VC_AVX_MATH_H_
#include "const.h"
#include "limits.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
// min & max {{{1
#ifdef Vc_IMPL_AVX2
Vc_ALWAYS_INLINE AVX2::int_v min(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_min_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v min(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v min(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_min_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::int_v max(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_max_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v max(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v max(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_max_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); }
#endif
Vc_ALWAYS_INLINE AVX2::float_v min(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_min_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::float_v max(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_max_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v max(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_max_pd(x.data(), y.data()); }
// sqrt {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> sqrt(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::sqrt(x.data());
}
// rsqrt {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> rsqrt(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::rsqrt(x.data());
}
// reciprocal {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> reciprocal(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::reciprocal(x.data());
}
// round {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> round(const AVX2::Vector<T> &x)
{
return AVX::VectorHelper<T>::round(x.data());
}
// abs {{{1
Vc_INTRINSIC Vc_CONST AVX2::double_v abs(AVX2::double_v x)
{
return Detail::and_(x.data(), AVX::setabsmask_pd());
}
Vc_INTRINSIC Vc_CONST AVX2::float_v abs(AVX2::float_v x)
{
return Detail::and_(x.data(), AVX::setabsmask_ps());
}
#ifdef Vc_IMPL_AVX2
Vc_INTRINSIC Vc_CONST AVX2::int_v abs(AVX2::int_v x)
{
return _mm256_abs_epi32(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::short_v abs(AVX2::short_v x)
{
return _mm256_abs_epi16(x.data());
}
#endif
// isfinite {{{1
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isfinite(const AVX2::double_v &x)
{
return AVX::cmpord_pd(x.data(), _mm256_mul_pd(Detail::zero<__m256d>(), x.data()));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isfinite(const AVX2::float_v &x)
{
return AVX::cmpord_ps(x.data(), _mm256_mul_ps(Detail::zero<__m256>(), x.data()));
}
// isinf {{{1
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isinf(const AVX2::double_v &x)
{
return _mm256_castsi256_pd(AVX::cmpeq_epi64(
_mm256_castpd_si256(abs(x).data()),
_mm256_castpd_si256(Detail::avx_broadcast(AVX::c_log<double>::d(1)))));
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isinf(const AVX2::float_v &x)
{
return _mm256_castsi256_ps(
AVX::cmpeq_epi32(_mm256_castps_si256(abs(x).data()),
_mm256_castps_si256(Detail::avx_broadcast(AVX::c_log<float>::d(1)))));
}
// isnan {{{1
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isnan(const AVX2::double_v &x)
{
return AVX::cmpunord_pd(x.data(), x.data());
}
Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isnan(const AVX2::float_v &x)
{
return AVX::cmpunord_ps(x.data(), x.data());
}
// copysign {{{1
Vc_INTRINSIC Vc_CONST AVX2::float_v copysign(AVX2::float_v mag, AVX2::float_v sign)
{
return _mm256_or_ps(_mm256_and_ps(sign.data(), AVX::setsignmask_ps()),
_mm256_and_ps(mag.data(), AVX::setabsmask_ps()));
}
Vc_INTRINSIC Vc_CONST AVX2::double_v copysign(AVX2::double_v::AsArg mag,
AVX2::double_v::AsArg sign)
{
return _mm256_or_pd(_mm256_and_pd(sign.data(), AVX::setsignmask_pd()),
_mm256_and_pd(mag.data(), AVX::setabsmask_pd()));
}
//}}}1
// frexp {{{1
/**
* Splits \p v into exponent and mantissa; the sign is kept with the mantissa.
*
* The return value will be in the range [0.5, 1.0[.
* The \p e value will be an integer defining the power-of-two exponent.
*/
inline AVX2::double_v frexp(AVX2::double_v::AsArg v, SimdArray<int, 4> *e)
{
const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
const __m256d exponentPart = _mm256_and_pd(v.data(), exponentBits);
auto lo = AVX::avx_cast<__m128i>(AVX::lo128(exponentPart));
auto hi = AVX::avx_cast<__m128i>(AVX::hi128(exponentPart));
lo = _mm_sub_epi32(_mm_srli_epi64(lo, 52), _mm_set1_epi64x(0x3fe));
hi = _mm_sub_epi32(_mm_srli_epi64(hi, 52), _mm_set1_epi64x(0x3fe));
SSE::int_v exponent = Mem::shuffle<X0, X2, Y0, Y2>(lo, hi);
const __m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits);
AVX2::double_v ret =
_mm256_and_pd(exponentMaximized,
_mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask)));
const double_m zeroMask = v == AVX2::double_v::Zero();
ret(isnan(v) || !isfinite(v) || zeroMask) = v;
exponent.setZero(simd_cast<SSE::int_m>(zeroMask));
internal_data(*e) = exponent;
return ret;
}
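// Illustrative example (a sketch, not taken from the Vc sources) of the contract
// documented above:
//   Vc::SimdArray<int, 4> e;
//   AVX2::double_v m = frexp(AVX2::double_v(8.), &e);
//   // every entry of m is 0.5 and every entry of e is 4, since 0.5 * 2^4 == 8.0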
#ifdef Vc_IMPL_AVX2
inline SimdArray<double, 8> frexp(const SimdArray<double, 8> &v, SimdArray<int, 8> *e)
{
const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
const __m256d w[2] = {internal_data(internal_data0(v)).data(),
internal_data(internal_data1(v)).data()};
const __m256i exponentPart[2] = {
_mm256_castpd_si256(_mm256_and_pd(w[0], exponentBits)),
_mm256_castpd_si256(_mm256_and_pd(w[1], exponentBits))};
const __m256i lo = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[0], 52),
_mm256_set1_epi32(0x3fe)); // 0.1. 2.3.
const __m256i hi = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[1], 52),
_mm256_set1_epi32(0x3fe)); // 4.5. 6.7.
const __m256i a = _mm256_unpacklo_epi32(lo, hi); // 04.. 26..
const __m256i b = _mm256_unpackhi_epi32(lo, hi); // 15.. 37..
const __m256i tmp = _mm256_unpacklo_epi32(a, b); // 0145 2367
const __m256i exponent =
AVX::concat(_mm_unpacklo_epi64(AVX::lo128(tmp), AVX::hi128(tmp)),
_mm_unpackhi_epi64(AVX::lo128(tmp), AVX::hi128(tmp))); // 0123 4567
const __m256d exponentMaximized[2] = {_mm256_or_pd(w[0], exponentBits),
_mm256_or_pd(w[1], exponentBits)};
const auto frexpMask =
_mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask));
fixed_size_simd<double, 8> ret = {
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[0], frexpMask))),
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[1], frexpMask)))};
const auto zeroMask = v == v.Zero();
ret(isnan(v) || !isfinite(v) || zeroMask) = v;
internal_data(*e) =
Detail::andnot_(simd_cast<AVX2::int_m>(zeroMask).dataI(), exponent);
return ret;
}
#endif // Vc_IMPL_AVX2
namespace Detail
{
Vc_INTRINSIC AVX2::float_v::IndexType extractExponent(__m256 e)
{
SimdArray<uint, float_v::Size> exponentPart;
const auto ee = AVX::avx_cast<__m256i>(e);
#ifdef Vc_IMPL_AVX2
exponentPart = AVX2::uint_v(ee);
#else
internal_data(internal_data0(exponentPart)) = AVX::lo128(ee);
internal_data(internal_data1(exponentPart)) = AVX::hi128(ee);
#endif
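// The biased 8-bit exponent field occupies bits 23..30 of an IEEE 754 binary32
// value; subtracting 0x7e (126) rather than the bias 127 yields E + 1, matching
// frexp's convention of a mantissa in [0.5, 1.0[.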
return (exponentPart >> 23) - 0x7e;
}
} // namespace Detail
inline AVX2::float_v frexp(AVX2::float_v::AsArg v, SimdArray<int, 8> *e)
{
using namespace Detail;
using namespace AVX2;
const __m256 exponentBits = Const<float>::exponentMask().data();
*e = extractExponent(and_(v.data(), exponentBits));
const __m256 exponentMaximized = or_(v.data(), exponentBits);
AVX2::float_v ret = _mm256_and_ps(exponentMaximized, avx_cast<__m256>(set1_epi32(0xbf7fffffu)));
ret(isnan(v) || !isfinite(v) || v == AVX2::float_v::Zero()) = v;
e->setZero(simd_cast<decltype(*e == *e)>(v == AVX2::float_v::Zero()));
return ret;
}
// ldexp {{{1
/* -> x * 2^e
* x == NaN -> NaN
* x == (-)inf -> (-)inf
*/
inline AVX2::double_v ldexp(AVX2::double_v::AsArg v, const SimdArray<int, 4> &_e)
{
SSE::int_v e = internal_data(_e);
e.setZero(simd_cast<SSE::int_m>(v == AVX2::double_v::Zero()));
const __m256i exponentBits =
AVX::concat(_mm_slli_epi64(_mm_unpacklo_epi32(e.data(), e.data()), 52),
_mm_slli_epi64(_mm_unpackhi_epi32(e.data(), e.data()), 52));
return AVX::avx_cast<__m256d>(
AVX::add_epi64(AVX::avx_cast<__m256i>(v.data()), exponentBits));
}
inline AVX2::float_v ldexp(AVX2::float_v::AsArg v, SimdArray<int, 8> e)
{
e.setZero(simd_cast<decltype(e == e)>(v == AVX2::float_v::Zero()));
e <<= 23;
#ifdef Vc_IMPL_AVX2
return {AVX::avx_cast<__m256>(
AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
AVX::lo128(internal_data(e).data())),
_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
AVX::hi128(internal_data(e).data()))))};
#else
return {AVX::avx_cast<__m256>(
AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
internal_data(internal_data0(e)).data()),
_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
internal_data(internal_data1(e)).data())))};
#endif
}
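// Illustrative example (a sketch, not taken from the Vc sources) of ldexp's
// contract x * 2^e:
//   const Vc::SimdArray<int, 8> e(3);
//   AVX2::float_v r = ldexp(AVX2::float_v(1.5f), e);
//   // every entry of r is 12.f, i.e. 1.5 * 2^3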
// trunc {{{1
Vc_ALWAYS_INLINE AVX2::float_v trunc(AVX2::float_v::AsArg v)
{
return _mm256_round_ps(v.data(), 0x3);
}
Vc_ALWAYS_INLINE AVX2::double_v trunc(AVX2::double_v::AsArg v)
{
return _mm256_round_pd(v.data(), 0x3);
}
// floor {{{1
Vc_ALWAYS_INLINE AVX2::float_v floor(AVX2::float_v::AsArg v)
{
return _mm256_floor_ps(v.data());
}
Vc_ALWAYS_INLINE AVX2::double_v floor(AVX2::double_v::AsArg v)
{
return _mm256_floor_pd(v.data());
}
// ceil {{{1
Vc_ALWAYS_INLINE AVX2::float_v ceil(AVX2::float_v::AsArg v)
{
return _mm256_ceil_ps(v.data());
}
Vc_ALWAYS_INLINE AVX2::double_v ceil(AVX2::double_v::AsArg v)
{
return _mm256_ceil_pd(v.data());
}
// fma {{{1
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx> fma(Vector<T, VectorAbi::Avx> a,
Vector<T, VectorAbi::Avx> b,
Vector<T, VectorAbi::Avx> c)
{
return Detail::fma(a.data(), b.data(), c.data(), T());
}
// }}}1
} // namespace Vc
#endif // VC_AVX_MATH_H_
// vim: foldmethod=marker

308
Vc/avx/shuffle.h Normal file
View File

@ -0,0 +1,308 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_SHUFFLE_H_
#define VC_AVX_SHUFFLE_H_
#include "../sse/shuffle.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <int... Dst> struct Permutation {};
template <uint8_t... Sel> struct Mask {};
#ifdef Vc_IMPL_AVX2
template <uint8_t Sel0, uint8_t Sel1, uint8_t Sel2, uint8_t Sel3, uint8_t Sel4,
uint8_t Sel5, uint8_t Sel6, uint8_t Sel7, uint8_t Sel8, uint8_t Sel9,
uint8_t Sel10, uint8_t Sel11, uint8_t Sel12, uint8_t Sel13, uint8_t Sel14,
uint8_t Sel15>
Vc_INTRINSIC Vc_CONST __m256i
blend(__m256i a, __m256i b, Mask<Sel0, Sel1, Sel2, Sel3, Sel4, Sel5, Sel6, Sel7, Sel8,
Sel9, Sel10, Sel11, Sel12, Sel13, Sel14, Sel15>)
{
static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) &&
(Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) &&
(Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) &&
(Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) &&
(Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) &&
(Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) &&
(Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) &&
(Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1),
"Selectors must be 0 or 1 to select the value from a or b");
constexpr uint8_t mask = static_cast<uint8_t>(
(Sel0 << 0 ) | (Sel1 << 1 ) | (Sel2 << 2 ) | (Sel3 << 3 ) |
(Sel4 << 4 ) | (Sel5 << 5 ) | (Sel6 << 6 ) | (Sel7 << 7 ) |
(Sel8 << 8 ) | (Sel9 << 9 ) | (Sel10 << 10) | (Sel11 << 11) |
(Sel12 << 12) | (Sel13 << 13) | (Sel14 << 14) | (Sel15 << 15));
return _mm256_blend_epi16(a, b, mask);
}
#endif // Vc_IMPL_AVX2
} // namespace Detail
namespace Mem
{
#ifdef Vc_IMPL_AVX2
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) {
static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
}
#endif // Vc_IMPL_AVX2
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) {
static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
return _mm256_permute2f128_ps(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) {
static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
return _mm256_permute2f128_pd(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) {
static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
return _mm256_permute2x128_si256(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#else
return _mm256_permute2f128_si256(
x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#endif
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
return _mm256_permute2x128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
return _mm256_permute2f128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) {
return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
}
#ifdef Vc_IMPL_AVX2
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
#endif // Vc_IMPL_AVX2
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) {
static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
return _mm256_blend_ps(x, y,
(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
(Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
(Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
(Dst6 / Y6) * 64 + (Dst7 / Y7) *128
);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) {
return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range");
static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range");
static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range");
static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range");
static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range");
static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range");
static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range");
static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range");
if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
return permute<Dst0, Dst1, Dst2, Dst3>(x);
}
const __m128 loIn = _mm256_castps256_ps128(x);
const __m128 hiIn = _mm256_extractf128_ps(x, 1);
__m128 lo, hi;
if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
} else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
} else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
} else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
} else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
lo = _mm_unpacklo_ps(loIn, hiIn);
} else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
lo = _mm_unpacklo_ps(hiIn, loIn);
} else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
lo = _mm_unpackhi_ps(loIn, hiIn);
} else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
lo = _mm_unpackhi_ps(hiIn, loIn);
} else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
}
if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
} else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
} else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
} else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
} else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
hi = _mm_unpacklo_ps(loIn, hiIn);
} else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
hi = _mm_unpacklo_ps(hiIn, loIn);
} else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
hi = _mm_unpackhi_ps(loIn, hiIn);
} else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
hi = _mm_unpackhi_ps(hiIn, loIn);
} else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
}
return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
}
} // namespace Mem
} // namespace Vc
// Little endian has the low bits on the right and the high bits on the left.
// With vectors this becomes greatly confusing:
// Mem: abcd
// Reg: dcba
//
// The shuffles and permutes above use memory ordering. The ones below use register ordering:
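// Illustrative example (added for exposition): the identity permutation of a
// __m256 is spelled Mem::permute<X0, X1, X2, X3>(x) in memory ordering but
// Reg::permute<X3, X2, X1, X0>(x) in register ordering; both expand to the same
// immediate, the template arguments are merely listed from the highest lane down.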
namespace Vc_VERSIONED_NAMESPACE
{
namespace Reg
{
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
return _mm256_permute2x128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
return _mm256_permute2f128_si256(
x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) {
static_assert(L >= X0 && H >= X0, "Incorrect_Range");
static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) {
static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
return _mm_permute_pd(x, Dst0 + Dst1 * 2);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
} // namespace Reg
} // namespace Vc
#endif // VC_AVX_SHUFFLE_H_

2724
Vc/avx/simd_cast.h Normal file

File diff suppressed because it is too large
Load Diff

55
Vc/avx/simd_cast_caller.tcc Normal file
View File

@ -0,0 +1,55 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef Vc_AVX_SIMD_CAST_CALLER_TCC_
#define Vc_AVX_SIMD_CAST_CALLER_TCC_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
#if Vc_IS_VERSION_1
template <typename T>
template <typename U, typename>
Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(U &&x)
: d(simd_cast<Vector>(std::forward<U>(x)).data())
{
}
template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Avx>::Mask(U &&rhs,
Common::enable_if_mask_converts_explicitly<T, U>)
: Mask(simd_cast<Mask>(std::forward<U>(rhs)))
{
}
#endif // Vc_IS_VERSION_1
}
#endif // Vc_AVX_SIMD_CAST_CALLER_TCC_
// vim: foldmethod=marker

120
Vc/avx/types.h Normal file
View File

@ -0,0 +1,120 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_TYPES_H_
#define VC_AVX_TYPES_H_
#include "../sse/types.h"
#include "../traits/type_traits.h"
#include "macros.h"
#ifdef Vc_DEFAULT_IMPL_AVX2
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 8
#define Vc_UINT_V_SIZE 8
#define Vc_SHORT_V_SIZE 16
#define Vc_USHORT_V_SIZE 16
#elif defined Vc_DEFAULT_IMPL_AVX
#define Vc_DOUBLE_V_SIZE 4
#define Vc_FLOAT_V_SIZE 8
#define Vc_INT_V_SIZE 4
#define Vc_UINT_V_SIZE 4
#define Vc_SHORT_V_SIZE 8
#define Vc_USHORT_V_SIZE 8
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx1Abi<T>>;
typedef Vector<double> double_v;
typedef Vector<float> float_v;
typedef Vector<int> int_v;
typedef Vector<unsigned int> uint_v;
typedef Vector<short> short_v;
typedef Vector<unsigned short> ushort_v;
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx1Abi<T>>;
typedef Mask<double> double_m;
typedef Mask<float> float_m;
typedef Mask<int> int_m;
typedef Mask<unsigned int> uint_m;
typedef Mask<short> short_m;
typedef Mask<unsigned short> ushort_m;
template <typename T> struct Const;
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
} // namespace AVX
namespace AVX2
{
template <typename T> using Vector = Vc::Vector<T, VectorAbi::Avx>;
using double_v = Vector<double>;
using float_v = Vector< float>;
using int_v = Vector< int>;
using uint_v = Vector< uint>;
using short_v = Vector< short>;
using ushort_v = Vector<ushort>;
template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx>;
using double_m = Mask<double>;
using float_m = Mask< float>;
using llong_m = Mask< llong>;
using ullong_m = Mask<ullong>;
using long_m = Mask< long>;
using ulong_m = Mask< ulong>;
using int_m = Mask< int>;
using uint_m = Mask< uint>;
using short_m = Mask< short>;
using ushort_m = Mask<ushort>;
using schar_m = Mask< schar>;
using uchar_m = Mask< uchar>;
template <typename T> struct is_vector : public std::false_type {};
template <typename T> struct is_vector<Vector<T>> : public std::true_type {};
template <typename T> struct is_mask : public std::false_type {};
template <typename T> struct is_mask<Mask<T>> : public std::true_type {};
} // namespace AVX2
namespace Traits
{
template <class T>
struct is_simd_vector_internal<Vector<T, VectorAbi::Avx>>
: public is_valid_vector_argument<T> {};
template<typename T> struct is_simd_mask_internal<Mask<T, VectorAbi::Avx>>
: public std::true_type {};
} // namespace Traits
} // namespace Vc
#endif // VC_AVX_TYPES_H_

545
Vc/avx/vector.h Normal file
View File

@ -0,0 +1,545 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_VECTOR_H_
#define VC_AVX_VECTOR_H_
#include "intrinsics.h"
#include "casts.h"
#include "../sse/vector.h"
#include "shuffle.h"
#include "vectorhelper.h"
#include "mask.h"
#include <algorithm>
#include <cmath>
#include "../common/aliasingentryhelper.h"
#include "../common/memoryfwd.h"
#include "../common/where.h"
#include "macros.h"
#ifdef isfinite
#undef isfinite
#endif
#ifdef isnan
#undef isnan
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename T, typename Abi> struct VectorTraits
{
using mask_type = Vc::Mask<T, Abi>;
using vector_type = Vc::Vector<T, Abi>;
using writemasked_vector_type = Common::WriteMaskedVector<vector_type, mask_type>;
using intrinsic_type = typename AVX::VectorTypeHelper<T>::Type;
};
} // namespace Detail
#define Vc_CURRENT_CLASS_NAME Vector
template <typename T> class Vector<T, VectorAbi::Avx>
{
public:
using abi = VectorAbi::Avx;
private:
using traits_type = Detail::VectorTraits<T, abi>;
static_assert(
std::is_arithmetic<T>::value,
"Vector<T> only accepts arithmetic builtin types as template parameter T.");
using WriteMaskedVector = typename traits_type::writemasked_vector_type;
public:
using VectorType = typename traits_type::intrinsic_type;
using vector_type = VectorType;
using mask_type = typename traits_type::mask_type;
using Mask = mask_type;
using MaskType = mask_type;
using MaskArg Vc_DEPRECATED_ALIAS("Use MaskArgument instead.") = typename Mask::AsArg;
using MaskArgument = typename Mask::AsArg;
using reference = Detail::ElementReference<Vector>;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(VectorType));
using EntryType = T;
using value_type = EntryType;
typedef EntryType VectorEntryType;
static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
static constexpr size_t MemoryAlignment = alignof(VectorType);
using IndexType = fixed_size_simd<int, Size>;
using index_type = IndexType;
typedef Vector<T, abi> AsArg;
typedef VectorType VectorTypeArg;
protected:
template <typename U> using V = Vector<U, abi>;
// helper that specializes on VectorType
typedef AVX::VectorHelper<VectorType> HV;
// helper that specializes on T
typedef AVX::VectorHelper<T> HT;
// cast any m256/m128 to VectorType
template <typename V> static Vc_INTRINSIC VectorType _cast(V v)
{
return AVX::avx_cast<VectorType>(v);
}
typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
StorageType d;
using WidthT = Common::WidthT<VectorType>;
// ICC can't compile this:
// static constexpr WidthT Width = WidthT();
public:
#include "../common/generalinterface.h"
static Vc_ALWAYS_INLINE_L Vector Random() Vc_ALWAYS_INLINE_R;
///////////////////////////////////////////////////////////////////////////////////////////
// internal: required to enable returning objects of VectorType
Vc_ALWAYS_INLINE Vector(VectorTypeArg x) : d(x) {}
// implicit conversion from compatible Vector<U, abi>
template <typename U>
Vc_INTRINSIC Vector(
V<U> x, typename std::enable_if<Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: d(AVX::convert<U, T>(x.data()))
{
}
#if Vc_IS_VERSION_1
// static_cast from the remaining Vector<U, abi>
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") Vc_INTRINSIC explicit Vector(
V<U> x,
typename std::enable_if<!Traits::is_implicit_cast_allowed<U, T>::value,
void *>::type = nullptr)
: d(Detail::zeroExtendIfNeeded(AVX::convert<U, T>(x.data())))
{
}
// static_cast from other types, implemented via the non-member simd_cast function in
// simd_cast_caller.tcc
template <typename U,
typename = enable_if<Traits::is_simd_vector<U>::value &&
!std::is_same<Vector, Traits::decay<U>>::value>>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") Vc_INTRINSIC_L
explicit Vector(U &&x) Vc_INTRINSIC_R;
#endif
Vc_INTRINSIC explicit Vector(reference a) : Vector(static_cast<EntryType>(a)) {}
///////////////////////////////////////////////////////////////////////////////////////////
// broadcast
Vc_INTRINSIC Vector(EntryType a) : d(Detail::avx_broadcast(a)) {}
template <typename U>
Vc_INTRINSIC Vector(U a,
typename std::enable_if<std::is_same<U, int>::value &&
!std::is_same<U, EntryType>::value,
void *>::type = nullptr)
: Vector(static_cast<EntryType>(a))
{
}
//template<typename U>
explicit Vector(std::initializer_list<EntryType>)
{
static_assert(std::is_same<EntryType, void>::value,
"A SIMD vector object cannot be initialized from an initializer list "
"because the number of entries in the vector is target-dependent.");
}
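// Illustrative example (a sketch, not taken from the Vc sources): a
// target-independent per-entry initialization can use the static generate()
// factory declared below instead of an initializer list, e.g.
//   const Vector v = Vector::generate([](int i) { return i + 1; });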
#include "../common/loadinterface.h"
#include "../common/storeinterface.h"
///////////////////////////////////////////////////////////////////////////////////////////
// zeroing
Vc_INTRINSIC_L void setZero() Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setZero(const Mask &k) Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setZeroInverted(const Mask &k) Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setQnan() Vc_INTRINSIC_R;
Vc_INTRINSIC_L void setQnan(MaskArgument k) Vc_INTRINSIC_R;
#include "../common/gatherinterface.h"
#include "../common/scatterinterface.h"
#if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
// skip this code for MSVC because it fails to do overload resolution correctly
////////////////////////////////////////////////////////////////////////////////
// non-converting pd, ps, and epi32 gathers
template <class U, class A, int Scale, int N = Vector<U, A>::size(),
class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<T, Vector<U, A>, Scale> &args)
{
d.v() = AVX::gather<sizeof(T) * Scale>(
args.address,
simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
.data());
}
// masked overload
template <class U, class A, int Scale, int N = Vector<U, A>::size(),
class = enable_if<(Vector<U, A>::size() >= size() && sizeof(T) >= 4)>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<T, Vector<U, A>, Scale> &args, MaskArgument k)
{
d.v() = AVX::gather<sizeof(T) * Scale>(
d.v(), k.data(), args.address,
simd_cast<conditional_t<Size == 4, SSE::int_v, AVX2::int_v>>(args.indexes)
.data());
}
////////////////////////////////////////////////////////////////////////////////
// converting (from 8-bit and 16-bit integers only) epi16 gather emulation via
// epi32 gathers
template <
class MT, class U, class A, int Scale,
class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
{
using AVX2::int_v;
const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
*this = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx0)),
int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx1)));
if (sizeof(MT) == 1) {
if (std::is_signed<MT>::value) {
using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
*this = (simd_cast<Signed>(*this) << 8) >> 8; // sign extend
} else {
*this &= 0xff;
}
}
}
// masked overload
template <
class MT, class U, class A, int Scale,
class = enable_if<(sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A>::size() >= size())>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A>, Scale> &args, MaskArgument k)
{
using AVX2::int_v;
const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
const auto k0 = simd_cast<AVX2::int_m, 0>(k).data();
const auto k1 = simd_cast<AVX2::int_m, 1>(k).data();
auto v = simd_cast<Vector>(
int_v(AVX::gather<sizeof(MT) * Scale>(
_mm256_setzero_si256(), k0, aliasing_cast<int>(args.address), idx0)),
int_v(AVX::gather<sizeof(MT) * Scale>(
_mm256_setzero_si256(), k1, aliasing_cast<int>(args.address), idx1)));
if (sizeof(MT) == 1) {
if (std::is_signed<MT>::value) {
using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
v = (simd_cast<Signed>(v) << 8) >> 8; // sign extend
} else {
v &= 0xff;
}
}
assign(v, k);
}
////////////////////////////////////////////////////////////////////////////////
// all remaining converting gathers
template <class MT, class U, class A, int Scale>
Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
Traits::is_valid_vector_argument<MT>::value &&
!std::is_same<MT, T>::value &&
Vector<U, A>::size() >= size()),
void>
gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args)
{
*this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
}
// masked overload
template <class MT, class U, class A, int Scale>
Vc_INTRINSIC enable_if<((sizeof(T) != 2 || sizeof(MT) > 2) &&
Traits::is_valid_vector_argument<MT>::value &&
!std::is_same<MT, T>::value &&
Vector<U, A>::size() >= size()),
void>
gatherImplementation(const Common::GatherArguments<MT, Vector<U, A>, Scale> &args,
MaskArgument k)
{
assign(simd_cast<Vector>(fixed_size_simd<MT, Size>(args, k)), k);
}
#endif // Vc_IMPL_AVX2 && !MSVC
///////////////////////////////////////////////////////////////////////////////////////////
//prefix
Vc_ALWAYS_INLINE Vector &operator++() { data() = Detail::add(data(), Detail::one(T()), T()); return *this; }
Vc_ALWAYS_INLINE Vector &operator--() { data() = Detail::sub(data(), Detail::one(T()), T()); return *this; }
//postfix
Vc_ALWAYS_INLINE Vector operator++(int) { const Vector r = *this; data() = Detail::add(data(), Detail::one(T()), T()); return r; }
Vc_ALWAYS_INLINE Vector operator--(int) { const Vector r = *this; data() = Detail::sub(data(), Detail::one(T()), T()); return r; }
private:
friend reference;
Vc_INTRINSIC static value_type get(const Vector &o, int i) noexcept
{
return o.d.m(i);
}
template <typename U>
Vc_INTRINSIC static void set(Vector &o, int i, U &&v) noexcept(
noexcept(std::declval<value_type &>() = v))
{
return o.d.set(i, v);
}
public:
/**
* \note the returned object models the concept of a reference and
* as such it can exist longer than the data it is referencing.
* \note to avoid lifetime issues, we strongly advise not to store
* any reference objects.
*/
Vc_ALWAYS_INLINE reference operator[](size_t index) noexcept
{
static_assert(noexcept(reference{std::declval<Vector &>(), int()}), "");
return {*this, int(index)};
}
Vc_ALWAYS_INLINE value_type operator[](size_t index) const noexcept
{
return d.m(index);
}
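// Illustrative example (a sketch, not taken from the Vc sources) of the advice
// given above:
//   AVX2::float_v v;
//   v[0] = 1.f;    // fine, the temporary reference object is used immediately
//   auto r = v[0]; // discouraged, r is a reference object that can outlive the data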
Vc_INTRINSIC_L Vc_PURE_L Vector operator[](Permutation::ReversedTag) const Vc_INTRINSIC_R Vc_PURE_R;
Vc_INTRINSIC_L Vc_PURE_L Vector operator[](const IndexType &perm) const Vc_INTRINSIC_R Vc_PURE_R;
Vc_INTRINSIC Vc_PURE Mask operator!() const
{
return *this == Zero();
}
Vc_ALWAYS_INLINE Vector operator~() const
{
#ifndef Vc_ENABLE_FLOAT_BIT_OPERATORS
static_assert(std::is_integral<T>::value,
"bit-complement can only be used with Vectors of integral type");
#endif
return Detail::andnot_(data(), Detail::allone<VectorType>());
}
Vc_ALWAYS_INLINE_L Vc_PURE_L Vector operator-() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
Vc_INTRINSIC Vc_PURE Vector operator+() const { return *this; }
// shifts
#define Vc_OP_VEC(op) \
Vc_INTRINSIC Vector &operator op##=(AsArg x); \
Vc_INTRINSIC Vc_PURE Vector operator op(AsArg x) const \
{ \
static_assert( \
std::is_integral<T>::value, \
"bitwise-operators can only be used with Vectors of integral type"); \
}
Vc_ALL_SHIFTS(Vc_OP_VEC);
#undef Vc_OP_VEC
Vc_ALWAYS_INLINE_L Vector &operator>>=(int x) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L Vector &operator<<=(int x) Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L Vector operator>>(int x) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L Vector operator<<(int x) const Vc_ALWAYS_INLINE_R;
Vc_DEPRECATED("use isnegative(x) instead") Vc_INTRINSIC Vc_PURE Mask
isNegative() const
{
return Vc::isnegative(*this);
}
Vc_ALWAYS_INLINE void assign( const Vector &v, const Mask &mask ) {
data() = Detail::blend(data(), v.data(), mask.data());
}
template <typename V2>
Vc_DEPRECATED("Use simd_cast instead of Vector::staticCast") Vc_ALWAYS_INLINE V2
staticCast() const
{
return V2(*this);
}
template <typename V2>
Vc_DEPRECATED("use reinterpret_components_cast instead") Vc_ALWAYS_INLINE V2
reinterpretCast() const
{
return AVX::avx_cast<typename V2::VectorType>(data());
}
Vc_ALWAYS_INLINE WriteMaskedVector operator()(const Mask &k)
{
return {*this, k};
}
Vc_ALWAYS_INLINE VectorType &data() { return d.v(); }
Vc_ALWAYS_INLINE const VectorType &data() const { return d.v(); }
template<int Index>
Vc_INTRINSIC_L Vector broadcast() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L std::pair<Vector, int> minIndex() const Vc_INTRINSIC_R;
Vc_INTRINSIC_L std::pair<Vector, int> maxIndex() const Vc_INTRINSIC_R;
Vc_ALWAYS_INLINE EntryType min() const { return Detail::min(data(), T()); }
Vc_ALWAYS_INLINE EntryType max() const { return Detail::max(data(), T()); }
Vc_ALWAYS_INLINE EntryType product() const { return Detail::mul(data(), T()); }
Vc_ALWAYS_INLINE EntryType sum() const { return Detail::add(data(), T()); }
Vc_ALWAYS_INLINE_L Vector partialSum() const Vc_ALWAYS_INLINE_R;
//template<typename BinaryOperation> Vc_ALWAYS_INLINE_L Vector partialSum(BinaryOperation op) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType min(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType max(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType product(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_ALWAYS_INLINE_L EntryType sum(MaskArgument m) const Vc_ALWAYS_INLINE_R;
Vc_INTRINSIC_L Vector shifted(int amount, Vector shiftIn) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector shifted(int amount) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector rotated(int amount) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vc_PURE_L Vector reversed() const Vc_INTRINSIC_R Vc_PURE_R;
Vc_ALWAYS_INLINE_L Vc_PURE_L Vector sorted() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
template <typename F> void callWithValuesSorted(F &&f)
{
EntryType value = d.m(0);
f(value);
for (size_t i = 1; i < Size; ++i) {
if (d.m(i) != value) {
value = d.m(i);
f(value);
}
}
}
template <typename F> Vc_INTRINSIC void call(F &&f) const
{
Common::for_all_vector_entries<Size>([&](size_t i) { f(EntryType(d.m(i))); });
}
template <typename F> Vc_INTRINSIC void call(F &&f, const Mask &mask) const
{
for (size_t i : where(mask)) {
f(EntryType(d.m(i)));
}
}
template <typename F> Vc_INTRINSIC Vector apply(F &&f) const
{
Vector r;
Common::for_all_vector_entries<Size>(
[&](size_t i) { r.d.set(i, f(EntryType(d.m(i)))); });
return r;
}
template <typename F> Vc_INTRINSIC Vector apply(F &&f, const Mask &mask) const
{
Vector r(*this);
for (size_t i : where(mask)) {
r.d.set(i, f(EntryType(r.d.m(i))));
}
return r;
}
template<typename IndexT> Vc_INTRINSIC void fill(EntryType (&f)(IndexT)) {
Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f(i)); });
}
Vc_INTRINSIC void fill(EntryType (&f)()) {
Common::for_all_vector_entries<Size>([&](size_t i) { d.set(i, f()); });
}
template <typename G> static Vc_INTRINSIC_L Vector generate(G gen) Vc_INTRINSIC_R;
Vc_DEPRECATED("use copysign(x, y) instead") Vc_INTRINSIC Vector
copySign(AsArg x) const
{
return Vc::copysign(*this, x);
}
Vc_DEPRECATED("use exponent(x) instead") Vc_INTRINSIC Vector exponent() const
{
return Vc::exponent(*this);
}
Vc_INTRINSIC_L Vector interleaveLow(Vector x) const Vc_INTRINSIC_R;
Vc_INTRINSIC_L Vector interleaveHigh(Vector x) const Vc_INTRINSIC_R;
};
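// Usage sketch (illustrative; assumes an 8-lane float vector): operator()(mask)
// returns a WriteMaskedVector so that assignments touch only the selected lanes,
// and apply(f, mask) recomputes only those lanes:
//
//   AVX2::float_v v = AVX2::float_v::IndexesFromZero(); // 0 1 2 3 4 5 6 7
//   const auto m = v > AVX2::float_v(3.f);              // selects lanes 4..7
//   v(m) = 0.f;                                         // 0 1 2 3 0 0 0 0
//   v = v.apply([](float x) { return x + 1.f; }, !m);   // 1 2 3 4 0 0 0 0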
#undef Vc_CURRENT_CLASS_NAME
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::Size;
template <typename T> constexpr size_t Vector<T, VectorAbi::Avx>::MemoryAlignment;
#define Vc_CONDITIONAL_ASSIGN(name_, op_) \
template <Operator O, typename T, typename M, typename U> \
Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign( \
AVX2::Vector<T> &lhs, M &&mask, U &&rhs) \
{ \
lhs(mask) op_ rhs; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN( Assign, =);
Vc_CONDITIONAL_ASSIGN( PlusAssign, +=);
Vc_CONDITIONAL_ASSIGN( MinusAssign, -=);
Vc_CONDITIONAL_ASSIGN( MultiplyAssign, *=);
Vc_CONDITIONAL_ASSIGN( DivideAssign, /=);
Vc_CONDITIONAL_ASSIGN( RemainderAssign, %=);
Vc_CONDITIONAL_ASSIGN( XorAssign, ^=);
Vc_CONDITIONAL_ASSIGN( AndAssign, &=);
Vc_CONDITIONAL_ASSIGN( OrAssign, |=);
Vc_CONDITIONAL_ASSIGN( LeftShiftAssign,<<=);
Vc_CONDITIONAL_ASSIGN(RightShiftAssign,>>=);
#undef Vc_CONDITIONAL_ASSIGN
#define Vc_CONDITIONAL_ASSIGN(name_, expr_) \
template <Operator O, typename T, typename M> \
Vc_INTRINSIC enable_if<O == Operator::name_, AVX2::Vector<T>> conditional_assign( \
AVX2::Vector<T> &lhs, M &&mask) \
{ \
return expr_; \
} \
Vc_NOTHING_EXPECTING_SEMICOLON
Vc_CONDITIONAL_ASSIGN(PostIncrement, lhs(mask)++);
Vc_CONDITIONAL_ASSIGN( PreIncrement, ++lhs(mask));
Vc_CONDITIONAL_ASSIGN(PostDecrement, lhs(mask)--);
Vc_CONDITIONAL_ASSIGN( PreDecrement, --lhs(mask));
#undef Vc_CONDITIONAL_ASSIGN
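// Illustrative note: the conditional_assign overloads above are the hooks used
// by Vc's where() expressions, so that, roughly,
//
//   Vc::where(m) | x += 1;
//
// dispatches to conditional_assign<Operator::PlusAssign>(x, m, 1), which in
// turn is just x(m) += 1;.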
} // namespace Vc
#include "vector.tcc"
#include "simd_cast.h"
#endif // VC_AVX_VECTOR_H_

939
Vc/avx/vector.tcc Normal file
View File

@ -0,0 +1,939 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#include "../common/x86_prefetches.h"
#include "../common/gatherimplementation.h"
#include "../common/scatterimplementation.h"
#include "limits.h"
#include "const.h"
#include "../common/set.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// compare operators {{{1
Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); }
#ifdef Vc_IMPL_AVX2
Vc_INTRINSIC AVX2:: int_m operator==(AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: uint_m operator==(AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: int_m operator!=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: uint_m operator!=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: int_m operator>=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: uint_m operator>=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: int_m operator<=(AVX2:: int_v a, AVX2:: int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: uint_m operator<=(AVX2:: uint_v a, AVX2:: uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: int_m operator> (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: uint_m operator> (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: int_m operator< (AVX2:: int_v a, AVX2:: int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: uint_m operator< (AVX2:: uint_v a, AVX2:: uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }
#endif // Vc_IMPL_AVX2
// bitwise operators {{{1
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator^(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator&(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator|(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return or_(a.data(), b.data());
}
// }}}1
// arithmetic operators {{{1
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator+(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator-(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator*(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return mul(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator/(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return div(a.data(), b.data(), T());
}
Vc_INTRINSIC AVX2::Vector<ushort> operator/(AVX2::Vector<ushort> a,
AVX2::Vector<ushort> b)
{
using namespace AVX;
const __m256 lo = _mm256_div_ps(convert<ushort, float>(lo128(a.data())),
convert<ushort, float>(lo128(b.data())));
const __m256 hi = _mm256_div_ps(convert<ushort, float>(hi128(a.data())),
convert<ushort, float>(hi128(b.data())));
const float_v threshold = 32767.f;
using Detail::operator>;
const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty()))
? convert<float, ushort>(lo)
: convert<float, short>(lo);
const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty()))
? convert<float, ushort>(hi)
: convert<float, short>(hi);
return concat(loShort, hiShort);
}
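// Note on the ushort division above: the quotients are computed in float, and
// the cheap float->short conversion saturates at 32767; whenever any quotient
// exceeds that threshold, the slower float->ushort conversion path is taken so
// that large results are not clipped.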
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, AVX2::Vector<T>> operator%(
AVX2::Vector<T> a, AVX2::Vector<T> b)
{
return a - a / b * b;
}
// }}}1
} // namespace Detail
///////////////////////////////////////////////////////////////////////////////////////////
// generate {{{1
template <> template <typename G> Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
#ifdef Vc_IMPL_AVX2
template <> template <typename G> Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
const auto tmp8 = gen(8);
const auto tmp9 = gen(9);
const auto tmp10 = gen(10);
const auto tmp11 = gen(11);
const auto tmp12 = gen(12);
const auto tmp13 = gen(13);
const auto tmp14 = gen(14);
const auto tmp15 = gen(15);
return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
template <> template <typename G> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen)
{
const auto tmp0 = gen(0);
const auto tmp1 = gen(1);
const auto tmp2 = gen(2);
const auto tmp3 = gen(3);
const auto tmp4 = gen(4);
const auto tmp5 = gen(5);
const auto tmp6 = gen(6);
const auto tmp7 = gen(7);
const auto tmp8 = gen(8);
const auto tmp9 = gen(9);
const auto tmp10 = gen(10);
const auto tmp11 = gen(11);
const auto tmp12 = gen(12);
const auto tmp13 = gen(13);
const auto tmp14 = gen(14);
const auto tmp15 = gen(15);
return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
#endif
// constants {{{1
template <typename T> Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(VectorSpecialInitializerZero) : d{} {}
template <> Vc_INTRINSIC Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {}
template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC Vector< int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {}
template <> Vc_INTRINSIC Vector< uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {}
template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {}
template <> Vc_INTRINSIC Vector<ushort, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {}
template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {}
template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {}
#endif
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx>::Vector(
VectorSpecialInitializerIndexesFromZero)
: Vector(AVX::IndexesFromZeroData<T>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<float, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
: Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
: Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
// general load, implemented via LoadHelper {{{2
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Avx>::
#ifndef Vc_MSVC
template
#endif
load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Avx>::load(const SrcT *mem, Flags flags)
{
Common::handleLoadPrefetches(mem, flags);
d.v() = Detail::load<VectorType, DstT>(mem, flags);
}
///////////////////////////////////////////////////////////////////////////////////////////
// zeroing {{{1
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero()
{
data() = Detail::zero<VectorType>();
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero(const Mask &k)
{
data() = Detail::andnot_(k.data(), data());
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZeroInverted(const Mask &k)
{
data() = Detail::and_(k.data(), data());
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan()
{
data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan(MaskArgument k)
{
data() = _mm256_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan()
{
data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan(MaskArgument k)
{
data() = _mm256_or_ps(data(), k.dataF());
}
///////////////////////////////////////////////////////////////////////////////////////////
// stores {{{1
template <typename T>
template <typename U,
typename Flags,
typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Flags flags) const
{
Common::handleStorePrefetches(mem, flags);
HV::template store<Flags>(mem, data());
}
template <typename T>
template <typename U,
typename Flags,
typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Mask mask, Flags flags) const
{
Common::handleStorePrefetches(mem, flags);
HV::template store<Flags>(mem, data(), mask.data());
}
///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
#ifdef Vc_IMPL_AVX2
template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< int> Vector< int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< uint> Vector< uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(AsArg x)
{
static_assert(std::is_integral<T>::value,
"bitwise-operators can only be used with Vectors of integral type");
return *this = *this << x;
}
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(AsArg x)
{
static_assert(std::is_integral<T>::value,
"bitwise-operators can only be used with Vectors of integral type");
return *this = *this >> x;
}
#endif
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(int shift) {
d.v() = Detail::shiftRight(d.v(), shift, T());
return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator>>(int shift) const {
return Detail::shiftRight(d.v(), shift, T());
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(int shift) {
d.v() = Detail::shiftLeft(d.v(), shift, T());
return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator<<(int shift) const {
return Detail::shiftLeft(d.v(), shift, T());
}
// isnegative {{{1
Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x)
{
return AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data()))));
}
Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
{
return Mem::permute<X1, X1, X3, X3>(AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
}
// gathers {{{1
#define Vc_GATHER_IMPL(V_) \
template <> \
template <class MT, class IT, int Scale> \
inline void AVX2::V_::gatherImplementation( \
const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(float_v)
{
d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
Vc_M(7));
}
#ifdef Vc_IMPL_AVX2
Vc_GATHER_IMPL(int_v)
{
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(uint_v)
{
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(short_v)
{
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
Vc_GATHER_IMPL(ushort_v)
{
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif
#undef Vc_M
#undef Vc_GATHER_IMPL
template <class T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
const auto *mem = args.address;
const auto indexes = Scale * args.indexes;
using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
Common::GatherScatterImplementation::PopcntSwitch
#else
Common::GatherScatterImplementation::SimpleLoop
#endif
> ;
Common::executeGather(Selector(), *this, mem, indexes, mask);
}
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes) const
{
Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
Common::GatherScatterImplementation::PopcntSwitch
#else
Common::GatherScatterImplementation::SimpleLoop
#endif
> ;
Common::executeScatter(Selector(), *this, mem, std::forward<IT>(indexes), mask);
}
///////////////////////////////////////////////////////////////////////////////////////////
// operator- {{{1
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
return VectorType(-d.builtin());
}
#else
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::minIndex() const
{
AVX2::Vector<T> x = min();
return std::make_pair(x, (*this == x).firstOne());
}
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::maxIndex() const
{
AVX2::Vector<T> x = max();
return std::make_pair(x, (*this == x).firstOne());
}
template <> Vc_INTRINSIC std::pair<AVX2::float_v, int> AVX2::float_v::minIndex() const
{
/*
// 28 cycles latency:
__m256 x = _mm256_min_ps(Mem::permute128<X1, X0>(d.v()), d.v());
x = _mm256_min_ps(x, Reg::permute<X2, X3, X0, X1>(x));
AVX2::float_v xx = _mm256_min_ps(x, Reg::permute<X1, X0, X3, X2>(x));
AVX2::uint_v idx = AVX2::uint_v::IndexesFromZero();
idx = _mm256_castps_si256(
_mm256_or_ps((*this != xx).data(), _mm256_castsi256_ps(idx.data())));
return std::make_pair(xx, (*this == xx).firstOne());
__m128 loData = AVX::lo128(d.v());
__m128 hiData = AVX::hi128(d.v());
const __m128 less2 = _mm_cmplt_ps(hiData, loData);
loData = _mm_min_ps(loData, hiData);
hiData = Mem::permute<X2, X3, X0, X1>(loData);
const __m128 less1 = _mm_cmplt_ps(hiData, loData);
loData = _mm_min_ps(loData, hiData);
hiData = Mem::permute<X1, X0, X3, X2>(loData);
const __m128 less0 = _mm_cmplt_ps(hiData, loData);
unsigned bits = _mm_movemask_ps(less0) & 0x1;
bits |= ((_mm_movemask_ps(less1) << 1) - bits) & 0x2;
bits |= ((_mm_movemask_ps(less2) << 3) - bits) & 0x4;
loData = _mm_min_ps(loData, hiData);
return std::make_pair(AVX::concat(loData, loData), bits);
*/
// 28 cycles Latency:
__m256 x = d.v();
__m256 idx = Vector<float>::IndexesFromZero().data();
__m256 y = Mem::permute128<X1, X0>(x);
__m256 idy = Mem::permute128<X1, X0>(idx);
__m256 less = AVX::cmplt_ps(x, y);
x = _mm256_blendv_ps(y, x, less);
idx = _mm256_blendv_ps(idy, idx, less);
y = Reg::permute<X2, X3, X0, X1>(x);
idy = Reg::permute<X2, X3, X0, X1>(idx);
less = AVX::cmplt_ps(x, y);
x = _mm256_blendv_ps(y, x, less);
idx = _mm256_blendv_ps(idy, idx, less);
y = Reg::permute<X1, X0, X3, X2>(x);
idy = Reg::permute<X1, X0, X3, X2>(idx);
less = AVX::cmplt_ps(x, y);
idx = _mm256_blendv_ps(idy, idx, less);
const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx));
#ifdef Vc_GNU_ASM
__asm__ __volatile__(""); // help GCC to order the instructions better
#endif
x = _mm256_blendv_ps(y, x, less);
return std::make_pair(x, index);
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum() const
{
// a b c d e f g h
// + a b c d e f g -> a ab bc cd de ef fg gh
// + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh
// + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
AVX2::Vector<T> tmp = *this;
if (Size > 1) tmp += tmp.shifted(-1);
if (Size > 2) tmp += tmp.shifted(-2);
if (Size > 4) tmp += tmp.shifted(-4);
if (Size > 8) tmp += tmp.shifted(-8);
if (Size > 16) tmp += tmp.shifted(-16);
return tmp;
}
/* This function requires correct masking because the neutral element of \p op is not necessarily 0
*
template<typename T> template<typename BinaryOperation> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum(BinaryOperation op) const
{
// a b c d e f g h
// + a b c d e f g -> a ab bc cd de ef fg gh
// + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh
// + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
AVX2::Vector<T> tmp = *this;
Mask mask(true);
if (Size > 1) tmp(mask) = op(tmp, tmp.shifted(-1));
if (Size > 2) tmp(mask) = op(tmp, tmp.shifted(-2));
if (Size > 4) tmp(mask) = op(tmp, tmp.shifted(-4));
if (Size > 8) tmp(mask) = op(tmp, tmp.shifted(-8));
if (Size > 16) tmp(mask) = op(tmp, tmp.shifted(-16));
return tmp;
}
*/
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::min(MaskArgument m) const
{
AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::max();
tmp(m) = *this;
return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::max(MaskArgument m) const
{
AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::min();
tmp(m) = *this;
return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::product(MaskArgument m) const
{
AVX2::Vector<T> tmp(Vc::One);
tmp(m) = *this;
return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::sum(MaskArgument m) const
{
AVX2::Vector<T> tmp(Vc::Zero);
tmp(m) = *this;
return tmp.sum();
}//}}}
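// Illustrative example of the masked reductions above: with
// v = {1, 2, 3, 4, 5, 6, 7, 8} and a mask m selecting lanes 0, 2, 4 and 6,
// v.sum(m) == 1 + 3 + 5 + 7 == 16 and v.min(m) == 1. The unselected lanes are
// first replaced by the neutral element of the reduction (0 for sum, 1 for
// product, the largest representable value for min).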
// exponent {{{1
namespace Detail
{
Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v)
{
using namespace AVX;
__m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23);
__m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23);
tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
}
Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v)
{
using namespace AVX;
__m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52);
__m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52);
tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
}
} // namespace Detail
Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x)
{
using Detail::operator>=;
Vc_ASSERT((x >= x.Zero()).isFull());
return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x)
{
using Detail::operator>=;
Vc_ASSERT((x >= x.Zero()).isFull());
return Detail::exponent(x.data());
}
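// Worked example (illustrative): exponent(AVX2::float_v(8.f)) is 3.f in every
// lane, because 8 = 1.0 * 2^3; the helpers above extract the biased IEEE-754
// exponent field and subtract the bias (127 for float, 1023 for double).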
// }}}1
// Random {{{1
static Vc_ALWAYS_INLINE __m256i _doRandomStep()
{
using Detail::operator*;
using Detail::operator+;
#ifdef Vc_IMPL_AVX2
using AVX2::uint_v;
uint_v state0(&Common::RandomState[0]);
uint_v state1(&Common::RandomState[uint_v::Size]);
(state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm256_srli_epi32(state1.data(), 16)))
.store(&Common::RandomState[0]);
return state0.data();
#else
using SSE::uint_v;
uint_v state0(&Common::RandomState[0]);
uint_v state1(&Common::RandomState[uint_v::Size]);
uint_v state2(&Common::RandomState[2 * uint_v::Size]);
uint_v state3(&Common::RandomState[3 * uint_v::Size]);
(state2 * uint_v(0xdeece66du) + uint_v(11))
.store(&Common::RandomState[2 * uint_v::Size]);
(state3 * uint_v(0xdeece66du) + uint_v(11))
.store(&Common::RandomState[3 * uint_v::Size]);
uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm_srli_epi32(state2.data(), 16)))
.store(&Common::RandomState[0]);
uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(),
_mm_srli_epi32(state3.data(), 16)))
.store(&Common::RandomState[uint_v::Size]);
return AVX::concat(state0.data(), state1.data());
#endif
}
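// Note: the multiplier 0xdeece66d and increment 11 appear to be the rand48
// linear congruential generator constants (0x5deece66d truncated to 32 bits);
// each call advances the state words in Common::RandomState and mixes two of
// them to produce the returned random bits.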
#ifdef Vc_IMPL_AVX2
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::Random()
{
return {_doRandomStep()};
}
#endif
template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random()
{
return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()),
HT::one());
}
template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random()
{
const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned,
Detail::LoadTag<__m256i, int>());
for (size_t k = 0; k < 8; k += 2) {
typedef unsigned long long uint64 Vc_MAY_ALIAS;
const uint64 stateX = *aliasing_cast<uint64>(&Common::RandomState[k]);
*aliasing_cast<uint64>(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11);
}
return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one());
}
// }}}1
// shifted / rotated {{{1
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount) const
{
return Detail::shifted<EntryType>(d.v(), amount);
}
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>)
{
return Mem::shuffle<X2, X3, Y0, Y1>(left, right);
}
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>)
{
return Mem::shuffle128<X1, Y0>(left, right);
}
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount, Vector shiftIn) const
{
#ifdef __GNUC__
if (__builtin_constant_p(amount)) {
const __m256i a = AVX::avx_cast<__m256i>(d.v());
const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v());
if (amount * 2 == int(Size)) {
return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT());
}
if (amount * 2 == -int(Size)) {
return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT());
}
switch (amount) {
case 1:
return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
_mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
sizeof(EntryType))
#else // Vc_IMPL_AVX2
AVX::concat(
_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)),
_mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType)))
#endif // Vc_IMPL_AVX2
);
case 2:
return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
_mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
2 * sizeof(EntryType))
#else // Vc_IMPL_AVX2
AVX::concat(
_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)),
_mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType)))
#endif // Vc_IMPL_AVX2
);
case 3:
if (6u < Size) {
return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
_mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
3 * sizeof(EntryType))
#else // Vc_IMPL_AVX2
AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a),
3 * sizeof(EntryType)),
_mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a),
3 * sizeof(EntryType)))
#endif // Vc_IMPL_AVX2
);
// TODO: } else {
}
}
}
#endif
using Detail::operator|;
return shifted(amount) | (amount > 0 ?
shiftIn.shifted(amount - Size) :
shiftIn.shifted(Size + amount));
}
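// Worked example (illustrative, 8-lane integer vectors): with
// a = {0, 1, 2, 3, 4, 5, 6, 7} and b = {8, 9, 10, 11, 12, 13, 14, 15},
// a.shifted(2, b) == {2, 3, 4, 5, 6, 7, 8, 9}: entries move towards index 0 and
// the vacated high lanes are filled from the low lanes of shiftIn.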
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::rotated(int amount) const
{
return Detail::rotated<EntryType, size()>(d.v(), amount);
}
// sorted {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::sorted()
const
{
return Detail::sorted(*this);
}
// interleaveLow/-High {{{1
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const
{
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_pd(data(), x.data()),
_mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const
{
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_pd(data(), x.data()),
_mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const
{
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_ps(data(), x.data()),
_mm256_unpackhi_ps(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const
{
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_ps(data(), x.data()),
_mm256_unpackhi_ps(data(), x.data()));
}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
_mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const {
return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const {
return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
_mm256_unpackhi_epi16(data(), x.data()));
}
#endif
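// Worked example (illustrative): for float_v a = {a0, ..., a7} and
// b = {b0, ..., b7}, a.interleaveLow(b) == {a0, b0, a1, b1, a2, b2, a3, b3} and
// a.interleaveHigh(b) == {a4, b4, a5, b5, a6, b6, a7, b7}.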
// permutation via operator[] {{{1
template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X1, X0, X3, X2>(d.v()));
}
template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[](
Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[](
Permutation::ReversedTag) const
{
return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
#endif
template <> Vc_INTRINSIC AVX2::float_v Vector<float, VectorAbi::Avx>::operator[](const IndexType &/*perm*/) const
{
// TODO
return *this;
#ifdef Vc_IMPL_AVX2
#else
/*
const int_m cross128 = AVX::concat(_mm_cmpgt_epi32(AVX::lo128(perm.data()), _mm_set1_epi32(3)),
_mm_cmplt_epi32(AVX::hi128(perm.data()), _mm_set1_epi32(4)));
if (cross128.isNotEmpty()) {
AVX2::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
return x;
} else {
*/
#endif
}
// reversed {{{1
template <typename T>
Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::reversed() const
{
return (*this)[Permutation::Reversed];
}
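// Illustrative example: AVX2::float_v::IndexesFromZero().reversed(), or
// equivalently v[Permutation::Reversed], yields {7, 6, 5, 4, 3, 2, 1, 0}.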
// broadcast from constexpr index {{{1
template <> template <int Index> Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const
{
constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
constexpr VecPos Outer = static_cast<VecPos>((Index & 0x4) / 4);
return Mem::permute<Inner, Inner, Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
template <> template <int Index> Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const
{
constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
constexpr VecPos Outer = static_cast<VecPos>((Index & 0x2) / 2);
return Mem::permute<Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
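// Illustrative example: for AVX2::float_v v = {0, 1, 2, 3, 4, 5, 6, 7},
// v.broadcast<5>() replicates lane 5 into every lane, i.e.
// {5, 5, 5, 5, 5, 5, 5, 5}.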
// }}}1
} // namespace Vc
// vim: foldmethod=marker

257
Vc/avx/vectorhelper.h Normal file
View File

@ -0,0 +1,257 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_AVX_VECTORHELPER_H_
#define VC_AVX_VECTORHELPER_H_
#include <limits>
#include "types.h"
#include "intrinsics.h"
#include "casts.h"
#include "../common/loadstoreflags.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
template<> struct VectorHelper<__m256>
{
typedef __m256 VectorType;
typedef const VectorType VTArg;
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_ps(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_ps()); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
template<> struct VectorHelper<__m256d>
{
typedef __m256d VectorType;
typedef const VectorType VTArg;
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_pd(mem, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_pd()); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
template<> struct VectorHelper<__m256i>
{
typedef __m256i VectorType;
typedef const VectorType VTArg;
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};
#define Vc_OP1(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); }
#define Vc_OP_(op) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op , Vc_SUFFIX)(a, b); }
#define Vc_OPx(op, op2) \
static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); }
template<> struct VectorHelper<double> {
typedef __m256d VectorType;
typedef const VectorType VTArg;
typedef double EntryType;
#define Vc_SUFFIX pd
static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); }
static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d);
}
static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.); }
static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
v1 = _mm256_macc_pd(v1, v2, v3);
#else
VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
// GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
asm("":"+x"(h1), "+x"(h2));
#endif
const VectorType l1 = _mm256_sub_pd(v1, h1);
const VectorType l2 = _mm256_sub_pd(v2, h2);
const VectorType ll = mul(l1, l2);
const VectorType lh = add(mul(l1, h2), mul(h1, l2));
const VectorType hh = mul(h1, h2);
// ll < lh < hh for all entries is certain
const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3)); // |lh| < |v3|
const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
v1 = add(add(ll, b), add(c, hh));
#endif
}
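    // Note on the fma fallback above (when FMA4 is unavailable): it appears to
    // split each operand into a high and a low half (Dekker-style) so the
    // partial products are exact, and to add the smaller-magnitude terms first;
    // this emulates a fused multiply-add with less double-rounding error than a
    // plain mul followed by add.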
static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); }
static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); }
static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); }
Vc_OP1(sqrt)
static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
return _mm256_div_pd(one(), sqrt(x));
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
return _mm256_div_pd(one(), x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd());
}
static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
__m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
__m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
__m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
return _mm_cvtsd_f64(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
__m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
return _mm_cvtsd_f64(b);
}
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
return _mm256_round_pd(a, _MM_FROUND_NINT);
}
};
template<> struct VectorHelper<float> {
typedef float EntryType;
typedef __m256 VectorType;
typedef const VectorType VTArg;
#define Vc_SUFFIX ps
static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
const float e, const float f, const float g, const float h) {
return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.f); }
static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }
static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
v1 = _mm256_macc_ps(v1, v2, v3);
#else
__m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
__m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
__m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
__m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
__m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
__m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
v1 = AVX::concat(
_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
}
static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); }
Vc_OP1(sqrt) Vc_OP1(rsqrt)
static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
return _mm256_rcp_ps(x);
}
static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps());
}
static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); }
static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); }
static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
__m128 b = _mm_min_ps(lo128(a), hi128(a));
b = _mm_min_ps(b, _mm_movehl_ps(b, b)); // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
__m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_max_ps(b, _mm_movehl_ps(b, b)); // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
__m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(b);
}
static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
__m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
return _mm_cvtss_f32(b);
}
#undef Vc_SUFFIX
static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
return _mm256_round_ps(a, _MM_FROUND_NINT);
}
};
#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx
} // namespace AVX(2)
} // namespace Vc
#endif // VC_AVX_VECTORHELPER_H_

166
Vc/common/algorithms.h Normal file
View File

@ -0,0 +1,166 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ALGORITHMS_H_
#define VC_COMMON_ALGORITHMS_H_
#include "simdize.h"
namespace Vc_VERSIONED_NAMESPACE
{
#ifdef DOXYGEN
/**
* \ingroup Utilities
* \headerfile algorithms.h <Vc/Vc>
*
* Vc variant of the `std::for_each` algorithm.
*
* This algorithm calls \p f with one argument of type
* `Vc::Vector<` *iterator value type* `, ` *unspecified* `>` as often as is needed to
* iterate over the complete range from \p first to \p last.
* It will try to use the best vector size (VectorAbi) to work on the largest chunks
* possible.
* To support aligned loads (and stores) and to support arbitrary range distances, the
* algorithm may require the use of `Vc::VectorAbi` types that work on fewer elements in
* parallel.
*
* The following example requires C++14 for generic lambdas. If you don't have generic
* lambdas available you can use a "classic" functor type with a templated call operator
* instead.
*
* \code
* void scale(std::vector<double> &data, double factor) {
* Vc::simd_for_each(data.begin(), data.end(), [&](auto v) {
* v *= factor;
* });
* }
* \endcode
*/
template <class InputIt, class UnaryFunction>
UnaryFunction simd_for_each(InputIt first, InputIt last, UnaryFunction f);
#else
template <class InputIt, class UnaryFunction,
class ValueType = typename std::iterator_traits<InputIt>::value_type>
inline enable_if<
Traits::is_functor_argument_immutable<UnaryFunction, simdize<ValueType>>::value,
UnaryFunction>
simd_for_each(InputIt first, InputIt last, UnaryFunction f)
{
typedef simdize<ValueType> V;
typedef simdize<ValueType, 1> V1;
const auto lastV = last - V::Size + 1;
for (; first < lastV; first += V::Size) {
V tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
}
for (; first != last; ++first) {
V1 tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
}
return f;
}
template <typename InputIt, typename UnaryFunction,
class ValueType = typename std::iterator_traits<InputIt>::value_type>
inline enable_if<
!Traits::is_functor_argument_immutable<UnaryFunction, simdize<ValueType>>::value,
UnaryFunction>
simd_for_each(InputIt first, InputIt last, UnaryFunction f)
{
typedef simdize<ValueType> V;
typedef simdize<ValueType, 1> V1;
const auto lastV = last - V::size() + 1;
for (; first < lastV; first += V::size()) {
V tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
store_interleaved(tmp, std::addressof(*first));
}
for (; first != last; ++first) {
V1 tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
store_interleaved(tmp, std::addressof(*first));
}
return f;
}
#endif
///////////////////////////////////////////////////////////////////////////////
template <typename InputIt, typename UnaryFunction,
class ValueType = typename std::iterator_traits<InputIt>::value_type>
inline enable_if<
Traits::is_functor_argument_immutable<UnaryFunction, simdize<ValueType>>::value,
UnaryFunction>
simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f)
{
typename std::make_signed<size_t>::type len = count;
typedef simdize<ValueType> V;
typedef simdize<ValueType, 1> V1;
for (; len >= int(V::size()); len -= V::Size, first += V::Size) {
V tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
}
for (; len != 0; --len, ++first) {
V1 tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
}
return f;
}
template <typename InputIt, typename UnaryFunction,
class ValueType = typename std::iterator_traits<InputIt>::value_type>
inline enable_if<
!Traits::is_functor_argument_immutable<UnaryFunction, simdize<ValueType>>::value,
UnaryFunction>
simd_for_each_n(InputIt first, std::size_t count, UnaryFunction f)
{
typename std::make_signed<size_t>::type len = count;
typedef simdize<ValueType> V;
typedef simdize<ValueType, 1> V1;
for (; len >= int(V::size()); len -= V::Size, first += V::Size) {
V tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
store_interleaved(tmp, std::addressof(*first));
}
for (; len != 0; --len, ++first) {
V1 tmp;
load_interleaved(tmp, std::addressof(*first));
f(tmp);
store_interleaved(tmp, std::addressof(*first));
}
return f;
}
} // namespace Vc
#endif // VC_COMMON_ALGORITHMS_H_
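The documentation above mentions a "classic" functor with a templated call operator as the C++11 alternative to a generic lambda. A minimal usage sketch (assuming <Vc/Vc> and <vector> are included; Scale is a hypothetical name):
// Taking the argument by non-const reference selects the simd_for_each overload
// that stores the modified vector back to memory after calling the functor.
struct Scale {
    double factor;
    template <typename V> void operator()(V &v) const { v *= factor; }
};
void scale(std::vector<double> &data, double factor)
{
    Vc::simd_for_each(data.begin(), data.end(), Scale{factor});
}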

121
Vc/common/aliasingentryhelper.h Normal file
View File

@ -0,0 +1,121 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ALIASINGENTRYHELPER_H_
#define VC_COMMON_ALIASINGENTRYHELPER_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template<class StorageType> class AliasingEntryHelper
{
private:
typedef typename StorageType::EntryType T;
#ifdef Vc_ICC
StorageType *const m_storage;
const int m_index;
public:
Vc_ALWAYS_INLINE AliasingEntryHelper(StorageType *d, int index) : m_storage(d), m_index(index) {}
Vc_ALWAYS_INLINE AliasingEntryHelper(const AliasingEntryHelper &) = default;
Vc_ALWAYS_INLINE AliasingEntryHelper(AliasingEntryHelper &&) = default;
Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
m_storage->assign(m_index, rhs);
return *this;
}
Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_storage->assign(m_index, x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator +=(T x) { m_storage->assign(m_index, m_storage->m(m_index) + x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator -=(T x) { m_storage->assign(m_index, m_storage->m(m_index) - x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator /=(T x) { m_storage->assign(m_index, m_storage->m(m_index) / x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator *=(T x) { m_storage->assign(m_index, m_storage->m(m_index) * x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator |=(T x) { m_storage->assign(m_index, m_storage->m(m_index) | x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator &=(T x) { m_storage->assign(m_index, m_storage->m(m_index) & x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator ^=(T x) { m_storage->assign(m_index, m_storage->m(m_index) ^ x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator %=(T x) { m_storage->assign(m_index, m_storage->m(m_index) % x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_storage->assign(m_index, m_storage->m(m_index)<< x); return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_storage->assign(m_index, m_storage->m(m_index)>> x); return *this; }
#define m_data m_storage->read(m_index)
#else
typedef T A Vc_MAY_ALIAS;
A &m_data;
public:
template<typename T2>
Vc_ALWAYS_INLINE AliasingEntryHelper(T2 &d) : m_data(reinterpret_cast<A &>(d)) {}
Vc_ALWAYS_INLINE AliasingEntryHelper(A &d) : m_data(d) {}
Vc_ALWAYS_INLINE AliasingEntryHelper &operator=(const AliasingEntryHelper &rhs) {
m_data = rhs.m_data;
return *this;
}
Vc_ALWAYS_INLINE AliasingEntryHelper &operator =(T x) { m_data = x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator+=(T x) { m_data += x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator-=(T x) { m_data -= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator/=(T x) { m_data /= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator*=(T x) { m_data *= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator|=(T x) { m_data |= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator&=(T x) { m_data &= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator^=(T x) { m_data ^= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator%=(T x) { m_data %= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator<<=(T x) { m_data <<= x; return *this; }
Vc_ALWAYS_INLINE AliasingEntryHelper &operator>>=(T x) { m_data >>= x; return *this; }
#endif
Vc_ALWAYS_INLINE Vc_PURE operator const T() const { return m_data; }
Vc_ALWAYS_INLINE Vc_PURE bool operator==(T x) const { return static_cast<T>(m_data) == x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator!=(T x) const { return static_cast<T>(m_data) != x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator<=(T x) const { return static_cast<T>(m_data) <= x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator>=(T x) const { return static_cast<T>(m_data) >= x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator< (T x) const { return static_cast<T>(m_data) < x; }
Vc_ALWAYS_INLINE Vc_PURE bool operator> (T x) const { return static_cast<T>(m_data) > x; }
Vc_ALWAYS_INLINE Vc_PURE T operator-() const { return -static_cast<T>(m_data); }
Vc_ALWAYS_INLINE Vc_PURE T operator~() const { return ~static_cast<T>(m_data); }
Vc_ALWAYS_INLINE Vc_PURE T operator+(T x) const { return static_cast<T>(m_data) + x; }
Vc_ALWAYS_INLINE Vc_PURE T operator-(T x) const { return static_cast<T>(m_data) - x; }
Vc_ALWAYS_INLINE Vc_PURE T operator/(T x) const { return static_cast<T>(m_data) / x; }
Vc_ALWAYS_INLINE Vc_PURE T operator*(T x) const { return static_cast<T>(m_data) * x; }
Vc_ALWAYS_INLINE Vc_PURE T operator|(T x) const { return static_cast<T>(m_data) | x; }
Vc_ALWAYS_INLINE Vc_PURE T operator&(T x) const { return static_cast<T>(m_data) & x; }
Vc_ALWAYS_INLINE Vc_PURE T operator^(T x) const { return static_cast<T>(m_data) ^ x; }
Vc_ALWAYS_INLINE Vc_PURE T operator%(T x) const { return static_cast<T>(m_data) % x; }
//T operator<<(T x) const { return static_cast<T>(m_data) << x; }
//T operator>>(T x) const { return static_cast<T>(m_data) >> x; }
#ifdef m_data
#undef m_data
#endif
};
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_ALIASINGENTRYHELPER_H_

137
Vc/common/alignedbase.h Normal file
View File

@ -0,0 +1,137 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ALIGNEDBASE_H_
#define VC_COMMON_ALIGNEDBASE_H_
#include "types.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
/**\internal
* Break the recursion of the function below.
*/
template <typename T> constexpr T max(T a) { return a; }
/**\internal
* \returns the maximum of all specified arguments.
*/
template <typename T, typename... Ts> constexpr T max(T a, T b, Ts... rest)
{
return a > b ? max(a, rest...) : max(b, rest...);
}
} // namespace Detail
namespace Common
{
template <std::size_t> Vc_INTRINSIC void *aligned_malloc(std::size_t);
Vc_ALWAYS_INLINE void free(void *);
} // namespace Common
/**
* \ingroup Utilities
*
* Helper class to ensure a given alignment.
*
* This class reimplements the \c new and \c delete operators to align objects allocated
* on the heap suitably with the specified alignment \c Alignment.
*
* \see Vc::VectorAlignedBase
* \see Vc::MemoryAlignedBase
*/
template <std::size_t Alignment> struct alignas(Alignment) AlignedBase
{
Vc_FREE_STORE_OPERATORS_ALIGNED(Alignment);
};
/**
* \ingroup Utilities
*
* Helper type to ensure suitable alignment for any Vc::Vector<T> type (using the default
* VectorAbi).
*
* This class reimplements the \c new and \c delete operators to align objects allocated
* on the heap suitably for objects of Vc::Vector<T> type. This is necessary since the
* standard \c new operator does not adhere to the alignment requirements of the type.
*
* \see Vc::VectorAlignedBaseT
* \see Vc::MemoryAlignedBase
* \see Vc::AlignedBase
*/
using VectorAlignedBase = AlignedBase<
Detail::max(alignof(Vector<float>), alignof(Vector<double>), alignof(Vector<ullong>),
alignof(Vector<llong>), alignof(Vector<ulong>), alignof(Vector<long>),
alignof(Vector<uint>), alignof(Vector<int>), alignof(Vector<ushort>),
alignof(Vector<short>), alignof(Vector<uchar>), alignof(Vector<schar>))>;
/**
* \ingroup Utilities
* Variant of the above type ensuring suitable alignment only for the specified vector
* type \p V.
*
* \see Vc::VectorAlignedBase
* \see Vc::MemoryAlignedBaseT
*/
template <typename V> using VectorAlignedBaseT = AlignedBase<alignof(V)>;
/**
* \ingroup Utilities
*
* Helper class to ensure suitable alignment for arrays of scalar objects for any
* Vc::Vector<T> type (using the default VectorAbi).
*
* This class reimplements the \c new and \c delete operators to align objects allocated
* on the heap suitably for arrays of type \p Vc::Vector<T>::EntryType. Subsequent load
* and store operations are safe to use the aligned variant.
*
* \see Vc::MemoryAlignedBaseT
* \see Vc::VectorAlignedBase
* \see Vc::AlignedBase
*/
using MemoryAlignedBase = AlignedBase<
Detail::max(Vector<float>::MemoryAlignment, Vector<double>::MemoryAlignment,
Vector<ullong>::MemoryAlignment, Vector<llong>::MemoryAlignment,
Vector<ulong>::MemoryAlignment, Vector<long>::MemoryAlignment,
Vector<uint>::MemoryAlignment, Vector<int>::MemoryAlignment,
Vector<ushort>::MemoryAlignment, Vector<short>::MemoryAlignment,
Vector<uchar>::MemoryAlignment, Vector<schar>::MemoryAlignment)>;
/**
* \ingroup Utilities
* Variant of the above type ensuring suitable alignment only for the specified vector
* type \p V.
*
* \see Vc::MemoryAlignedBase
* \see Vc::VectorAlignedBaseT
*/
template <typename V> using MemoryAlignedBaseT = AlignedBase<V::MemoryAlignment>;
}
#endif // VC_COMMON_ALIGNEDBASE_H_
// vim: foldmethod=marker
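A short usage sketch for the helpers above (assuming the Vc headers; Cell is a hypothetical type): deriving from Vc::VectorAlignedBase swaps in the aligned operator new/delete, so heap-allocated objects satisfy the alignment of their Vc::Vector members.
struct Cell : public Vc::VectorAlignedBase {
    Vc::float_v density;   // members require Vector alignment
    Vc::float_v pressure;
};
void example()
{
    Cell *c = new Cell;    // uses the aligned operator new from AlignedBase
    delete c;              // and the matching aligned operator delete
}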

62
Vc/common/bitscanintrinsics.h Normal file
View File

@ -0,0 +1,62 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_BITSCANINTRINSICS_H_
#define VC_COMMON_BITSCANINTRINSICS_H_
#if defined(Vc_GCC) || defined(Vc_CLANG) || defined(Vc_APPLECLANG)
#include <x86intrin.h>
# ifndef _bit_scan_forward
# define _bit_scan_forward(x) __builtin_ctz(x)
#include "macros.h"
static Vc_ALWAYS_INLINE Vc_CONST int _Vc_bit_scan_reverse_asm(unsigned int x) {
int r;
__asm__("bsr %1,%0" : "=r"(r) : "X"(x));
return r;
}
# define _bit_scan_reverse(x) _Vc_bit_scan_reverse_asm(x)
# endif
#elif defined(_WIN32)
#include <intrin.h>
static inline __forceinline unsigned long _bit_scan_forward(unsigned long x) {
unsigned long index;
_BitScanForward(&index, x);
return index;
}
static inline __forceinline unsigned long _bit_scan_reverse(unsigned long x) {
unsigned long index;
_BitScanReverse(&index, x);
return index;
}
#elif defined(Vc_ICC)
// for all I know ICC supports the _bit_scan_* intrinsics
#else
// just assume the compiler can do it
#endif
#endif // VC_COMMON_BITSCANINTRINSICS_H_

92
Vc/common/const.h Normal file
View File

@ -0,0 +1,92 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_CONST_H_
#define VC_COMMON_CONST_H_
#include <type_traits>
#include "../global.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, true>);
template <int exponent> constexpr double exponentToFloat(std::integral_constant<bool, false>);
template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, true>)
{
return 1.;
}
template <> constexpr double exponentToFloat<0>(std::integral_constant<bool, false>)
{
return 1.;
}
template <> constexpr double exponentToFloat<-32>(std::integral_constant<bool, true>)
{
return 1. / (65536. * 65536.);
}
template <> constexpr double exponentToFloat<32>(std::integral_constant<bool, false>)
{
return 65536. * 65536.;
}
template <> constexpr double exponentToFloat<-64>(std::integral_constant<bool, true>)
{
return 1. / (65536. * 65536. * 65536. * 65536.);
}
template <> constexpr double exponentToFloat<64>(std::integral_constant<bool, false>)
{
return 65536. * 65536. * 65536. * 65536.;
}
template <int exponent>
constexpr double exponentToFloat(std::integral_constant<bool, false> negative)
{
return exponentToFloat<exponent - 1>(negative) * 2.0;
}
template <int exponent>
constexpr double exponentToFloat(std::integral_constant<bool, true> negative)
{
return exponentToFloat<exponent + 1>(negative) * 0.5;
}
template <int sign, unsigned long long mantissa, int exponent> constexpr double doubleConstant()
{
return (static_cast<double>((mantissa & 0x000fffffffffffffull) | 0x0010000000000000ull) /
0x0010000000000000ull) *
exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>()) * sign;
}
template <int sign, unsigned int mantissa, int exponent> constexpr float floatConstant()
{
return (static_cast<float>((mantissa & 0x007fffffu) | 0x00800000u) / 0x00800000u) *
static_cast<float>(
exponentToFloat<exponent>(std::integral_constant<bool, (exponent < 0)>())) *
sign;
}
} // namespace Detail
} // namespace Vc
#endif // VC_COMMON_CONST_H_
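The helpers above assemble IEEE-754 constants at compile time from a sign, the mantissa field, and a power-of-two exponent. A hand-checked sketch of how a constant like π would be expressed with them (for illustration; not part of this file):
// (1 + 0x921fb54442d18 / 2^52) * 2^1 * (+1) == 3.14159265358979...
constexpr double pi = Vc::Detail::doubleConstant<1, 0x921fb54442d18ull, 1>();
static_assert(pi > 3.14159 && pi < 3.1416, "sanity check");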

43
Vc/common/data.h Normal file
View File

@ -0,0 +1,43 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_CONST_DATA_H_
#define VC_COMMON_CONST_DATA_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
alignas(64) extern unsigned int RandomState[];
alignas(32) extern const unsigned int AllBitsSet[8];
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_CONST_DATA_H_

91
Vc/common/deinterleave.h Normal file
View File

@ -0,0 +1,91 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_DEINTERLEAVE_H_
#define VC_COMMON_DEINTERLEAVE_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \ingroup Vectors
*
* \deprecated Turn to InterleavedMemoryWrapper for a more flexible and complete solution.
*
* Loads two vectors of values from an interleaved array.
*
* \param a, b The vectors to load the values from memory into.
* \param memory The memory location where to read the next 2 * V::Size values from
* \param align Either pass Vc::Aligned or Vc::Unaligned. It defaults to Vc::Aligned if nothing is
* specified.
*
* If you store your data as
* \code
* struct { float x, y; } m[1000];
* \endcode
* then the deinterleave function allows you to read \p Size concurrent x and y values like this:
* \code
* Vc::float_v x, y;
* Vc::deinterleave(&x, &y, &m[10], Vc::Unaligned);
* \endcode
* This code will load m[10], m[12], m[14], ... into \p x and m[11], m[13], m[15], ... into \p y.
*
* The deinterleave function supports the following type combinations:
\verbatim
V \ M | float | double | ushort | short | uint | int
=========|=======|========|========|=======|======|=====
float_v | X | | X | X | |
---------|-------|--------|--------|-------|------|-----
double_v | | X | | | |
---------|-------|--------|--------|-------|------|-----
int_v | | | | X | | X
---------|-------|--------|--------|-------|------|-----
uint_v | | | X | | X |
---------|-------|--------|--------|-------|------|-----
short_v | | | | X | |
---------|-------|--------|--------|-------|------|-----
ushort_v | | | X | | |
\endverbatim
*/
template<typename V, typename M, typename A> Vc_ALWAYS_INLINE void deinterleave(V *a, V *b,
const M *memory, A align)
{
Detail::deinterleave(*a, *b, memory, align);
}
// documented as default for align above
template<typename V, typename M> Vc_ALWAYS_INLINE void deinterleave(V *a, V *b,
const M *memory)
{
Detail::deinterleave(*a, *b, memory, Aligned);
}
} // namespace Vc
#endif // VC_COMMON_DEINTERLEAVE_H_
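The deprecation note above points to InterleavedMemoryWrapper. A rough sketch of the equivalent access for the x/y struct from the example (based on Vc's interleaved-memory interface; treat the exact calls as an assumption):
struct XY { float x, y; };
XY m[1000];
void read_xy(Vc::float_v &x, Vc::float_v &y)
{
    Vc::InterleavedMemoryWrapper<XY, Vc::float_v> wrapper(&m[0]);
    Vc::tie(x, y) = wrapper[10];   // deinterleaves m[10].x... into x and m[10].y... into y
}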

137
Vc/common/detail.h Normal file
View File

@ -0,0 +1,137 @@
/* This file is part of the Vc library. {{{
Copyright © 2018 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_DETAIL_H_
#define VC_COMMON_DETAIL_H_
#include <vector>
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
// convertIndexVector {{{
// if the argument is a Vector<T> already we definitely want to keep it that way
template <typename IV>
Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
sizeof(typename IV::EntryType) >= sizeof(int)),
const IV &>
convertIndexVector(const IV &indexVector)
{
return indexVector;
}
// but if the scalar (integral) type is smaller than int we convert it up to int. Otherwise it's
// very likely that the calculations we have to perform will overflow.
template <typename IV>
Vc_INTRINSIC enable_if<(Traits::is_simd_vector<IV>::value &&
sizeof(typename IV::EntryType) < sizeof(int)),
fixed_size_simd<int, IV::Size>>
convertIndexVector(const IV &indexVector)
{
return static_cast<fixed_size_simd<int, IV::Size>>(indexVector);
}
// helper for promoting int types to int or higher
template <class T> using promoted_type = decltype(std::declval<T>() + 1);
// std::array, Vc::array, and C-array are fixed size and can therefore be converted to a
// fixed_size_simd of the same size
template <typename T, std::size_t N>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
convertIndexVector(const std::array<T, N> &indexVector)
{
return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
Vc::Unaligned};
}
template <typename T, std::size_t N>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
convertIndexVector(const Vc::array<T, N> &indexVector)
{
return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
Vc::Unaligned};
}
template <typename T, std::size_t N>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, fixed_size_simd<promoted_type<T>, N>>
convertIndexVector(const T (&indexVector)[N])
{
return fixed_size_simd<promoted_type<T>, N>{std::addressof(indexVector[0]),
Vc::Unaligned};
}
// a plain pointer won't work. Because we need some information on the number of values in
// the index argument
#ifndef Vc_MSVC
// MSVC treats the function as usable in SFINAE context if it is deleted. If it's not declared we
// seem to get what we wanted (except for bad diagnostics)
template <class T>
enable_if<std::is_pointer<T>::value, void> convertIndexVector(T indexVector) = delete;
#endif
// an initializer_list works, but is runtime-sized (before C++14, at least) so we have to
// fall back to std::vector
template <typename T>
Vc_INTRINSIC std::vector<promoted_type<T>> convertIndexVector(
const std::initializer_list<T> &indexVector)
{
return {begin(indexVector), end(indexVector)};
}
// a std::vector cannot be converted to anything better
template <typename T>
Vc_INTRINSIC
enable_if<(std::is_integral<T>::value && sizeof(T) >= sizeof(int)), std::vector<T>>
convertIndexVector(const std::vector<T> &indexVector)
{
return indexVector;
}
template <typename T>
Vc_INTRINSIC enable_if<(std::is_integral<T>::value && sizeof(T) < sizeof(int)),
std::vector<promoted_type<T>>>
convertIndexVector(const std::vector<T> &indexVector)
{
return {std::begin(indexVector), std::end(indexVector)};
}
template <class T,
class = enable_if<
(!std::is_pointer<T>::value && !Traits::is_simd_vector<T>::value &&
!std::is_lvalue_reference<decltype(std::declval<const T &>()[0])>::value)>>
Vc_INTRINSIC const T &convertIndexVector(const T &i)
{
return i;
}
// }}}
} // namespace Common
} // namespace Vc_VERSIONED_NAMESPACE
#endif // VC_COMMON_DETAIL_H_
// vim: foldmethod=marker

178
Vc/common/elementreference.h Normal file
View File

@ -0,0 +1,178 @@
/* This file is part of the Vc library. {{{
Copyright © 2016 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ELEMENTREFERENCE_H_
#define VC_COMMON_ELEMENTREFERENCE_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename U, typename Accessor = U> class ElementReference
{
friend U;
friend Accessor;
Vc_INTRINSIC ElementReference(U &o, int i) noexcept : index(i), obj(o) {}
static constexpr bool get_noexcept =
noexcept(Accessor::get(std::declval<U &>(), int()));
template <typename T> static constexpr bool set_noexcept()
{
return noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>()));
}
public:
using value_type = typename U::value_type;
Vc_INTRINSIC ElementReference(const ElementReference &) = delete;
/**
* Move Constructor
*
* This is the only way to construct an ElementReference in user code.
*
* \note
* Please be aware that this class models the concept of a reference
* and as such it can have the same lifetime issue as a standard C++
* reference.
*
* \note
* C++17 supports guaranteed copy elision, which allows returning the
* ElementReference obtained via operator[] from a function without
* copying. C++11 and C++14 do not offer this, so we add the move
* constructor to let them move the data and thus avoid copying (which
* the deleted copy constructor above would otherwise prohibit).
*/
Vc_INTRINSIC ElementReference(ElementReference &&) = default;
Vc_INTRINSIC operator value_type() const noexcept(get_noexcept)
{
return Accessor::get(obj, index);
}
template <typename T>
Vc_INTRINSIC ElementReference &operator=(T &&x) &&
noexcept(noexcept(Accessor::set(std::declval<U &>(), int(), std::declval<T>())))
{
Accessor::set(obj, index, std::forward<T>(x));
return *this;
}
// TODO: improve with operator.()
#define Vc_OP_(op_) \
template <typename T, typename R = decltype(std::declval<const value_type &>() \
op_ std::declval<T>())> \
Vc_INTRINSIC ElementReference &operator op_##=(T &&x) && \
noexcept(get_noexcept && noexcept(Accessor::set(std::declval<U &>(), int(), \
std::declval<R &&>()))) \
{ \
const value_type &lhs = Accessor::get(obj, index); \
Accessor::set(obj, index, lhs op_ std::forward<T>(x)); \
return *this; \
}
Vc_ALL_ARITHMETICS(Vc_OP_);
Vc_ALL_SHIFTS(Vc_OP_);
Vc_ALL_BINARY(Vc_OP_);
#undef Vc_OP_
template <typename = void>
Vc_INTRINSIC ElementReference &operator++() &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(++std::declval<value_type &>())>())
{
value_type x = Accessor::get(obj, index);
Accessor::set(obj, index, ++x);
return *this;
}
template <typename = void>
Vc_INTRINSIC value_type operator++(int) &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(std::declval<value_type &>()++)>())
{
const value_type r = Accessor::get(obj, index);
value_type x = r;
Accessor::set(obj, index, ++x);
return r;
}
template <typename = void>
Vc_INTRINSIC ElementReference &operator--() &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(--std::declval<value_type &>())>())
{
value_type x = Accessor::get(obj, index);
Accessor::set(obj, index, --x);
return *this;
}
template <typename = void>
Vc_INTRINSIC value_type operator--(int) &&
noexcept(noexcept(std::declval<value_type &>() =
Accessor::get(std::declval<U &>(), int())) &&
set_noexcept<decltype(std::declval<value_type &>()--)>())
{
const value_type r = Accessor::get(obj, index);
value_type x = r;
Accessor::set(obj, index, --x);
return r;
}
friend void swap(ElementReference &&a, ElementReference &&b) {
value_type tmp(a);
static_cast<ElementReference &&>(a) = static_cast<value_type>(b);
static_cast<ElementReference &&>(b) = tmp;
}
friend void swap(value_type &a, ElementReference &&b) {
value_type tmp(a);
a = static_cast<value_type>(b);
static_cast<ElementReference &&>(b) = tmp;
}
friend void swap(ElementReference &&a, value_type &b) {
value_type tmp(a);
static_cast<ElementReference &&>(a) = b;
b = tmp;
}
private:
int index;
U &obj;
};
} // namespace Detail
} // namespace Vc
#endif // VC_COMMON_ELEMENTREFERENCE_H_
// vim: foldmethod=marker
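What the proxy class above enables in user code (a sketch, assuming <Vc/Vc>): subscripting a Vc vector yields an ElementReference rather than a plain lvalue, so reads and compound assignments are routed through the owning vector's get/set accessors.
void bump(Vc::float_v &v)
{
    v[2] += 1.f;        // operator+= on the rvalue proxy: get, modify, set
    float x = v[2];     // implicit conversion back to value_type
    (void)x;
}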

91
Vc/common/exponential.h Normal file
View File

@ -0,0 +1,91 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-------------------------------------------------------------------
The exp implementation is derived from Cephes, which carries the
following Copyright notice:
Cephes Math Library Release 2.2: June, 1992
Copyright 1984, 1987, 1989 by Stephen L. Moshier
Direct inquiries to 30 Frost Street, Cambridge, MA 02140
}}}*/
#ifdef Vc_COMMON_MATH_H_INTERNAL
constexpr float log2_e = 1.44269504088896341f;
// These constants are adjusted to account for single-precision floating point.
// The originals are for double precision:
//
// constexpr float MAXLOGF = 88.72283905206835f;
// constexpr float MINLOGF = -103.278929903431851103f; /* log(2^-149) */
constexpr float MAXLOGF = 88.722831726074219f; /* log(2^127.99998474121094f) */
constexpr float MINLOGF = -88.029685974121094f; /* log(2^-126.99999237060547f) */
constexpr float MAXNUMF = 3.4028234663852885981170418348451692544e38f;
template <typename Abi, typename = enable_if<std::is_same<Abi, VectorAbi::Sse>::value ||
std::is_same<Abi, VectorAbi::Avx>::value>>
inline Vector<float, detail::not_fixed_size_abi<Abi>> exp(Vector<float, Abi> x)
{
using V = Vector<float, Abi>;
typedef typename V::Mask M;
typedef Detail::Const<float, Abi> C;
const M overflow = x > MAXLOGF;
const M underflow = x < MINLOGF;
// log₂(eˣ) = x * log₂(e) * log₂(2)
// = log₂(2^(x * log₂(e)))
// => eˣ = 2^(x * log₂(e))
// => n = ⌊x * log₂(e) + ½⌋
// => y = x - n * ln(2) | recall that: ln(2) * log₂(e) == 1
// <=> eˣ = 2ⁿ * eʸ
V z = floor(C::log2_e() * x + 0.5f);
const auto n = static_cast<Vc::SimdArray<int, V::Size>>(z);
x -= z * C::ln2_large();
x -= z * C::ln2_small();
/* Theoretical peak relative error in [-0.5, +0.5] is 4.2e-9. */
z = ((((( 1.9875691500E-4f * x
+ 1.3981999507E-3f) * x
+ 8.3334519073E-3f) * x
+ 4.1665795894E-2f) * x
+ 1.6666665459E-1f) * x
+ 5.0000001201E-1f) * (x * x)
+ x
+ 1.0f;
x = ldexp(z, n); // == z * 2ⁿ
x(overflow) = std::numeric_limits<typename V::EntryType>::infinity();
x.setZero(underflow);
return x;
}
#endif // Vc_COMMON_MATH_H_INTERNAL
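A scalar sketch of the same Cephes-style range reduction (a restatement for illustration, assuming <cmath>; the vector code above performs the identical steps element-wise and additionally handles overflow and underflow):
float exp_sketch(float x)
{
    // e^x = 2^n * e^y with n = floor(x * log2(e) + 0.5) and y = x - n * ln(2);
    // ln(2) is split into a large and a small part for accuracy (Cephes constants).
    const float n = std::floor(1.44269504088896341f * x + 0.5f);
    float y = x - n * 0.693359375f;
    y -= n * -2.12194440e-4f;
    float z = (((((1.9875691500E-4f * y + 1.3981999507E-3f) * y + 8.3334519073E-3f) * y
                + 4.1665795894E-2f) * y + 1.6666665459E-1f) * y + 5.0000001201E-1f)
              * (y * y) + y + 1.0f;
    return std::ldexp(z, static_cast<int>(n));   // == z * 2^n
}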

79
Vc/common/fix_clang_emmintrin.h Normal file
View File

@ -0,0 +1,79 @@
/*{{{
Copyright (C) 2013-2015 Matthias Kretz <kretz@kde.org>
Permission to use, copy, modify, and distribute this software
and its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appear in all
copies and that both that the copyright notice and this
permission notice and warranty disclaimer appear in supporting
documentation, and that the name of the author not be used in
advertising or publicity pertaining to distribution of the
software without specific, written prior permission.
The author disclaim all warranties with regard to this
software, including all implied warranties of merchantability
and fitness. In no event shall the author be liable for any
special, indirect or consequential damages or any damages
whatsoever resulting from loss of use, data or profits, whether
in an action of contract, negligence or other tortious action,
arising out of or in connection with the use or performance of
this software.
}}}*/
#ifndef VC_COMMON_FIX_CLANG_EMMINTRIN_H_
#define VC_COMMON_FIX_CLANG_EMMINTRIN_H_
#include "../global.h"
#if (defined Vc_CLANG && Vc_CLANG < 0x30700) || (defined Vc_APPLECLANG && Vc_APPLECLANG < 0x70000)
#ifdef _mm_slli_si128
#undef _mm_slli_si128
#define _mm_slli_si128(a, count) __extension__ ({ \
(__m128i)__builtin_ia32_pslldqi128((__m128i)(a), (count)*8); })
#endif
#ifdef _mm_srli_si128
#undef _mm_srli_si128
#define _mm_srli_si128(a, count) __extension__ ({ \
(__m128i)__builtin_ia32_psrldqi128((__m128i)(a), (count)*8); })
#endif
#ifdef _mm_shuffle_epi32
#undef _mm_shuffle_epi32
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), (__v4si) _mm_set1_epi32(0), \
(imm) & 0x3, ((imm) & 0xc) >> 2, \
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
#endif
#ifdef _mm_shufflelo_epi16
#undef _mm_shufflelo_epi16
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
(imm) & 0x3, ((imm) & 0xc) >> 2, \
((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
4, 5, 6, 7); })
#endif
#ifdef _mm_shufflehi_epi16
#undef _mm_shufflehi_epi16
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), (__v8hi) _mm_set1_epi16(0), \
0, 1, 2, 3, \
4 + (((imm) & 0x03) >> 0), \
4 + (((imm) & 0x0c) >> 2), \
4 + (((imm) & 0x30) >> 4), \
4 + (((imm) & 0xc0) >> 6)); })
#endif
#ifdef _mm_shuffle_pd
#undef _mm_shuffle_pd
#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
__builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, (((i) & 2) >> 1) + 2); })
#endif
#endif // Vc_CLANG || Vc_APPLECLANG
#endif // VC_COMMON_FIX_CLANG_EMMINTRIN_H_

318
Vc/common/gatherimplementation.h Normal file
View File

@ -0,0 +1,318 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_GATHERIMPLEMENTATION_H_
#define VC_COMMON_GATHERIMPLEMENTATION_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
enum class GatherScatterImplementation : int {
SimpleLoop,
SetIndexZero,
BitScanLoop,
PopcntSwitch
};
using SimpleLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SimpleLoop>;
using SetIndexZeroT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::SetIndexZero>;
using BitScanLoopT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::BitScanLoop>;
using PopcntSwitchT = std::integral_constant<GatherScatterImplementation, GatherScatterImplementation::PopcntSwitch>;
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(SetIndexZeroT,
V &v,
const MT *mem,
IT &&indexes_,
typename V::MaskArgument mask)
{
auto indexes = std::forward<IT>(indexes_);
indexes.setZeroInverted(static_cast<decltype(!indexes)>(mask));
const V tmp(mem, indexes);
where(mask) | v = tmp;
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(SimpleLoopT, V &v, const MT *mem, const IT &indexes,
const typename V::MaskArgument mask)
{
if (Vc_IS_UNLIKELY(mask.isEmpty())) {
return;
}
#if defined Vc_GCC && Vc_GCC >= 0x40900
// GCC 4.8 doesn't support dependent type and constexpr vector_size argument
constexpr std::size_t Sizeof = sizeof(V);
using Builtin [[gnu::vector_size(Sizeof)]] = typename V::value_type;
Builtin tmp = reinterpret_cast<Builtin>(v.data());
Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
if (mask[i]) {
tmp[i] = mem[indexes[i]];
}
});
v.data() = reinterpret_cast<typename V::VectorType>(tmp);
#else
Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
if (mask[i])
v[i] = mem[indexes[i]];
});
#endif
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(BitScanLoopT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask)
{
#ifdef Vc_GNU_ASM
size_t bits = mask.toInt();
while (Vc_IS_LIKELY(bits > 0)) {
size_t i, j;
asm("bsf %[bits],%[i]\n\t"
"bsr %[bits],%[j]\n\t"
"btr %[i],%[bits]\n\t"
"btr %[j],%[bits]\n\t"
: [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
v[i] = mem[indexes[i]];
v[j] = mem[indexes[j]];
}
#else
// Alternative from Vc::SSE (0.7)
int bits = mask.toInt();
while (bits) {
const int i = _bit_scan_forward(bits);
bits &= bits - 1;
v[i] = mem[indexes[i]];
}
#endif // Vc_GNU_ASM
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 16> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt16(bits)) {
case 16:
v.gather(mem, indexes);
break;
case 15:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
v[low] = mem[indexes[low]];
// fallthrough
case 14:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 13:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 12:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 11:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 10:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 9:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 8:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 7:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 6:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 4:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 2:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
// fallthrough
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
// fallthrough
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 8> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt8(bits)) {
case 8:
v.gather(mem, indexes);
break;
case 7:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
v[low] = mem[indexes[low]];
// fallthrough
case 6:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 4:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
high = (1 << high);
// fallthrough
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
v[low] = mem[indexes[low]];
// fallthrough
case 2:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
// fallthrough
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
// fallthrough
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 4> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt4(bits)) {
case 4:
v.gather(mem, indexes);
break;
case 3:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
v[low] = mem[indexes[low]];
// fallthrough
case 2:
high = _bit_scan_reverse(bits);
v[high] = mem[indexes[high]];
// fallthrough
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
// fallthrough
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeGather(PopcntSwitchT,
V &v,
const MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 2> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low;
switch (Vc::Detail::popcnt4(bits)) {
case 2:
v.gather(mem, indexes);
break;
case 1:
low = _bit_scan_forward(bits);
v[low] = mem[indexes[low]];
// fallthrough
case 0:
break;
}
}
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_GATHERIMPLEMENTATION_H_
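The BitScanLoop and PopcntSwitch strategies above touch only the active lanes by peeling set bits off the mask. The core idea in portable scalar form (a sketch, assuming GCC/Clang's __builtin_ctz and a mask of at most 32 bits):
void gather_active(float *v, const float *mem, const int *indexes, unsigned mask)
{
    while (mask != 0) {
        const int i = __builtin_ctz(mask);   // lowest set bit = next active lane
        mask &= mask - 1;                    // clear that bit
        v[i] = mem[indexes[i]];              // gather only where the mask was true
    }
}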

221
Vc/common/gatherinterface.h Normal file
View File

@ -0,0 +1,221 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef Vc_CURRENT_CLASS_NAME
#error "incorrect use of common/gatherinterface.h: Vc_CURRENT_CLASS_NAME must be defined to the current class name for declaring constructors."
#endif
///////////////////////////////////////////////////////////////////////////////////////////
// gathers
// A gather takes the following arguments:
// 1. A const pointer to memory of any type that can convert to EntryType
// 2. An indexes “vector”. The requirement is that the type implements the subscript operator,
// stores «Size» valid index values, and each offset to the pointer above yields a valid
// memory location for reading.
// 3. Optionally the third argument may be a mask. The mask disables several memory reads and
// thus removes the requirements in (2.) for the disabled entries.
private:
/**\internal
* This function implements a gather given a pointer to memory \p mem and some
* container object storing the gather \p indexes.
*
* \param mem This pointer must be aligned correctly for the type \p MT. This is the
* natural behavior of C++, so this is typically the case.
* \param indexes This object contains at least \VSize{T} indexes that denote the
* offset in \p mem where the components for the current vector should be copied from.
* The offset is not in Bytes, but in multiples of `sizeof(MT)`.
*/
// enable_if<std::can_convert<MT, EntryType>::value &&
// has_subscript_operator<IT>::value>
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &);
/**\internal
* This overload of the above function adds a \p mask argument to disable memory
* accesses at the \p indexes offsets where \p mask is \c false.
*/
template <class MT, class IT, int Scale = 1>
inline void gatherImplementation(const Common::GatherArguments<MT, IT, Scale> &,
MaskArgument mask);
public:
#define Vc_ASSERT_GATHER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<MT, EntryType>::value, \
"The memory pointer needs to point to a type that can be converted to the " \
"EntryType of this SIMD vector type."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
/**
* \name Gather constructors and member functions
*
* Constructs or loads a vector from the objects at `mem[indexes[0]]`,
* `mem[indexes[1]]`, `mem[indexes[2]]`, ...
*
* All gather functions optionally take a mask as last argument. In that case only the
* entries that are selected in the mask are accessed in memory and copied to the
* vector. This enables invalid indexes in the \p indexes vector if those are masked
* off in \p mask.
*
* Gathers from structured data (AoS: arrays of struct) are possible via a special
* subscript operator of the container (array). You can use \ref Vc::array and \ref
* Vc::vector as drop-in replacements for \c std::array and \c std::vector. These
* container classes contain the necessary subscript operator overload. Example:
* \code
* Vc::vector<float> data(100);
* std::iota(data.begin(), data.end(), 0.f); // fill with values 0, 1, 2, ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v gathered = data[indexes]; // gathered == [0, 1, 2, ...]
* \endcode
*
* This also works for gathers into arrays of structures:
* \code
* struct Point { float x, y, z; };
* Vc::array<Point, 100> points;
* // fill points ...
* auto indexes = float_v::IndexType::IndexesFromZero();
* float_v xs = points[indexes][&Point::x]; // [points[0].x, points[1].x, points[2].x, ...]
* float_v ys = points[indexes][&Point::y]; // [points[0].y, points[1].y, points[2].y, ...]
* float_v zs = points[indexes][&Point::z]; // [points[0].z, points[1].z, points[2].z, ...]
* \endcode
*
* Alternatively, you can use Vc::Common::AdaptSubscriptOperator to extend a given
* container class with the necessary subscript operator. Example:
* \code
* template <typename T, typename Allocator = std::allocator<T>>
* using my_vector = Vc::Common::AdaptSubscriptOperator<std::vector<T, Allocator>>;
* \endcode
*
* \param mem A pointer to memory which contains objects of type \p MT at the offsets
* given by \p indexes.
* \param indexes A container/vector of offsets into \p mem.
* The type of \p indexes (\p IT) may either be a pointer to integers
 * (C-array) or a vector of integers (preferably IndexType).
* \param mask If a mask is given, only the active entries will be copied from memory.
*
 * \note If you use a masked gather constructor, the masked-off entries of the vector
 * are zero-initialized.
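 *
 * A masked gather sketch, reusing \c data, \c indexes, and \c gathered from the examples
 * above (the mask condition is only illustrative):
 * \code
 * float_v::MaskType mask = gathered < 50.f;     // select a subset of lanes
 * float_v partial(data.data(), indexes, mask);  // masked-off lanes are zero-initialized
 * \endcode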
*/
///@{
/// Gather constructor
template <typename MT, typename IT,
typename = enable_if<Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
/// Masked gather constructor
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const MT *mem, const IT &indexes,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC Vc_CURRENT_CLASS_NAME(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
/// Gather function
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)));
}
/// Masked gather function
template <typename MT, typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void gather(const MT *mem, const IT &indexes, MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(
Common::make_gather<1>(mem, Common::convertIndexVector(indexes)), mask);
}
///@}
#include "gatherinterface_deprecated.h"
/**\internal
* \name Gather function to use from Vc::Common::subscript_operator
*
 * \param args The prepared gather arguments (base pointer and scaled indexes).
 * \param mask Only the entries selected in \p mask are read from memory.
*/
///@{
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args);
}
template <class MT, class IT, int Scale>
Vc_INTRINSIC void gather(const Common::GatherArguments<MT, IT, Scale> &args,
MaskArgument mask)
{
Vc_ASSERT_GATHER_PARAMETER_TYPES_;
gatherImplementation(args, mask);
}
///@}
#undef Vc_ASSERT_GATHER_PARAMETER_TYPES_

View File

@ -0,0 +1,300 @@
/// \name Deprecated Members
///@{
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
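 *
 * A hedged sketch of the modern replacement (the struct and names are illustrative):
 * \code
 * struct S { float x, y, z; };
 * Vc::array<S, 100> aos;            // keep the AoS data in a Vc::array
 * float_v v = aos[indexes][&S::x];  // instead of: float_v v(aos.data(), &S::x, indexes)
 * \endcode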
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const EntryType S1::*member1,
IT indexes)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const EntryType S1::*member1,
IT indexes, MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const S2 S1::*member1,
const EntryType S2::*member2,
IT indexes)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const S2 S1::*member1,
const EntryType S2::*member2,
IT indexes, MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
 * \param ptrMember1 If \p array points to a struct, \p ptrMember1 selects the pointer member of the
 *                   struct to be followed. Thus the offsets in \p outerIndexes are relative to
 *                   \p array and not to the size of the gathered type (i.e.
 *                   array[i].*ptrMember1 is read instead of (&(array->*ptrMember1))[i]).
 * \param outerIndexes Determines the offsets into \p array, i.e. from which structs the pointer
 *                     member \p ptrMember1 is read.
 * \param innerIndexes Determines the offsets into the arrays pointed to by \p ptrMember1.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED(
"use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const EntryType *const S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes)
{
gather(Common::SubscriptOperation<const S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
 * \param ptrMember1 If \p array points to a struct, \p ptrMember1 selects the pointer member of the
 *                   struct to be followed. Thus the offsets in \p outerIndexes are relative to
 *                   \p array and not to the size of the gathered type (i.e.
 *                   array[i].*ptrMember1 is read instead of (&(array->*ptrMember1))[i]).
 * \param outerIndexes Determines the offsets into \p array, i.e. from which structs the pointer
 *                     member \p ptrMember1 is read.
 * \param innerIndexes Determines the offsets into the arrays pointed to by \p ptrMember1.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED(
"use the subscript operator to Vc::array or Vc::vector "
"instead.") inline Vc_CURRENT_CLASS_NAME(const S1 *array,
const EntryType *const S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes,
MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array,
const EntryType S1::*member1, IT indexes)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array,
const EntryType S1::*member1,
IT indexes,
MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array, const S2 S1::*member1,
const EntryType S2::*member2, IT indexes)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array, const S2 S1::*member1,
const EntryType S2::*member2, IT indexes,
MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.gatherArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
 * \param ptrMember1 If \p array points to a struct, \p ptrMember1 selects the pointer member of the
 *                   struct to be followed. Thus the offsets in \p outerIndexes are relative to
 *                   \p array and not to the size of the gathered type (i.e.
 *                   array[i].*ptrMember1 is read instead of (&(array->*ptrMember1))[i]).
 * \param outerIndexes Determines the offsets into \p array, i.e. from which structs the pointer
 *                     member \p ptrMember1 is read.
 * \param innerIndexes Determines the offsets into the arrays pointed to by \p ptrMember1.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array,
const EntryType *const S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes)
{
gather(Common::SubscriptOperation<const S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.gatherArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
 * \param ptrMember1 If \p array points to a struct, \p ptrMember1 selects the pointer member of the
 *                   struct to be followed. Thus the offsets in \p outerIndexes are relative to
 *                   \p array and not to the size of the gathered type (i.e.
 *                   array[i].*ptrMember1 is read instead of (&(array->*ptrMember1))[i]).
 * \param outerIndexes Determines the offsets into \p array, i.e. from which structs the pointer
 *                     member \p ptrMember1 is read.
 * \param innerIndexes Determines the offsets into the arrays pointed to by \p ptrMember1.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void gather(const S1 *array,
const EntryType *const S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes,
MaskArgument mask)
{
gather(Common::SubscriptOperation<const S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.gatherArguments(),
mask);
}
///@}

View File

@ -0,0 +1,61 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
public:
///////////////////////////////////////////////////////////////////////////
// init to zero
Vc_INTRINSIC Vector() = default;
///////////////////////////////////////////////////////////////////////////
// types
///////////////////////////////////////////////////////////////////////////
// constants
static constexpr std::size_t size() { return Size; }
///////////////////////////////////////////////////////////////////////////
// constant Vectors
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerZero) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerOne) Vc_INTRINSIC_R;
explicit Vc_INTRINSIC_L Vector(VectorSpecialInitializerIndexesFromZero) Vc_INTRINSIC_R;
static Vc_INTRINSIC Vc_CONST Vector Zero() { return Vector(Vc::Zero); }
static Vc_INTRINSIC Vc_CONST Vector One() { return Vector(Vc::One); }
static Vc_INTRINSIC Vc_CONST Vector IndexesFromZero()
{
return Vector(Vc::IndexesFromZero);
}
///////////////////////////////////////////////////////////////////////////
// generator ctor
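// The generator g is invoked with each lane index i in [0, size()). A minimal usage
// sketch (the lambda is illustrative): Vector([](size_t i) { return 2 * i; }) yields
// the entries [0, 2, 4, ...].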
template <class G, int = 0,
class = typename std::enable_if<std::is_convertible<
decltype(std::declval<G>()(size_t())), value_type>::value>::type>
explicit Vector(G &&g) : Vector(generate(std::forward<G>(g)))
{
}
// vim: foldmethod=marker

97
Vc/common/iif.h Normal file
View File

@ -0,0 +1,97 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_IIF_H_
#define VC_COMMON_IIF_H_
#include "../type_traits"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \ingroup Utilities
*
* Function to mimic the ternary operator '?:' (inline-if).
*
 * \param condition Determines which values are returned. This is analogous to the first argument to
* the ternary operator.
* \param trueValue The values to return where \p condition is \c true.
* \param falseValue The values to return where \p condition is \c false.
* \return A combination of entries from \p trueValue and \p falseValue, according to \p condition.
*
* So instead of the scalar variant
* \code
* float x = a > 1.f ? b : b + c;
* \endcode
* you'd write
* \code
* float_v x = Vc::iif (a > 1.f, b, b + c);
* \endcode
*
 * Assuming \c a has the values [0, 3, 5, 1], \c b is [1, 1, 1, 1], and \c c is [1, 2, 3, 4], then x
 * will be [2, 1, 1, 5]: where \c a is greater than 1 the entry is taken from \c b, otherwise from \c b + \c c.
*/
template <typename Mask, typename T>
Vc_ALWAYS_INLINE enable_if<is_simd_mask<Mask>::value && is_simd_vector<T>::value, T> iif(
const Mask &condition, const T &trueValue, const T &falseValue)
{
T result(falseValue);
Vc::where(condition) | result = trueValue;
return result;
}
/**\internal
* The following declaration makes it explicit that `iif (Mask, non-vector, non-vector)`
* is not supposed to work. Doing the same thing with \c static_assert would break SFINAE.
*/
template <typename Mask, typename T>
enable_if<is_simd_mask<Mask>::value && !is_simd_vector<T>::value, T> iif(
const Mask &, const T &, const T &) = delete;
/**
* \ingroup Utilities
*
* Overload of the above for boolean conditions.
*
* This typically results in direct use of the ternary operator. This function makes it easier to
* switch from a Vc type to a builtin type.
*
 * \param condition Determines which value is returned. This is analogous to the first argument to
* the ternary operator.
* \param trueValue The value to return if \p condition is \c true.
* \param falseValue The value to return if \p condition is \c false.
* \return Either \p trueValue or \p falseValue, depending on \p condition.
*/
template<typename T> constexpr T iif (bool condition, const T &trueValue, const T &falseValue)
{
return condition ? trueValue : falseValue;
}
} // namespace Vc
#endif // VC_COMMON_IIF_H_

79
Vc/common/indexsequence.h Normal file
View File

@ -0,0 +1,79 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_INDEXSEQUENCE_H_
#define VC_COMMON_INDEXSEQUENCE_H_
#include "../global.h"
namespace Vc_VERSIONED_NAMESPACE
{
/** \internal
* Helper class for a sequence of size_t values from 0 to N. This type will be included in
* C++14.
*/
template <std::size_t... I> struct index_sequence
{
static constexpr std::size_t size() noexcept { return sizeof...(I); }
};
/** \internal
* This struct builds an index_sequence type from a given upper bound \p N.
 * It does so recursively via concatenation of two index sequences of length N/2.
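 *
 * A worked trace (illustration only): make_index_sequence<5> has is_odd = true and
 * half = index_sequence<0, 1>, so join<3>(std::true_type, index_sequence<0, 1>) yields
 * index_sequence<0, 1, 2, 3, 4>.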
*/
template <std::size_t N> struct make_index_sequence_impl {
template <std::size_t Offset, std::size_t... Ns>
static index_sequence<Ns..., (Ns + Offset)...> join(std::false_type,
index_sequence<Ns...>);
template <std::size_t Offset, std::size_t... Ns>
static index_sequence<Ns..., Offset - 1, (Ns + Offset)...> join(
std::true_type, index_sequence<Ns...>);
using is_odd = std::integral_constant<bool, N & 1>;
using half = typename make_index_sequence_impl<N / 2>::type;
using type = decltype(join<(N + 1) / 2>(is_odd(), half()));
};
template <> struct make_index_sequence_impl<0> {
using type = index_sequence<>;
};
template <> struct make_index_sequence_impl<1> {
using type = index_sequence<0>;
};
template <> struct make_index_sequence_impl<2> {
using type = index_sequence<0, 1>;
};
/** \internal
* Creates an index_sequence type for the upper bound \p N.
*/
template <std::size_t N>
using make_index_sequence = typename make_index_sequence_impl<N>::type;
}
#endif // VC_COMMON_INDEXSEQUENCE_H_
// vim: foldmethod=marker

63
Vc/common/interleave.h Normal file
View File

@ -0,0 +1,63 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_INTERLEAVE_H_
#define VC_COMMON_INTERLEAVE_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/** \ingroup Utilities
 Interleaves the entries from \p a and \p b into two vectors of the same type. The returned
 vectors contain the elements in the order `a[0], b[0], a[1], b[1], a[2], b[2], a[3],
 b[3], ...`.
Example:
\code
Vc::SimdArray<int, 4> a = { 1, 2, 3, 4 };
Vc::SimdArray<int, 4> b = { 9, 8, 7, 6 };
std::tie(a, b) = Vc::interleave(a, b);
std::cout << a << b;
// prints:
// <1 9 2 8><3 7 4 6>
\endcode
\param a input vector whose data will appear at even indexes in the output
\param b input vector whose data will appear at odd indexes in the output
\return two vectors with data from \p a and \p b interleaved
*/
template <typename V, typename = enable_if<Traits::is_simd_vector<V>::value>>
std::pair<V, V> interleave(const V &a, const V &b)
{
return {a.interleaveLow(b), a.interleaveHigh(b)};
}
} // namespace Vc
#endif // VC_COMMON_INTERLEAVE_H_
// vim: foldmethod=marker

View File

@ -0,0 +1,351 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_INTERLEAVEDMEMORY_H_
#define VC_COMMON_INTERLEAVEDMEMORY_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
/**
* \internal
*/
template<typename V, typename I, bool Readonly> struct InterleavedMemoryAccessBase
{
// Partial specialization doesn't work for functions without partial specialization of the whole
// class. Therefore we capture the contents of InterleavedMemoryAccessBase in a macro to easily
// copy it into its specializations.
typedef typename std::conditional<
Readonly, typename std::add_const<typename V::EntryType>::type,
typename V::EntryType>::type T;
typedef typename V::AsArg VArg;
typedef T Ta Vc_MAY_ALIAS;
const I m_indexes;
Ta *const m_data;
Vc_ALWAYS_INLINE InterleavedMemoryAccessBase(typename I::AsArg indexes, Ta *data)
: m_indexes(indexes), m_data(data)
{
}
// implementations of the following are in {scalar,sse,avx}/detail.h
template <typename... Vs> Vc_INTRINSIC void deinterleave(Vs &&... vs) const
{
Impl::deinterleave(m_data, m_indexes, std::forward<Vs>(vs)...);
}
protected:
using Impl = Vc::Detail::InterleaveImpl<V, V::Size, sizeof(V)>;
template <typename T, std::size_t... Indexes>
Vc_INTRINSIC void callInterleave(T &&a, index_sequence<Indexes...>)
{
Impl::interleave(m_data, m_indexes, a[Indexes]...);
}
};
/**
* \internal
*/
// delay execution of the deinterleaving gather until operator=
template <size_t StructSize, typename V, typename I = typename V::IndexType,
bool Readonly>
struct InterleavedMemoryReadAccess : public InterleavedMemoryAccessBase<V, I, Readonly>
{
typedef InterleavedMemoryAccessBase<V, I, Readonly> Base;
typedef typename Base::Ta Ta;
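    // The index vector is scaled by StructSize here (using shifts for the common
    // power-of-two struct sizes) so that each index addresses the first element of its
    // struct in the flat Ta array.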
Vc_ALWAYS_INLINE InterleavedMemoryReadAccess(Ta *data, typename I::AsArg indexes)
: Base(StructSize == 1u
? indexes
: StructSize == 2u
? indexes << 1
: StructSize == 4u
? indexes << 2
: StructSize == 8u
? indexes << 3
: StructSize == 16u ? indexes << 4
: indexes * I(int(StructSize)),
data)
{
}
template <typename T, std::size_t... Indexes>
Vc_ALWAYS_INLINE T deinterleave_unpack(index_sequence<Indexes...>) const
{
T r;
Base::Impl::deinterleave(this->m_data, this->m_indexes, std::get<Indexes>(r)...);
return r;
}
template <typename T,
typename = enable_if<(std::is_default_constructible<T>::value &&
std::is_same<V, Traits::decay<decltype(std::get<0>(
std::declval<T &>()))>>::value)>>
Vc_ALWAYS_INLINE operator T() const
{
return deinterleave_unpack<T>(make_index_sequence<std::tuple_size<T>::value>());
}
};
///\internal Runtime check (skipped if NDEBUG is defined) for asserting unique indexes.
template<typename I> struct CheckIndexesUnique
{
#ifdef NDEBUG
static Vc_INTRINSIC void test(const I &) {}
#else
static void test(const I &indexes)
{
const I test = indexes.sorted();
Vc_ASSERT(I::Size == 1 || (test == test.rotated(1)).isEmpty())
}
#endif
};
///\internal For SuccessiveEntries there can never be a problem.
template<size_t S> struct CheckIndexesUnique<SuccessiveEntries<S> >
{
static Vc_INTRINSIC void test(const SuccessiveEntries<S> &) {}
};
/**
* \internal
*/
template <size_t StructSize, typename V, typename I = typename V::IndexType>
struct InterleavedMemoryAccess : public InterleavedMemoryReadAccess<StructSize, V, I, false>
{
typedef InterleavedMemoryAccessBase<V, I, false> Base;
typedef typename Base::Ta Ta;
Vc_ALWAYS_INLINE InterleavedMemoryAccess(Ta *data, typename I::AsArg indexes)
: InterleavedMemoryReadAccess<StructSize, V, I, false>(data, indexes)
{
CheckIndexesUnique<I>::test(indexes);
}
template <int N> Vc_ALWAYS_INLINE void operator=(VectorReferenceArray<N, V> &&rhs)
{
static_assert(N <= StructSize,
"You_are_trying_to_scatter_more_data_into_the_struct_than_it_has");
this->callInterleave(std::move(rhs), make_index_sequence<N>());
}
template <int N> Vc_ALWAYS_INLINE void operator=(VectorReferenceArray<N, const V> &&rhs)
{
static_assert(N <= StructSize,
"You_are_trying_to_scatter_more_data_into_the_struct_than_it_has");
this->callInterleave(std::move(rhs), make_index_sequence<N>());
}
};
/**
* Wraps a pointer to memory with convenience functions to access it via vectors.
*
* \param S The type of the struct.
* \param V The type of the vector to be returned when read. This should reflect the type of the
* members inside the struct.
*
* \see operator[]
* \ingroup Containers
* \headerfile interleavedmemory.h <Vc/Memory>
*/
template<typename S, typename V> class InterleavedMemoryWrapper
{
typedef typename std::conditional<std::is_const<S>::value,
const typename V::EntryType,
typename V::EntryType>::type T;
typedef typename V::IndexType I;
typedef typename V::AsArg VArg;
typedef const I &IndexType;
static constexpr std::size_t StructSize = sizeof(S) / sizeof(T);
using ReadAccess = InterleavedMemoryReadAccess<StructSize, V>;
using Access =
typename std::conditional<std::is_const<T>::value, ReadAccess,
InterleavedMemoryAccess<StructSize, V>>::type;
using ReadSuccessiveEntries =
InterleavedMemoryReadAccess<StructSize, V, SuccessiveEntries<StructSize>>;
using AccessSuccessiveEntries = typename std::conditional<
std::is_const<T>::value, ReadSuccessiveEntries,
InterleavedMemoryAccess<StructSize, V, SuccessiveEntries<StructSize>>>::type;
typedef T Ta Vc_MAY_ALIAS;
Ta *const m_data;
static_assert(StructSize * sizeof(T) == sizeof(S),
"InterleavedMemoryAccess_does_not_support_packed_structs");
public:
/**
* Constructs the wrapper object.
*
* \param s A pointer to a C-array.
*/
Vc_ALWAYS_INLINE InterleavedMemoryWrapper(S *s)
: m_data(reinterpret_cast<Ta *>(s))
{
}
/**
* Interleaved scatter/gather access.
*
* Assuming you have a struct of floats and a vector of \p indexes into the array, this function
* can be used to access the struct entries as vectors using the minimal number of store or load
* instructions.
*
* \param indexes Vector of indexes that determine the gather locations.
*
* \return A special (magic) object that executes the loads and deinterleave on assignment to a
* vector tuple.
*
* Example:
* \code
* struct Foo {
* float x, y, z;
* };
*
* void fillWithBar(Foo *_data, uint_v indexes)
* {
* Vc::InterleavedMemoryWrapper<Foo, float_v> data(_data);
* const float_v x = bar(1);
* const float_v y = bar(2);
* const float_v z = bar(3);
* data[indexes] = (x, y, z);
* // it's also possible to just store a subset at the front of the struct:
* data[indexes] = (x, y);
* // if you want to store a single entry, use scatter:
* z.scatter(_data, &Foo::x, indexes);
* }
*
* float_v normalizeStuff(Foo *_data, uint_v indexes)
* {
* Vc::InterleavedMemoryWrapper<Foo, float_v> data(_data);
* float_v x, y, z;
* (x, y, z) = data[indexes];
* // it is also possible to just load a subset from the front of the struct:
* // (x, y) = data[indexes];
* return Vc::sqrt(x * x + y * y + z * z);
* }
* \endcode
*
* You may think of the gather operation (or scatter as the inverse) like this:
\verbatim
Memory: {x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 x5 y5 z5 x6 y6 z6 x7 y7 z7 x8 y8 z8}
indexes: [5, 0, 1, 7]
Result in (x, y, z): ({x5 x0 x1 x7}, {y5 y0 y1 y7}, {z5 z0 z1 z7})
\endverbatim
*
* \warning If \p indexes contains non-unique entries on scatter, the result is undefined. If
* \c NDEBUG is not defined the implementation will assert that the \p indexes entries are unique.
*/
template <typename IT>
Vc_ALWAYS_INLINE enable_if<!std::is_convertible<IT, size_t>::value &&
std::is_convertible<IT, IndexType>::value &&
!std::is_const<S>::value,
Access>
operator[](IT indexes)
{
return Access(m_data, indexes);
}
/// const overload (gathers only) of the above function
Vc_ALWAYS_INLINE ReadAccess operator[](IndexType indexes) const
{
return ReadAccess(m_data, indexes);
}
/// alias of the above function
Vc_ALWAYS_INLINE ReadAccess gather(IndexType indexes) const { return operator[](indexes); }
/**
* Interleaved access.
*
* This function is an optimization of the function above, for cases where the index vector
* contains consecutive values. It will load \p V::Size consecutive entries from memory and
* deinterleave them into Vc vectors.
*
 * \param first The first of \p V::Size indices to be accessed.
*
* \return A special (magic) object that executes the loads and deinterleave on assignment to a
* vector tuple.
*
* Example:
* \code
* struct Foo {
* float x, y, z;
* };
*
* void foo(Foo *_data)
* {
* Vc::InterleavedMemoryWrapper<Foo, float_v> data(_data);
* for (size_t i = 0; i < 32U; i += float_v::Size) {
* float_v x, y, z;
* (x, y, z) = data[i];
* // now:
* // x = { _data[i].x, _data[i + 1].x, _data[i + 2].x, ... }
* // y = { _data[i].y, _data[i + 1].y, _data[i + 2].y, ... }
* // z = { _data[i].z, _data[i + 1].z, _data[i + 2].z, ... }
* ...
* }
* }
* \endcode
*/
Vc_ALWAYS_INLINE ReadSuccessiveEntries operator[](size_t first) const
{
return ReadSuccessiveEntries(m_data, first);
}
Vc_ALWAYS_INLINE AccessSuccessiveEntries operator[](size_t first)
{
return AccessSuccessiveEntries(m_data, first);
}
//Vc_ALWAYS_INLINE Access scatter(I indexes, VArg v0, VArg v1);
};
} // namespace Common
using Common::InterleavedMemoryWrapper;
/**
 * Creates an adapter around a given array of structures (AoS) that enables optimized loads
* + deinterleaving operations / interleaving operations + stores for vector access (using
* \p V).
*
* \tparam V The `Vc::Vector<T>` type to use per element of the structure.
* \param s A pointer to an array of structures containing data members of type `T`.
*
* \see Vc::Common::InterleavedMemoryWrapper
*
* \todo Support destructuring via structured bindings.
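 *
 * A minimal sketch (struct and variable names are illustrative):
 * \code
 * struct Particle { float x, y, z; };
 * Particle particles[256];
 * auto wrapper = Vc::make_interleave_wrapper<Vc::float_v>(particles);
 * Vc::float_v x, y, z;
 * (x, y, z) = wrapper[0];  // deinterleaves the first float_v::Size Particle objects
 * \endcode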
*/
template <typename V, typename S>
inline Common::InterleavedMemoryWrapper<S, V> make_interleave_wrapper(S *s)
{
return Common::InterleavedMemoryWrapper<S, V>(s);
}
} // namespace Vc
#endif // VC_COMMON_INTERLEAVEDMEMORY_H_

282
Vc/common/iterators.h Normal file
View File

@ -0,0 +1,282 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_ITERATORS_H_
#define VC_COMMON_ITERATORS_H_
#include <array>
#include <iterator>
#ifdef Vc_MSVC
#include <intrin.h> // for _BitScanForward
#endif // Vc_MSVC
#include "where.h"
#include "elementreference.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template<typename _V, typename Flags> class MemoryVector;
template<typename _V, typename Flags> class MemoryVectorIterator;
template <typename V> class Iterator;
template <typename V, bool> class IteratorBase;
template <typename V> class IteratorBase<V, true>
{
public:
using iterator_category = std::input_iterator_tag;
using value_type = typename V::value_type;
using difference_type = int;
using reference = value_type;
Vc_ALWAYS_INLINE reference operator*() const { return v()[i()]; }
Vc_ALWAYS_INLINE reference operator[](difference_type i2) const { return v()[i2]; }
private:
Vc_INTRINSIC V &v() const { return *static_cast<const Iterator<V> *>(this)->v; }
Vc_INTRINSIC difference_type i() const
{
return static_cast<const Iterator<V> *>(this)->i;
}
};
template <typename V> class IteratorBase<V, false>
{
public:
using iterator_category = std::input_iterator_tag;
using value_type = typename V::value_type;
using difference_type = int;
using reference = Vc::Detail::ElementReference<V, IteratorBase>;
Vc_ALWAYS_INLINE reference operator*() const { return {*v(), i()}; }
Vc_ALWAYS_INLINE reference operator[](difference_type i2) const { return {*v(), i2}; }
private:
Vc_INTRINSIC V *v() const { return static_cast<const Iterator<V> *>(this)->v; }
Vc_INTRINSIC difference_type i() const
{
return static_cast<const Iterator<V> *>(this)->i;
}
friend reference;
static Vc_INTRINSIC value_type get(const V &o, int i)
{
return o[i];
}
template <typename T> static Vc_INTRINSIC void set(V &o, int i, T &&v)
{
o[i] = std::forward<T>(v);
}
};
// class Iterator {{{
template <typename V> class Iterator : public IteratorBase<V, std::is_const<V>::value>
{
using Base = IteratorBase<V, std::is_const<V>::value>;
friend Base;
public:
using typename Base::iterator_category;
using typename Base::value_type;
using typename Base::difference_type;
using pointer = const Iterator *;
using typename Base::reference;
constexpr Iterator() = default;
constexpr Iterator(V &_v, difference_type _i) : v(&_v), i(_i) {}
// rely on implicit copy constructor/assignment
Vc_ALWAYS_INLINE pointer operator->() const { return this; }
using Base::operator*;
Vc_ALWAYS_INLINE Iterator &operator++() { ++i; return *this; }
Vc_ALWAYS_INLINE Iterator operator++(int) { Iterator tmp = *this; ++i; return tmp; }
// bidirectional iteration is supported
Vc_ALWAYS_INLINE Iterator &operator--() { --i; return *this; }
Vc_ALWAYS_INLINE Iterator operator--(int) { Iterator tmp = *this; --i; return tmp; }
// RandomAccessIterator:
using Base::operator[];
Vc_ALWAYS_INLINE Iterator &operator+=(difference_type d) { i += d; return *this; }
Vc_ALWAYS_INLINE Iterator &operator-=(difference_type d) { i -= d; return *this; }
Vc_ALWAYS_INLINE Iterator operator+(difference_type d) const { return {*v, i + d}; }
Vc_ALWAYS_INLINE Iterator operator-(difference_type d) const { return {*v, i - d}; }
Vc_ALWAYS_INLINE difference_type operator-(const Iterator &rhs) const { return i - rhs.i; }
friend Vc_ALWAYS_INLINE Iterator operator+(difference_type d, const Iterator &rhs)
{
return {*rhs.v, rhs.i + d};
}
// InputIterator would not need to test v == rhs.v, but except for `reference` this
// class implements a complete RandomAccessIterator
Vc_ALWAYS_INLINE bool operator==(const Iterator<V> &rhs) const { return v == rhs.v && i == rhs.i; }
Vc_ALWAYS_INLINE bool operator!=(const Iterator<V> &rhs) const { return v == rhs.v && i != rhs.i; }
Vc_ALWAYS_INLINE bool operator< (const Iterator<V> &rhs) const { return v == rhs.v && i < rhs.i; }
Vc_ALWAYS_INLINE bool operator<=(const Iterator<V> &rhs) const { return v == rhs.v && i <= rhs.i; }
Vc_ALWAYS_INLINE bool operator> (const Iterator<V> &rhs) const { return v == rhs.v && i > rhs.i; }
Vc_ALWAYS_INLINE bool operator>=(const Iterator<V> &rhs) const { return v == rhs.v && i >= rhs.i; }
private:
V *v = nullptr;
difference_type i = 0;
};/*}}}*/
template <typename V> using ConstIterator = Iterator<const V>;
class BitmaskIterator/*{{{*/
{
#ifdef Vc_MSVC
unsigned long mask;
unsigned long bit;
#else
size_t mask;
size_t bit;
#endif
void nextBit()
{
#ifdef Vc_GNU_ASM
bit = __builtin_ctzl(mask);
#elif defined(Vc_MSVC)
_BitScanForward(&bit, mask);
#else
#error "Not implemented yet. Please contact vc-devel@compeng.uni-frankfurt.de"
#endif
}
void resetLsb()
{
// 01100100 - 1 = 01100011
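        // ANDing the two clears exactly the lowest set bit: 01100100 & 01100011 = 01100000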
mask &= (mask - 1);
/*
#ifdef Vc_GNU_ASM
__asm__("btr %1,%0" : "+r"(mask) : "r"(bit));
#elif defined(_WIN64)
_bittestandreset64(&mask, bit);
#elif defined(_WIN32)
_bittestandreset(&mask, bit);
#else
#error "Not implemented yet. Please contact vc-devel@compeng.uni-frankfurt.de"
#endif
*/
}
public:
BitmaskIterator(decltype(mask) m) : mask(m) { nextBit(); }
BitmaskIterator(const BitmaskIterator &) = default;
BitmaskIterator(BitmaskIterator &&) = default;
Vc_ALWAYS_INLINE size_t operator->() const { return bit; }
Vc_ALWAYS_INLINE size_t operator*() const { return bit; }
Vc_ALWAYS_INLINE BitmaskIterator &operator++() { resetLsb(); nextBit(); return *this; }
Vc_ALWAYS_INLINE BitmaskIterator operator++(int) { BitmaskIterator tmp = *this; resetLsb(); nextBit(); return tmp; }
Vc_ALWAYS_INLINE bool operator==(const BitmaskIterator &rhs) const { return mask == rhs.mask; }
Vc_ALWAYS_INLINE bool operator!=(const BitmaskIterator &rhs) const { return mask != rhs.mask; }
};/*}}}*/
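// The begin/end/cbegin/cend overloads below are found via ADL (through the using
// declarations at the end of this file), so Vc vectors and masks can be used in
// range-based for loops. A minimal sketch (names are illustrative):
//   float_v v = float_v::IndexesFromZero();
//   for (float entry : v) { /* visits v[0], v[1], ... */ }
// Iterating a where-expression instead visits the indexes of the set mask bits.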
template <typename T>
Vc_ALWAYS_INLINE
enable_if<Traits::is_simd_vector<T>::value || Traits::is_simd_mask<T>::value,
Iterator<typename std::remove_reference<T>::type>>
begin(T &&x)
{
return {std::forward<T>(x), 0};
}
template <typename T>
Vc_ALWAYS_INLINE
enable_if<Traits::is_simd_vector<T>::value || Traits::is_simd_mask<T>::value,
Iterator<typename std::remove_reference<T>::type>>
end(T &&x)
{
using TT = typename std::decay<T>::type;
return {std::forward<T>(x), int(TT::size())};
}
template <typename T>
Vc_ALWAYS_INLINE enable_if<
Traits::is_simd_mask<T>::value || Traits::is_simd_vector<T>::value, ConstIterator<T>>
cbegin(const T &v)
{
return {v, 0};
}
template <typename T>
Vc_ALWAYS_INLINE enable_if<
Traits::is_simd_mask<T>::value || Traits::is_simd_vector<T>::value, ConstIterator<T>>
cend(const T &v)
{
return {v, int(T::size())};
}
template<typename M> Vc_ALWAYS_INLINE BitmaskIterator begin(const WhereImpl::WhereMask<M> &w)
{
return w.mask.toInt();
}
template<typename M> Vc_ALWAYS_INLINE BitmaskIterator end(const WhereImpl::WhereMask<M> &)
{
return 0;
}
template<typename V, typename Flags, typename T> Vc_ALWAYS_INLINE MemoryVectorIterator<V, Flags>
makeIterator(T *mem, Flags)
{
return new(mem) MemoryVector<V, Flags>;
}
template<typename V, typename Flags, typename T> Vc_ALWAYS_INLINE MemoryVectorIterator<const V, Flags>
makeIterator(const T *mem, Flags)
{
return new(const_cast<T *>(mem)) MemoryVector<const V, Flags>;
}
template<typename V, typename Flags, typename FlagsX> Vc_ALWAYS_INLINE MemoryVectorIterator<V, Flags>
makeIterator(MemoryVector<V, FlagsX> &mv, Flags)
{
return new(&mv) MemoryVector<V, Flags>;
}
template<typename V, typename Flags, typename FlagsX> Vc_ALWAYS_INLINE MemoryVectorIterator<const V, Flags>
makeIterator(MemoryVector<const V, FlagsX> &mv, Flags)
{
return new(&mv) MemoryVector<const V, Flags>;
}
} // namespace Common
using Common::begin;
using Common::end;
using Common::cbegin;
using Common::cend;
using Common::makeIterator;
} // namespace Vc
#endif // VC_COMMON_ITERATORS_H_
// vim: foldmethod=marker

105
Vc/common/loadinterface.h Normal file
View File

@ -0,0 +1,105 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
// load ctors{{{1
/**
* Construct a vector from loading its entries from the array at \p mem.
*
 * \param mem A pointer to data. The pointer does not have to be aligned on a
 * MemoryAlignment boundary unless you add the Vc::Aligned flag as a second
 * argument.
*/
explicit Vc_INTRINSIC Vector(const EntryType *mem)
{
load(mem);
}
/**
* Construct a vector from loading its entries from the array at \p mem.
*
* \param mem A pointer to data. If \p flags contains the Vc::Aligned flag, the pointer
* must be aligned on a MemoryAlignment boundary.
* \param flags A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming,
* Vc::Unaligned, and/or Vc::PrefetchDefault.
*/
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const EntryType *mem, Flags flags)
{
load(mem, flags);
}
template <typename U, typename Flags = DefaultLoadTag,
typename = enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
explicit Vc_INTRINSIC Vector(const U *x, Flags flags = Flags())
{
load<U, Flags>(x, flags);
}
// load member functions{{{1
/**
* Load the vector entries from \p mem, overwriting the previous values.
*
* \param mem
 * A pointer to data. The pointer does not have to be aligned on a MemoryAlignment boundary unless
* you add the Vc::Aligned flag as a second argument.
*/
Vc_INTRINSIC void load(const EntryType *mem)
{
load(mem, DefaultLoadTag());
}
/**
* Load the vector entries from \p mem, overwriting the previous values.
*
* \param mem
* A pointer to data. If \p flags contains the Vc::Aligned flag, the pointer must be
* aligned on a MemoryAlignment boundary.
* \param flags
* A (combination of) flag object(s), such as Vc::Aligned, Vc::Streaming, Vc::Unaligned,
* and/or Vc::PrefetchDefault.
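 *
 * A short sketch (the buffer is illustrative; it assumes the \c MemoryAlignment and
 * \c size() members provided by this vector type):
 * \code
 * alignas(float_v::MemoryAlignment) float buf[float_v::size()] = {};
 * float_v v;
 * v.load(buf, Vc::Aligned);  // aligned load, overwriting all entries of v
 * \endcode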
*/
template <typename Flags>
Vc_INTRINSIC enable_if<Traits::is_load_store_flag<Flags>::value, void>
load(const EntryType *mem, Flags flags)
{
load<EntryType, Flags>(mem, flags);
}
private:
template <typename U, typename Flags>
struct load_concept : public std::enable_if<
(!std::is_integral<U>::value || !std::is_integral<EntryType>::value ||
sizeof(EntryType) >= sizeof(U)) &&
std::is_arithmetic<U>::value && Traits::is_load_store_flag<Flags>::value, void>
{};
public:
template <typename U, typename Flags = DefaultLoadTag>
Vc_INTRINSIC_L typename load_concept<U, Flags>::type load(const U *mem, Flags = Flags()) Vc_INTRINSIC_R;
//}}}1
// vim: foldmethod=marker

243
Vc/common/loadstoreflags.h Normal file
View File

@ -0,0 +1,243 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_LOADSTOREFLAGS_H_
#define VC_COMMON_LOADSTOREFLAGS_H_
#include "../traits/type_traits.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* Hint for \ref Prefetch to select prefetches that mark the memory as exclusive.
*
* This hint may optimize the prefetch if the memory will subsequently be written to.
*/
struct Exclusive {};
/**
* Hint for \ref Prefetch to select prefetches that mark the memory as shared.
*/
struct Shared {};
namespace LoadStoreFlags
{
struct StreamingFlag {};
struct UnalignedFlag {};
struct PrefetchFlagBase {};
// TODO: determine a good default for typical CPU use
template <size_t L1 = 16 * 64, size_t L2 = 128 * 64, typename ExclusiveOrShared_ = void>
struct PrefetchFlag : public PrefetchFlagBase {
typedef ExclusiveOrShared_ ExclusiveOrShared;
static constexpr size_t L1Stride = L1;
static constexpr size_t L2Stride = L2;
static constexpr bool IsExclusive = std::is_same<ExclusiveOrShared, Exclusive>::value;
static constexpr bool IsShared = std::is_same<ExclusiveOrShared, Shared>::value;
};
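// ExtractType walks the flag parameter pack and yields the first type derived from Base;
// if no flag matches it falls back to Default.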
template<typename Base, typename Default, typename... LoadStoreFlags> struct ExtractType
{
typedef Default type;
};
template<typename Base, typename Default, typename T, typename... LoadStoreFlags> struct ExtractType<Base, Default, T, LoadStoreFlags...>
{
typedef typename std::conditional<std::is_base_of<Base, T>::value, T, typename ExtractType<Base, Default, LoadStoreFlags...>::type>::type type;
};
// ICC warns about the constexpr members in LoadStoreFlags: member "LoadStoreFlags<Flags...>::IsAligned" was declared but never referenced
// who needs that warning, especially if it was referenced...
// The warning cannot be reenabled because it gets emitted whenever the LoadStoreFlags is instantiated
// somewhere, so it could be anywhere.
#ifdef Vc_ICC
#pragma warning(disable: 177)
#endif
/**\internal
* Implementation of the load/store flags mechanism. This is internal API. Only some
* concrete aliases are API-relevant types.
*/
template<typename... Flags> struct LoadStoreFlags
{
private:
// ICC doesn't grok this line:
//template<typename Test> using TestFlag = std::is_same<typename ExtractType<StreamingFlag, void, Flags...>::type, void>;
typedef typename ExtractType<PrefetchFlagBase, PrefetchFlag<0, 0>, Flags...>::type Prefetch;
public:
constexpr LoadStoreFlags() {}
static constexpr bool IsStreaming = !std::is_same<typename ExtractType<StreamingFlag, void, Flags...>::type, void>::value;
static constexpr bool IsUnaligned = !std::is_same<typename ExtractType<UnalignedFlag, void, Flags...>::type, void>::value;
static constexpr bool IsAligned = !IsUnaligned;
static constexpr bool IsPrefetch = !std::is_same<typename ExtractType<PrefetchFlagBase, void, Flags...>::type, void>::value;
static constexpr bool IsExclusivePrefetch = Prefetch::IsExclusive;
static constexpr bool IsSharedPrefetch = Prefetch::IsShared;
static constexpr size_t L1Stride = Prefetch::L1Stride;
static constexpr size_t L2Stride = Prefetch::L2Stride;
typedef LoadStoreFlags<typename std::conditional<std::is_same<Flags, UnalignedFlag>::value, void, Flags>::type...> UnalignedRemoved;
// The following EnableIf* convenience types cannot use enable_if, because then no
// LoadStoreFlags type could ever be instantiated. Instead, these types are defined as
// either void* or void. A function that wants SFINAE then declares a defaulted
// parameter "= nullptr" of this type; the ones that are plain void thus result in a
// substitution failure (see the sketch after this class).
typedef typename std::conditional<IsAligned && !IsStreaming, void *, void>::type EnableIfAligned;
typedef typename std::conditional<IsAligned && IsStreaming, void *, void>::type EnableIfStreaming;
typedef typename std::conditional<IsUnaligned && !IsStreaming, void *, void>::type EnableIfUnalignedNotStreaming;
typedef typename std::conditional<IsUnaligned && IsStreaming, void *, void>::type EnableIfUnalignedAndStreaming;
typedef typename std::conditional<IsUnaligned , void *, void>::type EnableIfUnaligned;
typedef typename std::conditional<!IsUnaligned , void *, void>::type EnableIfNotUnaligned;
typedef typename std::conditional<IsPrefetch , void *, void>::type EnableIfPrefetch;
typedef typename std::conditional<!IsPrefetch , void *, void>::type EnableIfNotPrefetch;
};
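// Illustrative sketch of how the EnableIf* members above are meant to be used. The two
// overloads below are hypothetical, not declarations from this header: a defaulted
// pointer parameter of the respective member type selects the overload, because
// "= nullptr" is only well-formed when the member is void* (it is a substitution
// failure when the member is plain void).
//
//   template <typename Flags>
//   void load(const float *mem, Flags, typename Flags::EnableIfAligned = nullptr);
//   template <typename Flags>
//   void load(const float *mem, Flags, typename Flags::EnableIfUnaligned = nullptr);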
/**\internal
 * Specialization for no flags (i.e. aligned, non-streaming, no prefetching).
*/
template<> struct LoadStoreFlags<>
{
constexpr LoadStoreFlags() {}
static constexpr bool IsStreaming = false;
static constexpr bool IsUnaligned = false;
static constexpr bool IsAligned = !IsUnaligned;
static constexpr bool IsPrefetch = false;
static constexpr bool IsExclusivePrefetch = false;
static constexpr bool IsSharedPrefetch = false;
static constexpr size_t L1Stride = 0;
static constexpr size_t L2Stride = 0;
typedef void* EnableIfAligned;
typedef void* EnableIfNotUnaligned;
typedef void* EnableIfNotPrefetch;
};
/**
* Operator for concatenation of LoadStoreFlags.
*
* Example:
* \code
* float_v x(mem, Vc::Aligned | Vc::Streaming);
* \endcode
*/
template<typename... LFlags, typename... RFlags>
constexpr LoadStoreFlags<LFlags..., RFlags...> operator|(LoadStoreFlags<LFlags...>, LoadStoreFlags<RFlags...>)
{
return LoadStoreFlags<LFlags..., RFlags...>();
}
} // LoadStoreFlags namespace
using LoadStoreFlags::PrefetchFlag;
typedef LoadStoreFlags::LoadStoreFlags<> AlignedTag;
typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::StreamingFlag> StreamingTag;
typedef LoadStoreFlags::LoadStoreFlags<LoadStoreFlags::UnalignedFlag> UnalignedTag;
/// The default load tag type uses unaligned (non-streaming) loads.
typedef UnalignedTag DefaultLoadTag;
/// The default store tag type uses unaligned (non-streaming) stores.
typedef UnalignedTag DefaultStoreTag;
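// Illustrative note (assumption, not stated in this header): loads and stores that omit
// the flags argument typically behave as if DefaultLoadTag / DefaultStoreTag had been
// passed, e.g.
//   float_v a(mem);            // hypothetical call, same as float_v a(mem, Vc::Unaligned)
//   a.store(mem, Vc::Aligned); // opt in to the aligned variant explicitly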
/**\addtogroup Utilities
* @{
*/
/**
* Use this object for a \p flags parameter to request aligned loads and stores.
*
 * It specifies that a load/store can expect a memory address that is aligned on
 * the correct boundary (i.e. \p MemoryAlignment).
 *
 * \warning
 * If you specify Aligned, but the memory address is not aligned, the program
 * will most likely crash.
*/
constexpr AlignedTag Aligned;
/**
* Use this object for a \p flags parameter to request unaligned loads and stores.
*
 * It specifies that a load/store can \em not expect a memory address that is
 * aligned on the correct boundary (i.e. the alignment may be less than
 * \p MemoryAlignment).
 *
 * \note
 * If you specify Unaligned, but the memory address is aligned, the load/store
 * will execute slightly slower than necessary.
*/
constexpr UnalignedTag Unaligned;
/**
* Use this object for a \p flags parameter to request streaming loads and stores.
*
* It specifies that the cache should be bypassed for the given load/store.
* Whether this will actually be done depends on the target system's capabilities.
*
* Streaming stores can be interesting when the code calculates values that, after being
* written to memory, will not be used for a long time or used by a different thread.
*
* \note
* Expect that most target systems do not support unaligned streaming loads or stores.
* Therefore, make sure that you also specify Aligned.
*/
constexpr StreamingTag Streaming;
/**
* Use this object for a \p flags parameter to request default software prefetches to be
* emitted.
*/
constexpr LoadStoreFlags::LoadStoreFlags<PrefetchFlag<>> PrefetchDefault;
///@}
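// Minimal usage sketch (assumed API shape, mirroring the operator| example above):
//   float_v x(mem, Vc::Aligned | Vc::PrefetchDefault);
// requests an aligned load plus software prefetches with the default L1/L2 strides
// from PrefetchFlag<>.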
/**
 * Load/store flag type that requests software prefetches with user-defined strides.
 *
 * \tparam L1 The prefetch stride for the L1 cache (defaults to PrefetchFlag<>::L1Stride).
 * \tparam L2 The prefetch stride for the L2 cache (defaults to PrefetchFlag<>::L2Stride).
 * \tparam ExclusiveOrShared Either Vc::Exclusive, Vc::Shared, or void for no hint
 * (defaults to PrefetchFlag<>::ExclusiveOrShared).
 */
template <size_t L1 = PrefetchFlag<>::L1Stride,
size_t L2 = PrefetchFlag<>::L2Stride,
typename ExclusiveOrShared = PrefetchFlag<>::ExclusiveOrShared>
struct Prefetch : public LoadStoreFlags::LoadStoreFlags<PrefetchFlag<L1, L2, ExclusiveOrShared>>
{
};
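// Illustrative sketch: Prefetch combines with the other tag objects just like
// PrefetchDefault; the stride values below are chosen for illustration only and are
// assumed to be given in Bytes:
//   float_v x(mem, Vc::Aligned | Vc::Prefetch<256, 1024>());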
namespace Traits
{
///\internal partial specialization for detecting LoadStoreFlags types
template <typename... Ts>
struct is_loadstoreflag_internal<LoadStoreFlags::LoadStoreFlags<Ts...>> : public std::true_type
{
};
///\internal partial specialization for detecting the derived Prefetch type as a
/// load/store flag.
template <size_t L1, size_t L2, typename ExclusiveOrShared>
struct is_loadstoreflag_internal<Prefetch<L1, L2, ExclusiveOrShared>> : public std::true_type
{
};
} // namespace Traits
} // namespace Vc
#endif // VC_COMMON_LOADSTOREFLAGS_H_

276
Vc/common/logarithm.h Normal file
View File

@ -0,0 +1,276 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
/* The log implementations are based on code from Julien Pommier which carries the following
copyright information:
*/
/*
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2007 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#ifdef Vc_COMMON_MATH_H_INTERNAL
enum LogarithmBase {
BaseE, Base10, Base2
};
namespace Detail
{
template <typename T, typename Abi>
using Const = typename std::conditional<std::is_same<Abi, VectorAbi::Avx>::value,
AVX::Const<T>, SSE::Const<T>>::type;
template<LogarithmBase Base>
struct LogImpl
{
template<typename T, typename Abi> static Vc_ALWAYS_INLINE void log_series(Vector<T, Abi> &Vc_RESTRICT x, typename Vector<T, Abi>::AsArg exponent) {
typedef Vector<T, Abi> V;
typedef Detail::Const<T, Abi> C;
// Taylor series around x = 2^exponent
// f(x) = ln(x) → exponent * ln(2) → C::ln2_small + C::ln2_large
// f'(x) = x⁻¹ → x → 1
// f''(x) = - x⁻² → -x² / 2 → C::_1_2()
// = 2!x⁻³ → x³ / 3 → C::P(8)
// = -3!x⁻⁴ → -x⁴ / 4 → C::P(7)
// = 4!x⁻⁵ → x⁵ / 5 → C::P(6)
// ...
// The high order coefficients are adjusted to reduce the error that occurs from omission
// of higher order terms.
// P(0) is the smallest term and |x| < 1 ⇒ |xⁿ| > |xⁿ⁺¹|
// The order of additions must go from smallest to largest terms
const V x2 = x * x; // 0 → 4
#ifdef Vc_LOG_ILP
V y2 = (C::P(6) * /*4 → 8*/ x2 + /* 8 → 11*/ C::P(7) * /*1 → 5*/ x) + /*11 → 14*/ C::P(8);
V y0 = (C::P(0) * /*5 → 9*/ x2 + /* 9 → 12*/ C::P(1) * /*2 → 6*/ x) + /*12 → 15*/ C::P(2);
V y1 = (C::P(3) * /*6 → 10*/ x2 + /*10 → 13*/ C::P(4) * /*3 → 7*/ x) + /*13 → 16*/ C::P(5);
const V x3 = x2 * x; // 7 → 11
const V x6 = x3 * x3; // 11 → 15
const V x9 = x6 * x3; // 15 → 19
V y = (y0 * /*19 → 23*/ x9 + /*23 → 26*/ y1 * /*16 → 20*/ x6) + /*26 → 29*/ y2 * /*14 → 18*/ x3;
#elif defined Vc_LOG_ILP2
/*
* name start done
* movaps %xmm0, %xmm1 ; x 0 1
* movaps %xmm0, %xmm2 ; x 0 1
* mulps %xmm1, %xmm1 ; x2 1 5 *xmm1
* movaps <P8>, %xmm15 ; y8 1 2
* mulps %xmm1, %xmm2 ; x3 5 9 *xmm2
* movaps %xmm1, %xmm3 ; x2 5 6
* movaps %xmm1, %xmm4 ; x2 5 6
* mulps %xmm3, %xmm3 ; x4 6 10 *xmm3
* movaps %xmm2, %xmm5 ; x3 9 10
* movaps %xmm2, %xmm6 ; x3 9 10
* mulps %xmm2, %xmm4 ; x5 9 13 *xmm4
* movaps %xmm3, %xmm7 ; x4 10 11
* movaps %xmm3, %xmm8 ; x4 10 11
* movaps %xmm3, %xmm9 ; x4 10 11
* mulps %xmm5, %xmm5 ; x6 10 14 *xmm5
* mulps %xmm3, %xmm6 ; x7 11 15 *xmm6
* mulps %xmm7, %xmm7 ; x8 12 16 *xmm7
* movaps %xmm4, %xmm10 ; x5 13 14
* mulps %xmm4, %xmm8 ; x9 13 17 *xmm8
* mulps %xmm5, %xmm10 ; x11 14 18 *xmm10
* mulps %xmm5, %xmm9 ; x10 15 19 *xmm9
* mulps <P0>, %xmm10 ; y0 18 22
* mulps <P1>, %xmm9 ; y1 19 23
* mulps <P2>, %xmm8 ; y2 20 24
* mulps <P3>, %xmm7 ; y3 21 25
* addps %xmm10, %xmm9 ; y 23 26
* addps %xmm9, %xmm8 ; y 26 29
* addps %xmm8, %xmm7 ; y 29 32
*/
const V x3 = x2 * x; // 4 → 8
const V x4 = x2 * x2; // 5 → 9
const V x5 = x2 * x3; // 8 → 12
const V x6 = x3 * x3; // 9 → 13
const V x7 = x4 * x3; //
const V x8 = x4 * x4;
const V x9 = x5 * x4;
const V x10 = x5 * x5;
const V x11 = x5 * x6; // 13 → 17
V y = C::P(0) * x11 + C::P(1) * x10 + C::P(2) * x9 + C::P(3) * x8 + C::P(4) * x7
+ C::P(5) * x6 + C::P(6) * x5 + C::P(7) * x4 + C::P(8) * x3;
#else
V y = C::P(0);
Vc::Common::unrolled_loop<int, 1, 9>([&](int i) { y = y * x + C::P(i); });
y *= x * x2;
#endif
switch (Base) {
case BaseE:
// ln(2) is split into two parts to increase precision (i.e. ln2_small + ln2_large = ln(2))
y += exponent * C::ln2_small();
y -= x2 * C::_1_2(); // [0, 0.25[
x += y;
x += exponent * C::ln2_large();
break;
case Base10:
y += exponent * C::ln2_small();
y -= x2 * C::_1_2(); // [0, 0.25[
x += y;
x += exponent * C::ln2_large();
x *= C::log10_e();
break;
case Base2:
{
const V x_ = x;
x *= C::log2_e();
y *= C::log2_e();
y -= x_ * x * C::_1_2(); // [0, 0.25[
x += y;
x += exponent;
break;
}
}
}
template <typename Abi>
static Vc_ALWAYS_INLINE void log_series(Vector<double, Abi> &Vc_RESTRICT x,
typename Vector<double, Abi>::AsArg exponent)
{
typedef Vector<double, Abi> V;
typedef Detail::Const<double, Abi> C;
const V x2 = x * x;
V y = C::P(0);
V y2 = C::Q(0) + x;
Vc::Common::unrolled_loop<int, 1, 5>([&](int i) {
y = y * x + C::P(i);
y2 = y2 * x + C::Q(i);
});
y2 = x / y2;
y = y * x + C::P(5);
y = x2 * y * y2;
// TODO: refactor the following with the float implementation:
switch (Base) {
case BaseE:
// ln(2) is split into two parts to increase precision (i.e. ln2_small + ln2_large = ln(2))
y += exponent * C::ln2_small();
y -= x2 * C::_1_2(); // [0, 0.25[
x += y;
x += exponent * C::ln2_large();
break;
case Base10:
y += exponent * C::ln2_small();
y -= x2 * C::_1_2(); // [0, 0.25[
x += y;
x += exponent * C::ln2_large();
x *= C::log10_e();
break;
case Base2:
{
const V x_ = x;
x *= C::log2_e();
y *= C::log2_e();
y -= x_ * x * C::_1_2(); // [0, 0.25[
x += y;
x += exponent;
break;
}
}
}
template <typename T, typename Abi, typename V = Vector<T, Abi>>
static inline Vector<T, Abi> calc(V _x)
{
typedef typename V::Mask M;
typedef Detail::Const<T, Abi> C;
V x(_x);
const M invalidMask = x < V::Zero();
const M infinityMask = x == V::Zero();
const M denormal = x <= C::min();
x(denormal) *= V(Vc::Detail::doubleConstant<1, 0, 54>()); // 2⁵⁴
V exponent = Detail::exponent(x.data()); // = ⎣log₂(x)⎦
exponent(denormal) -= 54;
x.setZero(C::exponentMask()); // keep only the fractional part ⇒ x ∈ [1, 2[
x = Detail::operator|(x,
C::_1_2()); // and set the exponent to 2⁻¹ ⇒ x ∈ [½, 1[
// split calculation in two cases:
// A: x ∈ [½, √½[
// B: x ∈ [√½, 1[
// √½ defines the point where Δe(x) := log₂(x) - ⎣log₂(x)⎦ = ½, i.e.
// log₂(√½) - ⎣log₂(√½)⎦ = ½ * -1 - ⎣½ * -1⎦ = -½ + 1 = ½
const M smallX = x < C::_1_sqrt2();
x(smallX) += x; // => x ∈ [√½, 1[ [1.5, 1 + √½[
x -= V::One(); // => x ∈ [√½ - 1, 0[ [0.5, √½[
exponent(!smallX) += V::One();
log_series(x, exponent); // A: (ˣ⁄₂ᵉ - 1, e) B: (ˣ⁄₂ᵉ⁺¹ - 1, e + 1)
x.setQnan(invalidMask); // x < 0 → NaN
x(infinityMask) = C::neginf(); // x = 0 → -∞
return x;
}
};
} // namespace Detail
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log(
const Vector<T, Abi> &x)
{
return Detail::LogImpl<BaseE>::calc<T, Abi>(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log10(
const Vector<T, Abi> &x)
{
return Detail::LogImpl<Base10>::calc<T, Abi>(x);
}
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST Vector<T, detail::not_fixed_size_abi<Abi>> log2(
const Vector<T, Abi> &x)
{
return Detail::LogImpl<Base2>::calc<T, Abi>(x);
}
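// Usage sketch (illustrative, not part of this header): the three overloads apply the
// logarithm component-wise, e.g.
//   Vc::float_v y = Vc::log(x);    // natural logarithm of every component of x
//   Vc::float_v z = Vc::log10(x);  // base-10 logarithm, component-wise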
#endif // Vc_COMMON_MATH_H_INTERNAL

318
Vc/common/macros.h Normal file
View File

@ -0,0 +1,318 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MACROS_H_
#define VC_COMMON_MACROS_H_
#include "../global.h"
#ifdef Vc_MSVC
#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \
typedef __declspec(align(n_)) type_ new_type_
#elif __GNUC__
#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \
typedef type_ new_type_[[gnu::aligned(n_)]]
#else // the following is actually ill-formed according to C++1[14]
#define Vc_ALIGNED_TYPEDEF(n_, type_, new_type_) \
using new_type_ alignas(sizeof(n_)) = type_
#endif
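// Usage sketch (hypothetical type name): every branch above is meant to make
//   Vc_ALIGNED_TYPEDEF(16, float, aligned_float);
// declare a type aligned_float that is a float with a 16-Byte alignment requirement.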
// On Windows (WIN32) we might see macros called min and max. Just undefine them and hope
// no one (re)defines them (NOMINMAX should help).
#ifdef WIN32
#define NOMINMAX 1
#if defined min
#undef min
#endif
#if defined max
#undef max
#endif
#endif // WIN32
#if defined Vc_GCC && Vc_GCC >= 0x60000
// GCC 6 drops all attributes on types passed as template arguments. This is important
// if a may_alias attribute gets lost and therefore needs to be re-added in the
// implementation of the class template.
#define Vc_TEMPLATES_DROP_ATTRIBUTES 1
#endif
#if defined Vc_CLANG || defined Vc_APPLECLANG
# define Vc_UNREACHABLE __builtin_unreachable
# define Vc_NEVER_INLINE [[gnu::noinline]]
# define Vc_INTRINSIC_L inline
# define Vc_INTRINSIC_R __attribute__((always_inline))
# define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
# define Vc_FLATTEN
# define Vc_CONST __attribute__((const))
# define Vc_CONST_L
# define Vc_CONST_R Vc_CONST
# define Vc_PURE __attribute__((pure))
# define Vc_PURE_L
# define Vc_PURE_R Vc_PURE
# define Vc_MAY_ALIAS __attribute__((may_alias))
# define Vc_ALWAYS_INLINE_L inline
# define Vc_ALWAYS_INLINE_R __attribute__((always_inline))
# define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
# define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
# define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
# define Vc_RESTRICT __restrict__
# define Vc_DEPRECATED(msg)
# define Vc_DEPRECATED_ALIAS(msg)
# define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#elif defined(__GNUC__)
# define Vc_UNREACHABLE __builtin_unreachable
# if defined Vc_GCC && !defined __OPTIMIZE__
# define Vc_MAY_ALIAS
# else
# define Vc_MAY_ALIAS __attribute__((__may_alias__))
# endif
# define Vc_INTRINSIC_R __attribute__((__always_inline__, __artificial__))
# define Vc_INTRINSIC_L inline
# define Vc_INTRINSIC Vc_INTRINSIC_L Vc_INTRINSIC_R
# define Vc_FLATTEN __attribute__((__flatten__))
# define Vc_ALWAYS_INLINE_L inline
# define Vc_ALWAYS_INLINE_R __attribute__((__always_inline__))
# define Vc_ALWAYS_INLINE Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE_R
# ifdef Vc_ICC
// ICC miscompiles if there are functions marked as pure or const
# define Vc_PURE
# define Vc_CONST
# define Vc_NEVER_INLINE
# else
# define Vc_NEVER_INLINE [[gnu::noinline]]
# define Vc_PURE __attribute__((__pure__))
# define Vc_CONST __attribute__((__const__))
# endif
# define Vc_CONST_L
# define Vc_CONST_R Vc_CONST
# define Vc_PURE_L
# define Vc_PURE_R Vc_PURE
# define Vc_IS_UNLIKELY(x) __builtin_expect(x, 0)
# define Vc_IS_LIKELY(x) __builtin_expect(x, 1)
# define Vc_RESTRICT __restrict__
# ifdef Vc_ICC
# define Vc_DEPRECATED(msg)
# define Vc_DEPRECATED_ALIAS(msg)
# else
# define Vc_DEPRECATED(msg) __attribute__((__deprecated__(msg)))
# define Vc_DEPRECATED_ALIAS(msg) __attribute__((__deprecated__(msg)))
# endif
# define Vc_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
#else
# define Vc_NEVER_INLINE
# define Vc_FLATTEN
# ifdef Vc_PURE
# undef Vc_PURE
# endif
# define Vc_MAY_ALIAS
# ifdef Vc_MSVC
# define Vc_ALWAYS_INLINE inline __forceinline
# define Vc_ALWAYS_INLINE_L Vc_ALWAYS_INLINE
# define Vc_ALWAYS_INLINE_R
# define Vc_CONST __declspec(noalias)
# define Vc_CONST_L Vc_CONST
# define Vc_CONST_R
# define Vc_PURE /*Vc_CONST*/
# define Vc_PURE_L Vc_PURE
# define Vc_PURE_R
# define Vc_INTRINSIC inline __forceinline
# define Vc_INTRINSIC_L Vc_INTRINSIC
# define Vc_INTRINSIC_R
namespace Vc_VERSIONED_NAMESPACE {
namespace detail
{
static Vc_INTRINSIC void unreachable() { __assume(0); }
} // namespace detail
}
# define Vc_UNREACHABLE Vc::detail::unreachable
# else
# define Vc_ALWAYS_INLINE
# define Vc_ALWAYS_INLINE_L
# define Vc_ALWAYS_INLINE_R
# define Vc_CONST
# define Vc_CONST_L
# define Vc_CONST_R
# define Vc_PURE
# define Vc_PURE_L
# define Vc_PURE_R
# define Vc_INTRINSIC
# define Vc_INTRINSIC_L
# define Vc_INTRINSIC_R
# define Vc_UNREACHABLE std::abort
# endif
# define Vc_IS_UNLIKELY(x) x
# define Vc_IS_LIKELY(x) x
# define Vc_RESTRICT __restrict
# define Vc_DEPRECATED(msg) __declspec(deprecated(msg))
# define Vc_DEPRECATED_ALIAS(msg)
# define Vc_WARN_UNUSED_RESULT
#endif
#ifdef Vc_CXX14
#undef Vc_DEPRECATED
#define Vc_DEPRECATED(msg_) [[deprecated(msg_)]]
#endif
#define Vc_NOTHING_EXPECTING_SEMICOLON static_assert(true, "")
#define Vc_FREE_STORE_OPERATORS_ALIGNED(align_) \
/**\name new/delete overloads for correct alignment */ \
/**@{*/ \
/*!\brief Allocates correctly aligned memory */ \
Vc_ALWAYS_INLINE void *operator new(size_t size) \
{ \
return Vc::Common::aligned_malloc<align_>(size); \
} \
/*!\brief Returns \p p. */ \
Vc_ALWAYS_INLINE void *operator new(size_t, void *p) { return p; } \
/*!\brief Allocates correctly aligned memory */ \
Vc_ALWAYS_INLINE void *operator new[](size_t size) \
{ \
return Vc::Common::aligned_malloc<align_>(size); \
} \
/*!\brief Returns \p p. */ \
Vc_ALWAYS_INLINE void *operator new[](size_t, void *p) { return p; } \
/*!\brief Frees aligned memory. */ \
Vc_ALWAYS_INLINE void operator delete(void *ptr, size_t) { Vc::Common::free(ptr); } \
/*!\brief Does nothing. */ \
Vc_ALWAYS_INLINE void operator delete(void *, void *) {} \
/*!\brief Frees aligned memory. */ \
Vc_ALWAYS_INLINE void operator delete[](void *ptr, size_t) \
{ \
Vc::Common::free(ptr); \
} \
/*!\brief Does nothing. */ \
Vc_ALWAYS_INLINE void operator delete[](void *, void *) {} \
/**@}*/ \
Vc_NOTHING_EXPECTING_SEMICOLON
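// Illustrative use (hypothetical class): placing the macro inside a class body gives the
// class new/delete overloads that allocate via Vc::Common::aligned_malloc and release via
// Vc::Common::free:
//   class alignas(32) MyData {
//       // ... members that require 32-Byte alignment ...
//       Vc_FREE_STORE_OPERATORS_ALIGNED(32);
//   };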
#ifdef Vc_ASSERT
#define Vc_EXTERNAL_ASSERT 1
#else
#ifdef NDEBUG
#define Vc_ASSERT(x)
#else
#include <assert.h>
#define Vc_ASSERT(x) assert(x);
#endif
#endif
#if defined Vc_CLANG || defined Vc_APPLECLANG
#define Vc_HAS_BUILTIN(x) __has_builtin(x)
#else
#define Vc_HAS_BUILTIN(x) 0
#endif
#define Vc_CAT_HELPER_(a, b, c, d) a##b##c##d
#define Vc_CAT(a, b, c, d) Vc_CAT_HELPER_(a, b, c, d)
#define Vc_CAT_IMPL(a, b) a##b
#define Vc_CAT2(a, b) Vc_CAT_IMPL(a, b)
#define Vc_APPLY_IMPL_1_(macro, a, b, c, d, e) macro(a)
#define Vc_APPLY_IMPL_2_(macro, a, b, c, d, e) macro(a, b)
#define Vc_APPLY_IMPL_3_(macro, a, b, c, d, e) macro(a, b, c)
#define Vc_APPLY_IMPL_4_(macro, a, b, c, d, e) macro(a, b, c, d)
#define Vc_APPLY_IMPL_5_(macro, a, b, c, d, e) macro(a, b, c, d, e)
#define Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \
size(macro, double_v, a, b, c, d) \
size(macro, float_v, a, b, c, d)
#define Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d) \
size(macro, int_v, a, b, c, d) \
size(macro, uint_v, a, b, c, d) \
size(macro, short_v, a, b, c, d) \
size(macro, ushort_v, a, b, c, d)
#define Vc_LIST_VECTOR_TYPES(size, macro, a, b, c, d) \
Vc_LIST_FLOAT_VECTOR_TYPES(size, macro, a, b, c, d) \
Vc_LIST_INT_VECTOR_TYPES(size, macro, a, b, c, d)
#define Vc_LIST_COMPARES(size, macro, a, b, c, d) \
size(macro, ==, a, b, c, d) \
size(macro, !=, a, b, c, d) \
size(macro, <=, a, b, c, d) \
size(macro, >=, a, b, c, d) \
size(macro, < , a, b, c, d) \
size(macro, > , a, b, c, d)
#define Vc_LIST_LOGICAL(size, macro, a, b, c, d) \
size(macro, &&, a, b, c, d) \
size(macro, ||, a, b, c, d)
#define Vc_LIST_BINARY(size, macro, a, b, c, d) \
size(macro, |, a, b, c, d) \
size(macro, &, a, b, c, d) \
size(macro, ^, a, b, c, d)
#define Vc_LIST_SHIFTS(size, macro, a, b, c, d) \
size(macro, <<, a, b, c, d) \
size(macro, >>, a, b, c, d)
#define Vc_LIST_ARITHMETICS(size, macro, a, b, c, d) \
size(macro, +, a, b, c, d) \
size(macro, -, a, b, c, d) \
size(macro, *, a, b, c, d) \
size(macro, /, a, b, c, d) \
size(macro, %, a, b, c, d)
#define Vc_APPLY_0(_list, macro) _list(Vc_APPLY_IMPL_1_, macro, 0, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_1(_list, macro, a) _list(Vc_APPLY_IMPL_2_, macro, a, 0, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_2(_list, macro, a, b) _list(Vc_APPLY_IMPL_3_, macro, a, b, 0, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_3(_list, macro, a, b, c) _list(Vc_APPLY_IMPL_4_, macro, a, b, c, 0) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_APPLY_4(_list, macro, a, b, c, d) _list(Vc_APPLY_IMPL_5_, macro, a, b, c, d) Vc_NOTHING_EXPECTING_SEMICOLON
#define Vc_ALL_COMPARES(macro) Vc_APPLY_0(Vc_LIST_COMPARES, macro)
#define Vc_ALL_LOGICAL(macro) Vc_APPLY_0(Vc_LIST_LOGICAL, macro)
#define Vc_ALL_BINARY(macro) Vc_APPLY_0(Vc_LIST_BINARY, macro)
#define Vc_ALL_SHIFTS(macro) Vc_APPLY_0(Vc_LIST_SHIFTS, macro)
#define Vc_ALL_ARITHMETICS(macro) Vc_APPLY_0(Vc_LIST_ARITHMETICS, macro)
#define Vc_ALL_FLOAT_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_FLOAT_VECTOR_TYPES, macro)
#define Vc_ALL_VECTOR_TYPES(macro) Vc_APPLY_0(Vc_LIST_VECTOR_TYPES, macro)
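// Expansion sketch (illustrative): the Vc_ALL_* helpers invoke the given macro once per
// list entry. With a hypothetical macro Vc_OP,
//   Vc_ALL_ARITHMETICS(Vc_OP);
// expands (apart from the trailing static_assert) to
//   Vc_OP(+) Vc_OP(-) Vc_OP(*) Vc_OP(/) Vc_OP(%)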
#define Vc_EXACT_TYPE(_test, _reference, _type) \
typename std::enable_if<std::is_same<_test, _reference>::value, _type>::type
#define Vc_make_unique(name) Vc_CAT(Vc_,name,_,__LINE__)
#if defined(Vc_NO_NOEXCEPT)
#define Vc_NOEXCEPT throw()
#else
#define Vc_NOEXCEPT noexcept
#endif
#ifdef Vc_NO_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE
#undef Vc_ALWAYS_INLINE_L
#undef Vc_ALWAYS_INLINE_R
#define Vc_ALWAYS_INLINE inline
#define Vc_ALWAYS_INLINE_L inline
#define Vc_ALWAYS_INLINE_R
#undef Vc_INTRINSIC
#undef Vc_INTRINSIC_L
#undef Vc_INTRINSIC_R
#define Vc_INTRINSIC inline
#define Vc_INTRINSIC_L inline
#define Vc_INTRINSIC_R
#endif
#endif // VC_COMMON_MACROS_H_

150
Vc/common/makeContainer.h Normal file
View File

@ -0,0 +1,150 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MAKECONTAINER_H_
#define VC_COMMON_MAKECONTAINER_H_
#include "../vector.h"
#include <initializer_list>
namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
template<typename Container, typename T> struct make_container_helper
{
static constexpr Container help(std::initializer_list<T> list) { return { list }; }
};
template <typename T_, typename Abi, typename Alloc,
template <class, class> class Container>
struct make_container_helper<Container<Vector<T_, Abi>, Alloc>,
typename Vector<T_, Abi>::EntryType> {
typedef Vector<T_, Abi> V;
typedef typename V::EntryType T;
typedef Container<V, Alloc> C;
static inline C help(std::initializer_list<T> list) {
const std::size_t size = (list.size() + (V::Size - 1)) / V::Size;
C v(size);
auto containerIt = v.begin();
auto init = std::begin(list);
const auto initEnd = std::end(list);
for (std::size_t i = 0; i < size - 1; ++i) {
*containerIt++ = V(init, Vc::Unaligned);
init += V::Size;
}
Vc_ASSERT(all_of(*containerIt == V::Zero()));
int j = 0;
while (init != initEnd) {
(*containerIt)[j++] = *init++;
}
return v;
}
};
template <typename T_, typename Abi, std::size_t N,
template <class, std::size_t> class Container>
struct make_container_helper<Container<Vector<T_, Abi>, N>,
typename Vector<T_, Abi>::EntryType> {
typedef Vector<T_, Abi> V;
typedef typename V::EntryType T;
static constexpr std::size_t size = (N + (V::Size - 1)) / V::Size;
typedef Container<
V,
#if defined Vc_CLANG && Vc_CLANG < 0x30700 // TODO: when did Vc_APPLECLANG fix it?
// clang before 3.7.0 has a bug when returning std::array<__m256x, 1>. So
// increase it to std::array<__m256x, 2> and fill it with zeros. Better
// than returning garbage.
(size == 1 && std::is_same<Abi, VectorAbi::Avx>::value) ? 2 :
#endif
size> C;
static inline C help(std::initializer_list<T> list) {
Vc_ASSERT(N == list.size())
Vc_ASSERT(size == (list.size() + (V::Size - 1)) / V::Size)
C v;
auto containerIt = v.begin();
auto init = std::begin(list);
const auto initEnd = std::end(list);
for (std::size_t i = 0; i < size - 1; ++i) {
*containerIt++ = V(init, Vc::Unaligned);
init += V::Size;
}
Vc_ASSERT(all_of(*containerIt == V::Zero()));
int j = 0;
while (init != initEnd) {
(*containerIt)[j++] = *init++;
}
return v;
}
};
} // anonymous namespace
/**
* \ingroup Containers
* \headerfile makeContainer.h <Vc/Utils>
*
* Construct a container of Vc vectors from a std::initializer_list of scalar entries.
*
* \tparam Container The container type to construct.
* \tparam T The scalar type to use for the initializer_list.
*
* \param list An initializer list of arbitrary size. The type of the entries is important!
* If you pass a list of integers you will get a container filled with Vc::int_v objects.
 * If, instead, you want a container of Vc::float_v objects, be sure to include a
 * period (.) and the 'f' suffix in the literals. Alternatively, you can pass the
 * type as the second template argument to makeContainer.
*
* \return Returns a container of the requested class filled with the minimum number of SIMD
* vectors to hold the values in the initializer list.
* If the number of values in \p list does not match the number of values in the
* returned container object, the remaining values in the returned object will be
* zero-initialized.
*
* Example:
* \code
* auto data = Vc::makeContainer<std::vector<float_v>>({ 1.f, 2.f, 3.f, 4.f, 5.f });
* // data.size() == 5 if float_v::Size == 1 (i.e. Vc_IMPL=Scalar)
* // data.size() == 2 if float_v::Size == 4 (i.e. Vc_IMPL=SSE)
* // data.size() == 1 if float_v::Size == 8 (i.e. Vc_IMPL=AVX)
* \endcode
*/
template<typename Container, typename T>
constexpr auto makeContainer(std::initializer_list<T> list) -> decltype(make_container_helper<Container, T>::help(list))
{
return make_container_helper<Container, T>::help(list);
}
template<typename Container, typename T>
constexpr auto make_container(std::initializer_list<T> list) -> decltype(makeContainer<Container, T>(list))
{
return makeContainer<Container, T>(list);
}
} // namespace Vc
#endif // VC_COMMON_MAKECONTAINER_H_

56
Vc/common/make_unique.h Normal file
View File

@ -0,0 +1,56 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MAKE_UNIQUE_H_
#define VC_COMMON_MAKE_UNIQUE_H_
#include <memory>
#include "malloc.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template<typename T> struct Deleter
{
Vc_ALWAYS_INLINE void operator()(T *ptr) {
ptr->~T();
Vc::free(ptr);
}
};
template<class T, MallocAlignment A = Vc::AlignOnVector, class... Args>
inline std::unique_ptr<T, Deleter<T>> make_unique(Args&&... args)
{
return std::unique_ptr<T, Deleter<T>>(new(Vc::malloc<T, A>(1)) T(std::forward<Args>(args)...));
}
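// Usage sketch (illustrative): make_unique allocates suitably aligned storage through
// Vc::malloc, constructs the object in place, and the Deleter destroys it and calls
// Vc::free when the unique_ptr goes out of scope:
//   auto p = Vc::Common::make_unique<Vc::float_v>(Vc::float_v::One());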
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_MAKE_UNIQUE_H_

169
Vc/common/malloc.h Normal file
View File

@ -0,0 +1,169 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MALLOC_H_
#define VC_COMMON_MALLOC_H_
#ifndef Vc_VECTOR_DECLARED_
#error "Incorrect inclusion order. This header must be included from Vc/vector.h only."
#endif
#if defined _WIN32 || defined _WIN64
#include <malloc.h>
#else
#include <cstdlib>
#endif
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <size_t X> static constexpr size_t nextMultipleOf(size_t value)
{
return (value % X) > 0 ? value + X - (value % X) : value;
}
template <std::size_t alignment> Vc_INTRINSIC void *aligned_malloc(std::size_t n)
{
#ifdef __MIC__
return _mm_malloc(nextMultipleOf<alignment>(n), alignment);
#elif defined(_WIN32)
# ifdef __GNUC__
return __mingw_aligned_malloc(nextMultipleOf<alignment>(n), alignment);
# else
return _aligned_malloc(nextMultipleOf<alignment>(n), alignment);
# endif
#else
void *ptr = nullptr;
if (0 == posix_memalign(&ptr, alignment < sizeof(void *) ? sizeof(void *) : alignment,
nextMultipleOf<alignment>(n))) {
return ptr;
}
return ptr;
#endif
}
template <Vc::MallocAlignment A> Vc_ALWAYS_INLINE void *malloc(size_t n)
{
switch (A) {
case Vc::AlignOnVector:
return aligned_malloc<Vc::VectorAlignment>(n);
case Vc::AlignOnCacheline:
// TODO: hardcoding 64 is not such a great idea
return aligned_malloc<64>(n);
case Vc::AlignOnPage:
// TODO: hardcoding 4096 is not such a great idea
return aligned_malloc<4096>(n);
}
return nullptr;
}
Vc_ALWAYS_INLINE void free(void *p)
{
#ifdef __MIC__
_mm_free(p);
#elif defined(_WIN32)
# ifdef __GNUC__
return __mingw_aligned_free(p);
# else
return _aligned_free(p);
# endif
#else
std::free(p);
#endif
}
} // namespace Common
/**
* Allocates memory on the Heap with alignment and padding suitable for vectorized access.
*
* Memory that was allocated with this function must be released with Vc::free! Other methods might
* work but are not portable.
*
* \param n Specifies the number of objects the allocated memory must be able to store.
* \tparam T The type of the allocated memory. Note, that the constructor is not called.
* \tparam A Determines the alignment of the memory. See \ref Vc::MallocAlignment.
*
* \return Pointer to memory of the requested type, or 0 on error. The allocated memory is padded at
* the end to be a multiple of the requested alignment \p A. Thus if you request memory for 21
* int objects, aligned via Vc::AlignOnCacheline, you can safely read a full cacheline until the
* end of the array, without generating an out-of-bounds access. For a cacheline size of 64 Bytes
* and an int size of 4 Bytes you would thus get an array of 128 Bytes to work with.
*
* \warning
 * \li The standard malloc function specifies the number of Bytes to allocate whereas this
 * function specifies the number of values, thus differing by a factor of sizeof(T).
 * \li This function is mainly meant for use with builtin types. If you use a custom
 * type with a sizeof that is not a multiple of 2, the results might not be what you expect.
* \li The constructor of T is not called. You can make up for this:
* \code
* SomeType *array = new(Vc::malloc<SomeType, Vc::AlignOnCacheline>(N)) SomeType[N];
* \endcode
*
* \see Vc::free
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
template<typename T, Vc::MallocAlignment A>
Vc_ALWAYS_INLINE T *malloc(size_t n)
{
return static_cast<T *>(Common::malloc<A>(n * sizeof(T)));
}
/**
* Frees memory that was allocated with Vc::malloc.
*
* \param p The pointer to the memory to be freed.
*
* \tparam T The type of the allocated memory.
*
* \warning The destructor of T is not called. If needed, you can call the destructor before calling
* free:
* \code
* for (int i = 0; i < N; ++i) {
* p[i].~T();
* }
* Vc::free(p);
* \endcode
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*
* \see Vc::malloc
*/
template<typename T>
Vc_ALWAYS_INLINE void free(T *p)
{
Common::free(p);
}
} // namespace Vc
#endif // VC_COMMON_MALLOC_H_

435
Vc/common/mask.h Normal file
View File

@ -0,0 +1,435 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MASK_H_
#define VC_COMMON_MASK_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \class Mask mask.h <Vc/vector.h>
* \ingroup Masks
*
* The main SIMD mask class.
*/
template <typename T, typename Abi = VectorAbi::Best<T>> class Mask
{
public:
/**
* Returns the number of boolean components (\VSize{T}) in a mask of this type.
*
 * The size of the mask, i.e. the number of boolean entries in the mask. Do not
 * make any assumptions about the size of masks.
*
* In addition, you can easily use if clauses that compare sizes. The compiler can
* statically evaluate and fully optimize dead code away (very much like \#ifdef, but
* with syntax checking).
*
* \returns The number of components (i.e. \VSize{T}) objects of this mask type store
* and manipulate.
*/
static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
///\copydoc size
///\deprecated Use Vc::Mask::size instead.
static constexpr size_t Size = VectorTraits<T, Abi>::size();
/**
* Specifies the alignment requirement for aligned load and store calls for objects of
* this mask type.
*/
static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::maskMemoryAlignment();
/// The ABI tag type of the current template instantiation.
using abi = Abi;
/**
* The \c EntryType of masks is always \c bool, independent of \c T.
*/
using EntryType = bool;
/// \copydoc EntryType
using value_type = EntryType;
/// The reference wrapper type used for accessing individual mask components.
using EntryReference = typename VectorTraits<T, Abi>::EntryReference;
/// \copydoc EntryReference
using value_reference = EntryReference;
/**
* The \c VectorEntryType, in contrast to \c EntryType, reveals information about the SIMD
* implementation.
* This type is useful for the \c sizeof operator in generic functions.
*/
using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
/**\internal
* The \c VectorType reveals the implementation-specific internal type used for the SIMD type.
*/
using VectorType = typename VectorTraits<T, Abi>::VectorType;
/**\internal
* \copydoc VectorType
*/
using vector_type = VectorType;
/*
* The associated Vector<T> type.
*/
//using Vector = Vector<T, Abi>;
/// \name Generators
///@{
/**
* Creates a new mask object initialized to zero/\c false.
*
* \returns A mask object with zero-initialized components.
*/
Vc_INTRINSIC static Mask Zero();
/**
* Creates a mask object initialized to one/\c true.
*
* \returns A mask object with components initialized to \c true.
*/
Vc_INTRINSIC static Mask One();
/// Generate a mask object from booleans returned from the function \p gen.
template <typename G> static Vc_INTRINSIC Mask generate(G &&gen);
///@}
/// \name Compile-Time Constant Initialization
///@{
/**
* Construct a zero-initialized vector object.
*
* This constructor follows the behavior of the underlying \c bool type in that the
* expression `bool()` zero-initializes the object (to \c false). On the other hand
* the variable \c x in `bool x;` is uninitialized.
 * Since, for class types, both expressions call the default constructor, `Mask<T> x`
 * must zero-initialize \c x as well.
*/
Vc_INTRINSIC Mask() = default;
/// Zero-initialize the new mask object (\c false).
/// \see Vc::Zero, Zero()
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerZero);
/// Initialize the new mask object to one (\c true).
/// \see Vc::One, One()
Vc_INTRINSIC explicit Mask(VectorSpecialInitializerOne);
///@}
/// \name Conversion/Broadcast Constructors
///@{
/**
* Broadcast constructor.
*
* Set all components of the new mask object to \p b.
*
* \param b Determines the initial state of the mask.
*/
Vc_INTRINSIC explicit Mask(bool b);
/**
* Implicit conversion from a compatible (equal \VSize{T} on every platform) mask
* object.
*
* \param otherMask The mask to be converted.
*/
template <typename U>
Vc_INTRINSIC Mask(U &&otherMask,
Common::enable_if_mask_converts_implicitly<Mask, T, U> = nullarg);
#if Vc_IS_VERSION_1
/**
* Explicit conversion (static_cast) from a mask object that potentially has a
* different \VSize{T}.
*
* \param otherMask The mask to be converted.
*
* \internal This is implemented via simd_cast in scalar/simd_cast_caller.h
*/
template <typename U>
Vc_DEPRECATED(
"use simd_cast instead of explicit type casting to convert between mask types")
Vc_INTRINSIC_L
explicit Mask(U &&otherMask, Common::enable_if_mask_converts_explicitly<T, U> =
nullarg) Vc_INTRINSIC_R;
///@}
#endif
/**
* \name Loads & Stores
*/
///@{
/**
* Load constructor from an array of \c bool.
*
* This constructor implements an explicit conversion from an array of booleans to a
* mask object. It corresponds to a Vector load constructor.
*
* \param mem A pointer to the start of the array of booleans.
* \see Mask(const bool *, Flags), load(const bool *)
*/
Vc_ALWAYS_INLINE explicit Mask(const bool *mem);
/**
* Overload of the above with a load/store flag argument.
*
* \param mem A pointer to the start of the array of booleans.
* \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming,
* Vc::Unaligned, Vc::PrefetchDefault, ...
* \see load(const bool *, Flags)
*/
template <typename Flags> Vc_ALWAYS_INLINE explicit Mask(const bool *mem, Flags flags);
/**
* Load the components of the mask from an array of \c bool.
*
* \param mem A pointer to the start of the array of booleans.
* \see load(const bool *, Flags), Mask(const bool *)
*/
Vc_ALWAYS_INLINE void load(const bool *mem);
/**
* Overload of the above with a load/store flag argument.
*
* \param mem A pointer to the start of the array of booleans.
* \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming,
* Vc::Unaligned, Vc::PrefetchDefault, ...
* \see Mask(const bool *, Flags)
*/
template <typename Flags> Vc_ALWAYS_INLINE void load(const bool *mem, Flags flags);
/**
* Store the values of the mask to an array of \c bool.
*
* \param mem A pointer to the start of the array of booleans.
* \see store(bool *, Flags)
*/
Vc_ALWAYS_INLINE void store(bool *mem) const;
/**
* Overload of the above with a load/store flag argument.
*
* \param mem A pointer to the start of the array of booleans.
* \param flags Choose a combination of flags such as Vc::Aligned, Vc::Streaming,
* Vc::Unaligned, Vc::PrefetchDefault, ...
*/
template <typename Flags> Vc_ALWAYS_INLINE void store(bool *mem, Flags flags) const;
///@}
/// \name Comparison Operators
///@{
/**
* Returns whether the two masks are equal in all components.
*
* \param mask The other mask to compare against.
* \returns A scalar boolean value that says whether all components of the two masks
* are equal.
*
 * \note If you expected a behavior similar to the compare operator of Vc::Vector,
 * consider that the bitwise operators already implement such functionality. There is
 * little use, typically, in having `a == b` return the same as `a ^ b`. In general,
 * it is more useful to query `none_of(a ^ b)`, which is the same as this equality
 * operator.
*/
Vc_ALWAYS_INLINE bool operator==(const Mask &mask) const;
/**
* Returns whether the two masks are different in at least one component.
*
* \param mask The other mask to compare against.
* \returns A scalar boolean value that says whether at least one component of the two masks is different.
*
* \note `(a == b) == !(a != b)` holds
* \see Mask::operator==(const Mask &)
*/
Vc_ALWAYS_INLINE bool operator!=(const Mask &mask) const;
///@}
/**
* \name Logical and Binary Operators
*
* \brief Component-wise logical/binary operations on mask objects.
*
* The effect of logical and binary \c AND and \c OR is equivalent for mask types (as
* it is for \c bool).
*/
///@{
/// Returns the component-wise application of a logical \c AND to \p mask.
Vc_ALWAYS_INLINE Mask operator&&(const Mask &mask) const;
/// Returns the component-wise application of a binary \c AND to \p mask.
Vc_ALWAYS_INLINE Mask operator&(const Mask &mask) const;
/// Returns the component-wise application of a logical \c OR to \p mask.
Vc_ALWAYS_INLINE Mask operator||(const Mask &mask) const;
/// Returns the component-wise application of a binary \c OR to \p mask.
Vc_ALWAYS_INLINE Mask operator|(const Mask &mask) const;
/// Returns the component-wise application of a binary \c XOR to \p mask.
Vc_ALWAYS_INLINE Mask operator^(const Mask &mask) const;
/// Returns a mask with inverted components.
Vc_ALWAYS_INLINE Mask operator!() const;
/// Modifies the mask using an \c AND operation with \p mask.
Vc_ALWAYS_INLINE Mask &operator&=(const Mask &mask);
/// Modifies the mask using an \c OR operation with \p mask.
Vc_ALWAYS_INLINE Mask &operator|=(const Mask &mask);
/// Modifies the mask using an \c XOR operation with \p mask.
Vc_ALWAYS_INLINE Mask &operator^=(const Mask &mask);
///@}
/**
* \name Reductions
*
* \see any_of, all_of, none_of, some_of
*/
///@{
/// Returns a logical \c AND of all components.
Vc_ALWAYS_INLINE bool isFull() const;
/// Returns a logical \c OR of all components.
Vc_ALWAYS_INLINE bool isNotEmpty() const;
/// Returns \c true if all components are \c false, \c false otherwise.
Vc_ALWAYS_INLINE bool isEmpty() const;
/// Returns `!isFull() && !isEmpty()`.
Vc_ALWAYS_INLINE bool isMix() const;
///@}
/**\internal
* \name Internal Data Access
*/
///@{
Vc_ALWAYS_INLINE bool data() const;
Vc_ALWAYS_INLINE bool dataI() const;
Vc_ALWAYS_INLINE bool dataD() const;
///@}
/// \name Scalar Subscript Operators
///@{
/**
* Lvalue-reference-like access to mask entries.
*
* \param index Determines the boolean to be accessed.
* \return a temporary proxy object referencing the \p index th entry of the mask.
*
* \warning This operator does not return an lvalue reference (to \c bool), but rather
* a temporary (rvalue) object that mimics an lvalue reference (as much as is possible
* with C++11/14).
*/
Vc_ALWAYS_INLINE EntryReference operator[](size_t index);
/**
* Read-only access to mask entries.
*
* \param index Determines the boolean to be accessed.
* \return The \p index th entry of the mask as a \c bool (rvalue).
*
* \warning This operator does not return an lvalue reference (to `const bool`), but
* rather a temporary (rvalue) \c bool.
*/
Vc_ALWAYS_INLINE EntryType operator[](size_t index) const;
///@}
/// Returns how many components of the mask are \c true.
Vc_ALWAYS_INLINE int count() const;
/**
* Returns the index of the first one in the mask.
*
* \returns the index of the first component that is \c true.
*
* \warning The return value is undefined if the mask is empty.
*
* Thus, unless `none_of(mask)`, `mask[mask.firstOne()] == true` holds and `mask[i] ==
* false` for all `i < mask.firstOne()`.
*/
Vc_ALWAYS_INLINE int firstOne() const;
/**
* Convert the boolean components of the mask into bits of an integer.
*
* \return An \c int where each bit corresponds to the boolean value in the mask.
*
* For example, the mask `[true, false, false, true]` results in a `9` (in binary: `1001`).
*/
Vc_ALWAYS_INLINE int toInt() const;
/// Returns a mask with components shifted by \p amount places.
Vc_INTRINSIC Vc_PURE Mask shifted(int amount) const;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Mask));
private:
VectorType d;
};
/**
* \ingroup Utilities
*
* \name Boolean Reductions
*/
//@{
/** \ingroup Utilities
* Returns whether all entries in the mask \p m are \c true.
*/
template<typename Mask> constexpr bool all_of(const Mask &m) { return m.isFull(); }
/** \ingroup Utilities
* Returns \p b
*/
constexpr bool all_of(bool b) { return b; }
/** \ingroup Utilities
* Returns whether at least one entry in the mask \p m is \c true.
*/
template<typename Mask> constexpr bool any_of(const Mask &m) { return m.isNotEmpty(); }
/** \ingroup Utilities
* Returns \p b
*/
constexpr bool any_of(bool b) { return b; }
/** \ingroup Utilities
* Returns whether all entries in the mask \p m are \c false.
*/
template<typename Mask> constexpr bool none_of(const Mask &m) { return m.isEmpty(); }
/** \ingroup Utilities
* Returns \p !b
*/
constexpr bool none_of(bool b) { return !b; }
/** \ingroup Utilities
* Returns whether at least one entry in \p m is \c true and at least one entry in \p m is \c
* false.
*/
template<typename Mask> constexpr bool some_of(const Mask &m) { return m.isMix(); }
/** \ingroup Utilities
* Returns \c false
*/
constexpr bool some_of(bool) { return false; }
//@}
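// Usage sketch (illustrative): the reductions accept any mask type as well as plain bool,
// which keeps generic code independent of the SIMD width, e.g.
//   if (Vc::any_of(x < 0.f)) { /* at least one component of x is negative */ }
//   if (Vc::all_of(m1 && m2)) { /* every component satisfies both conditions */ }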
} // namespace Vc
#endif // VC_COMMON_MASK_H_
// vim: foldmethod=marker

98
Vc/common/maskbool.h Normal file
View File

@ -0,0 +1,98 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MASKENTRY_H_
#define VC_COMMON_MASKENTRY_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
namespace
{
template<size_t Bytes> struct MaskBoolStorage;
// the following four typedefs must use std::intN_t and NOT Vc::intN_t; the latter
// segfaults ICC 15.0.3.
template<> struct MaskBoolStorage<1> { typedef std::int8_t type; };
template<> struct MaskBoolStorage<2> { typedef std::int16_t type; };
template<> struct MaskBoolStorage<4> { typedef std::int32_t type; };
template<> struct MaskBoolStorage<8> { typedef std::int64_t type; };
} // anonymous namespace
template<size_t Bytes> class MaskBool
{
typedef typename MaskBoolStorage<Bytes>::type storage_type Vc_MAY_ALIAS;
storage_type data;
public:
constexpr MaskBool(bool x) noexcept : data(x ? -1 : 0) {}
Vc_ALWAYS_INLINE MaskBool &operator=(bool x) noexcept { data = x ? -1 : 0; return *this; }
template <typename T, typename = enable_if<(!std::is_same<T, bool>::value &&
std::is_fundamental<T>::value)>>
Vc_ALWAYS_INLINE MaskBool &operator=(T x) noexcept
{
data = reinterpret_cast<const storage_type &>(x);
return *this;
}
Vc_ALWAYS_INLINE MaskBool(const MaskBool &) noexcept = default;
Vc_ALWAYS_INLINE MaskBool &operator=(const MaskBool &) noexcept = default;
template <typename T, typename = enable_if<(std::is_same<T, bool>::value ||
(std::is_fundamental<T>::value &&
sizeof(storage_type) == sizeof(T)))>>
constexpr operator T() const noexcept
{
return std::is_same<T, bool>::value ? T((data & 1) != 0) : aliasing_cast<T>(data);
}
} Vc_MAY_ALIAS;
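// Behavior sketch (illustrative values): MaskBool stores booleans in the all-bits-set /
// all-bits-clear encoding used by SIMD compare results, e.g.
//   Common::MaskBool<4> b(true);
//   bool x = b;          // true
//   std::int32_t i = b;  // -1, i.e. all 32 bits set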
template <typename A,
typename B,
typename std::enable_if<
std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
int>::type = 0>
constexpr bool operator==(A &&a, B &&b)
{
return static_cast<bool>(a) == static_cast<bool>(b);
}
template <typename A,
typename B,
typename std::enable_if<
std::is_convertible<A, bool>::value &&std::is_convertible<B, bool>::value,
int>::type = 0>
constexpr bool operator!=(A &&a, B &&b)
{
return static_cast<bool>(a) != static_cast<bool>(b);
}
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_MASKENTRY_H_

142
Vc/common/math.h Normal file
View File

@ -0,0 +1,142 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MATH_H_
#define VC_COMMON_MATH_H_
#define Vc_COMMON_MATH_H_INTERNAL 1
#include "trigonometric.h"
#include "const.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
// TODO, not vectorized:
template <class T, class Abi>
SimdArray<int, Vector<T, Abi>::size()> fpclassify(const Vector<T, Abi> &x)
{
return SimdArray<int, Vector<T, Abi>::size()>(
[&](std::size_t i) { return std::fpclassify(x[i]); });
}
template <class T, size_t N> SimdArray<int, N> fpclassify(const SimdArray<T, N> &x)
{
return SimdArray<int, N>([&](std::size_t i) { return std::fpclassify(x[i]); });
}
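// Usage sketch for the scalar fallback above (FP_NORMAL comes from <cmath>):
//   Vc::float_v v(1.f);
//   auto classes = Vc::fpclassify(v);  // every entry compares equal to FP_NORMAL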
#ifdef Vc_IMPL_SSE
// for SSE, AVX, and AVX2
#include "logarithm.h"
#include "exponential.h"
#ifdef Vc_IMPL_AVX
inline AVX::double_v exp(AVX::double_v _x)
{
AVX::Vector<double> x = _x;
typedef AVX::Vector<double> V;
typedef V::Mask M;
typedef AVX::Const<double> C;
const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); // max log
const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); // min log
V px = floor(C::log2_e() * x + 0.5);
__m128i tmp = _mm256_cvttpd_epi32(px.data());
const SimdArray<int, V::Size> n = SSE::int_v{tmp};
x -= px * C::ln2_large(); //Vc::Detail::doubleConstant<1, 0x00062e4000000000ull, -1>(); // ln2
x -= px * C::ln2_small(); //Vc::Detail::doubleConstant<1, 0x0007f7d1cf79abcaull, -20>(); // ln2
const double P[] = {
Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
};
const double Q[] = {
Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
};
const V x2 = x * x;
px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
x = V::One() + 2.0 * x;
x = ldexp(x, n); // == x * 2ⁿ
x(overflow) = std::numeric_limits<double>::infinity();
x.setZero(underflow);
return x;
}
#endif // Vc_IMPL_AVX
inline SSE::double_v exp(SSE::double_v::AsArg _x) {
SSE::Vector<double> x = _x;
typedef SSE::Vector<double> V;
typedef V::Mask M;
typedef SSE::Const<double> C;
const M overflow = x > Vc::Detail::doubleConstant< 1, 0x0006232bdd7abcd2ull, 9>(); // max log
const M underflow = x < Vc::Detail::doubleConstant<-1, 0x0006232bdd7abcd2ull, 9>(); // min log
V px = floor(C::log2_e() * x + 0.5);
SimdArray<int, V::Size> n;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&n), _mm_cvttpd_epi32(px.data()));
x -= px * C::ln2_large(); //Vc::Detail::doubleConstant<1, 0x00062e4000000000ull, -1>(); // ln2
x -= px * C::ln2_small(); //Vc::Detail::doubleConstant<1, 0x0007f7d1cf79abcaull, -20>(); // ln2
const double P[] = {
Vc::Detail::doubleConstant<1, 0x000089cdd5e44be8ull, -13>(),
Vc::Detail::doubleConstant<1, 0x000f06d10cca2c7eull, -6>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 0>()
};
const double Q[] = {
Vc::Detail::doubleConstant<1, 0x00092eb6bc365fa0ull, -19>(),
Vc::Detail::doubleConstant<1, 0x0004ae39b508b6c0ull, -9>(),
Vc::Detail::doubleConstant<1, 0x000d17099887e074ull, -3>(),
Vc::Detail::doubleConstant<1, 0x0000000000000000ull, 1>()
};
const V x2 = x * x;
px = x * ((P[0] * x2 + P[1]) * x2 + P[2]);
x = px / ((((Q[0] * x2 + Q[1]) * x2 + Q[2]) * x2 + Q[3]) - px);
x = V::One() + 2.0 * x;
x = ldexp(x, n); // == x * 2ⁿ
x(overflow) = std::numeric_limits<double>::infinity();
x.setZero(underflow);
return x;
}
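// Both exp() overloads above follow the classic Cephes-style scheme: split x = n*ln2 + r
// with |r| <= ln2/2, approximate e^r by the rational expression 1 + 2*p/(q - p) where
// p = r*P(r*r) and q = Q(r*r), then reconstruct e^x = ldexp(e^r, n).
// Rough worked example (values approximate): x = 1.0 gives n = 1, r = 0.307,
// e^r = 1.359, and ldexp(1.359, 1) = 2.718, i.e. e.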
#endif
} // namespace Vc
#undef Vc_COMMON_MATH_H_INTERNAL
#endif // VC_COMMON_MATH_H_

591
Vc/common/memory.h Normal file
View File

@ -0,0 +1,591 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MEMORY_H_
#define VC_COMMON_MEMORY_H_
#include "memorybase.h"
#include <assert.h>
#include <algorithm>
#include <cstring>
#include <cstddef>
#include <initializer_list>
#include "memoryfwd.h"
#include "malloc.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template<typename V, size_t Size> struct _MemorySizeCalculation
{
enum AlignmentCalculations {
Alignment = V::Size,
AlignmentMask = Alignment - 1,
MaskedSize = Size & AlignmentMask,
Padding = Alignment - MaskedSize,
PaddedSize = MaskedSize == 0 ? Size : Size + Padding
};
};
/**
* \ingroup Containers
* \headerfile memory.h <Vc/Memory>
*
* A helper class for fixed-size two-dimensional arrays.
*
* \param V The vector type you want to operate on. (e.g. float_v or uint_v)
* \param Size1 Number of rows
* \param Size2 Number of columns
*/
template <typename V, size_t Size1, size_t Size2, bool InitPadding>
class Memory : public MemoryBase<V, Memory<V, Size1, Size2, InitPadding>, 2,
Memory<V, Size2, 0, InitPadding>>
{
public:
typedef typename V::EntryType EntryType;
private:
using RowMemory = Memory<V, Size2, 0, InitPadding>;
typedef MemoryBase<V, Memory<V, Size1, Size2, InitPadding>, 2, RowMemory> Base;
friend class MemoryBase<V, Memory<V, Size1, Size2, InitPadding>, 2, RowMemory>;
friend class MemoryDimensionBase<V, Memory<V, Size1, Size2, InitPadding>, 2,
RowMemory>;
enum : size_t {
Alignment = V::MemoryAlignment,
PaddedSize2 = _MemorySizeCalculation<V, Size2>::PaddedSize
};
alignas(static_cast<size_t>(Alignment)) // GCC complains about 'is not an
// integer constant' unless the
// static_cast is present
RowMemory m_mem[Size1];
public:
using Base::vector;
enum Constants {
RowCount = Size1,
VectorsCount = PaddedSize2 / V::Size
};
Memory() = default;
/**
* \return the number of rows in the array.
*
* \note This function can be eliminated by an optimizing compiler.
*/
static constexpr size_t rowsCount() { return RowCount; }
/**
* \return the number of scalar entries in the whole array.
*
* \warning Do not use this function for scalar iteration over the array since there will be
* padding between rows if \c Size2 is not divisible by \c V::Size.
*
* \note This function can be optimized into a compile-time constant.
*/
static constexpr size_t entriesCount() { return Size1 * Size2; }
/**
* \return the number of vectors in the whole array.
*
* \note This function can be optimized into a compile-time constant.
*/
static constexpr size_t vectorsCount() { return VectorsCount * Size1; }
/**
* Copies the data from a different object.
*
* \param rhs The object to copy the data from.
*
* \return reference to the modified Memory object.
*
* \note Both objects must have the exact same vectorsCount().
*/
template<typename Parent, typename RM>
Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase<V, Parent, 2, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
return *this;
}
Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) {
Detail::copyVectors(*this, rhs);
return *this;
}
/**
* Initialize all data with the given vector.
*
* \param v This vector will be used to initialize the memory.
*
* \return reference to the modified Memory object.
*/
inline Memory &operator=(const V &v) {
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) = v;
}
return *this;
}
};
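/* Usage sketch for the two-dimensional variant above (sizes are illustrative):
*   Vc::Memory<float_v, 3, 11> m;   // 3 rows of 11 entries, each row padded separately
*   for (size_t r = 0; r < m.rowsCount(); ++r)
*       for (size_t v = 0; v < m[r].vectorsCount(); ++v)
*           m[r].vector(v) = float_v::Zero();
*/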
/**
* A helper class to simplify usage of correctly aligned and padded memory, allowing both vector and
* scalar access.
*
* Example:
* \code
Vc::Memory<int_v, 11> array;
// scalar access:
for (size_t i = 0; i < array.entriesCount(); ++i) {
int x = array[i]; // read
array[i] = x; // write
}
// more explicit alternative:
for (size_t i = 0; i < array.entriesCount(); ++i) {
int x = array.scalar(i); // read
array.scalar(i) = x; // write
}
// vector access:
for (size_t i = 0; i < array.vectorsCount(); ++i) {
int_v x = array.vector(i); // read
array.vector(i) = x; // write
}
* \endcode
* This code allocates a small array and implements three equivalent loops (that do nothing useful).
* The loops show how scalar and vector read/write access is best implemented.
*
* Since the size of 11 is not a multiple of int_v::Size (unless you use the
* scalar Vc implementation) the last write access of the vector loop would normally be out of
* bounds. But the Memory class automatically pads the memory such that the whole array can be
* accessed with correctly aligned memory addresses.
*
* \param V The vector type you want to operate on. (e.g. float_v or uint_v)
* \param Size The number of entries of the scalar base type the memory should hold. This
* is thus the same number as you would use for a normal C array (e.g. float mem[11] becomes
* Memory<float_v, 11> mem).
*
* \see Memory<V, 0u>
*
* \ingroup Containers
* \headerfile memory.h <Vc/Memory>
*/
template <typename V, size_t Size, bool InitPadding>
class Memory<V, Size, 0u, InitPadding> :
public MemoryBase<V, Memory<V, Size, 0u, InitPadding>, 1, void>
{
public:
typedef typename V::EntryType EntryType;
private:
typedef MemoryBase<V, Memory<V, Size, 0u, InitPadding>, 1, void> Base;
friend class MemoryBase<V, Memory<V, Size, 0u, InitPadding>, 1, void>;
friend class MemoryDimensionBase<V, Memory<V, Size, 0u, InitPadding>, 1, void>;
enum : size_t {
Alignment = V::MemoryAlignment, // in Bytes
MaskedSize = Size & (V::Size - 1), // the fraction of Size that exceeds
// an integral multiple of V::Size
Padding = V::Size - MaskedSize,
PaddedSize = MaskedSize == 0 ? Size : Size + Padding
};
alignas(static_cast<size_t>(Alignment)) // GCC complains about 'is not an
// integer constant' unless the
// static_cast is present
EntryType m_mem[PaddedSize];
public:
using Base::vector;
enum Constants {
EntriesCount = Size,
VectorsCount = PaddedSize / V::Size
};
Memory()
{
if (InitPadding) {
Base::lastVector() = V::Zero();
}
}
Memory(std::initializer_list<EntryType> init)
{
Vc_ASSERT(init.size() <= Size);
Base::lastVector() = V::Zero();
std::copy(init.begin(), init.end(), &m_mem[0]);
}
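// Construction sketch for the initializer_list constructor above (the size is illustrative):
//   Vc::Memory<float_v, 8> m{1.f, 2.f, 3.f};  // copies three entries after zeroing the last vector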
/**
* Wrap existing data with the Memory convenience class.
*
* This function returns a \em reference to a Memory<V, Size, 0> object that you must
* capture to avoid a copy of the whole data:
* \code
* Memory<float_v, 16> &m = Memory<float_v, 16>::fromRawData(someAlignedPointerToFloat)
* \endcode
*
* \param ptr An aligned pointer to memory of type \p V::EntryType (e.g. \c float for
* Vc::float_v).
* \return A Memory object placed at the given location in memory.
*
* \warning The pointer \p ptr passed to this function must be aligned according to the
* alignment restrictions of \p V.
* \warning The size of the accessible memory must match \p Size. This includes the
* required padding at the end to allow the last entries to be accessed via vectors. If
* you know what you are doing you might violate this constraint.
* \warning It is your responsibility to ensure that the memory is released correctly
* (not too early/not leaked). This function simply adds convenience functions to \em
* access the memory.
*/
static Vc_ALWAYS_INLINE Vc_CONST Memory<V, Size, 0u, false> &fromRawData(EntryType *ptr)
{
// DANGER! This placement new has to use the right address. If the compiler decides
// RowMemory requires padding before the actual data then the address has to be adjusted
// accordingly
char *addr = reinterpret_cast<char *>(ptr);
typedef Memory<V, Size, 0u, false> MM;
addr -= offsetof(MM, m_mem);
return *new(addr) MM;
}
/**
* \return the number of scalar entries in the whole array.
*
* \note This function can be optimized into a compile-time constant.
*/
static constexpr size_t entriesCount() { return EntriesCount; }
/**
* \return the number of vectors in the whole array.
*
* \note This function can be optimized into a compile-time constant.
*/
static constexpr size_t vectorsCount() { return VectorsCount; }
inline Memory(const Memory &rhs)
{
Detail::copyVectors(*this, rhs);
}
template <size_t S> inline Memory(const Memory<V, S> &rhs)
{
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
}
inline Memory &operator=(const Memory &rhs)
{
Detail::copyVectors(*this, rhs);
return *this;
}
template <size_t S> inline Memory &operator=(const Memory<V, S> &rhs)
{
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
return *this;
}
Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) {
std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType));
return *this;
}
inline Memory &operator=(const V &v) {
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) = v;
}
return *this;
}
};
/**
* A helper class that is very similar to Memory<V, Size> but with dynamically allocated memory and
* thus dynamic size.
*
* Example:
* \code
size_t size = 11;
Vc::Memory<int_v> array(size);
// scalar access:
for (size_t i = 0; i < array.entriesCount(); ++i) {
array[i] = i;
}
// vector access:
for (size_t i = 0; i < array.vectorsCount(); ++i) {
array.vector(i) = int_v::IndexesFromZero() + i * int_v::Size;
}
* \endcode
* This code allocates a small array with 11 scalar entries
* and implements two equivalent loops that initialize the memory.
* The scalar loop writes each individual int. The vectorized loop writes int_v::Size values to
* memory per iteration. Since the size of 11 is not a multiple of int_v::Size (unless you use the
* scalar Vc implementation) the last write access of the vector loop would normally be out of
* bounds. But the Memory class automatically pads the memory such that the whole array can be
* accessed with correctly aligned memory addresses.
* (Note: the scalar loop can be auto-vectorized, except for the last three assignments.)
*
* \note The internal data pointer is not declared with the \c __restrict__ keyword. Therefore
* modifying memory of V::EntryType will require the compiler to assume aliasing. If you want to use
* the \c __restrict__ keyword you need to use a standard pointer to memory and do the vector
* address calculation and loads and stores manually.
*
* \param V The vector type you want to operate on. (e.g. float_v or uint_v)
*
* \see Memory<V, Size>
*
* \ingroup Containers
* \headerfile memory.h <Vc/Memory>
*/
template<typename V> class Memory<V, 0u, 0u, true> : public MemoryBase<V, Memory<V, 0u, 0u, true>, 1, void>
{
public:
typedef typename V::EntryType EntryType;
private:
typedef MemoryBase<V, Memory<V>, 1, void> Base;
friend class MemoryBase<V, Memory<V>, 1, void>;
friend class MemoryDimensionBase<V, Memory<V>, 1, void>;
enum InternalConstants {
Alignment = V::Size,
AlignmentMask = Alignment - 1
};
size_t m_entriesCount;
size_t m_vectorsCount;
EntryType *m_mem;
size_t calcPaddedEntriesCount(size_t x)
{
size_t masked = x & AlignmentMask;
return (masked == 0 ? x : x + (Alignment - masked));
}
public:
using Base::vector;
/**
* Allocate enough memory to access \p size values of type \p V::EntryType.
*
* The allocated memory is aligned and padded correctly for fully vectorized access.
*
* \param size Determines how many scalar values will fit into the allocated memory.
*/
Vc_ALWAYS_INLINE Memory(size_t size)
: m_entriesCount(size),
m_vectorsCount(calcPaddedEntriesCount(m_entriesCount)),
m_mem(Vc::malloc<EntryType, Vc::AlignOnVector>(m_vectorsCount))
{
m_vectorsCount /= V::Size;
Base::lastVector() = V::Zero();
}
/**
* Copy the memory into a new memory area.
*
* The allocated memory is aligned and padded correctly for fully vectorized access.
*
* \param rhs The Memory object to copy from.
*/
template<typename Parent, typename RM>
Vc_ALWAYS_INLINE Memory(const MemoryBase<V, Parent, 1, RM> &rhs)
: m_entriesCount(rhs.entriesCount()),
m_vectorsCount(rhs.vectorsCount()),
m_mem(Vc::malloc<EntryType, Vc::AlignOnVector>(m_vectorsCount * V::Size))
{
Detail::copyVectors(*this, rhs);
}
/**
* Overload of the above function.
*
* (Because C++ would otherwise not use the templated copy constructor and would fall back to the implicitly generated one instead.)
*
* \param rhs The Memory object to copy from.
*/
Vc_ALWAYS_INLINE Memory(const Memory &rhs)
: m_entriesCount(rhs.entriesCount()),
m_vectorsCount(rhs.vectorsCount()),
m_mem(Vc::malloc<EntryType, Vc::AlignOnVector>(m_vectorsCount * V::Size))
{
Detail::copyVectors(*this, rhs);
}
/**
* Frees the memory which was allocated in the constructor.
*/
Vc_ALWAYS_INLINE ~Memory()
{
Vc::free(m_mem);
}
/**
* Swap the contents and size information of two Memory objects.
*
* \param rhs The other Memory object to swap.
*/
inline void swap(Memory &rhs) {
std::swap(m_mem, rhs.m_mem);
std::swap(m_entriesCount, rhs.m_entriesCount);
std::swap(m_vectorsCount, rhs.m_vectorsCount);
}
/**
* \return the number of scalar entries in the whole array.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return m_entriesCount; }
/**
* \return the number of vectors in the whole array.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return m_vectorsCount; }
/**
* Overwrite all entries with the values stored in \p rhs.
*
* \param rhs The object to copy the data from.
*
* \return reference to the modified Memory object.
*
* \note this function requires the vectorsCount() of both Memory objects to be equal.
*/
template<typename Parent, typename RM>
Vc_ALWAYS_INLINE Memory &operator=(const MemoryBase<V, Parent, 1, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
return *this;
}
Vc_ALWAYS_INLINE Memory &operator=(const Memory &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
Detail::copyVectors(*this, rhs);
return *this;
}
/**
* Overwrite all entries with the values stored in the memory at \p rhs.
*
* \param rhs The array to copy the data from.
*
* \return reference to the modified Memory object.
*
* \note this function requires that there are entriesCount() many values accessible from \p rhs.
*/
Vc_ALWAYS_INLINE Memory &operator=(const EntryType *rhs) {
std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType));
return *this;
}
};
/**
* Prefetch the cacheline containing \p addr for a single read access.
*
* This prefetch completely bypasses the cache, not evicting any other data.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchForOneRead(const void *addr)
{
Vc::Detail::prefetchForOneRead(addr, VectorAbi::Best<float>());
}
/**
* Prefetch the cacheline containing \p addr for modification.
*
* This prefetch evicts data from the cache. So use it only for data you really will use. When the
* target system supports it the cacheline will be marked as modified while prefetching, saving work
* later on.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchForModify(const void *addr)
{
Vc::Detail::prefetchForModify(addr, VectorAbi::Best<float>());
}
/**
* Prefetch the cacheline containing \p addr to L1 cache.
*
* This prefetch evicts data from the cache. So use it only for data you really will use.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchClose(const void *addr)
{
Vc::Detail::prefetchClose(addr, VectorAbi::Best<float>());
}
/**
* Prefetch the cacheline containing \p addr to L2 cache.
*
* This prefetch evicts data from the cache. So use it only for data you really will use.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchMid(const void *addr)
{
Vc::Detail::prefetchMid(addr, VectorAbi::Best<float>());
}
/**
* Prefetch the cacheline containing \p addr to L3 cache.
*
* This prefetch evicts data from the cache. So use it only for data you really will use.
*
* \param addr The cacheline containing \p addr will be prefetched.
*
* \ingroup Utilities
* \headerfile memory.h <Vc/Memory>
*/
Vc_ALWAYS_INLINE void prefetchFar(const void *addr)
{
Vc::Detail::prefetchFar(addr, VectorAbi::Best<float>());
}
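// Usage sketch for the prefetch helpers above (the prefetch distance of 16 vectors is
// illustrative and workload dependent; bounds handling is omitted):
//   for (size_t i = 0; i < mem.vectorsCount(); ++i) {
//       Vc::prefetchForOneRead(&mem[(i + 16) * float_v::Size]);
//       sum += float_v(mem.vector(i));
//   }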
} // namespace Common
using Common::Memory;
using Common::prefetchForOneRead;
using Common::prefetchForModify;
using Common::prefetchClose;
using Common::prefetchMid;
using Common::prefetchFar;
} // namespace Vc
namespace std
{
template<typename V> Vc_ALWAYS_INLINE void swap(Vc::Memory<V> &a, Vc::Memory<V> &b) { a.swap(b); }
} // namespace std
#endif // VC_COMMON_MEMORY_H_

819
Vc/common/memorybase.h Normal file
View File

@ -0,0 +1,819 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MEMORYBASE_H_
#define VC_COMMON_MEMORYBASE_H_
#include <assert.h>
#include <type_traits>
#include <iterator>
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
#define Vc_MEM_OPERATOR_EQ(op) \
template<typename T> \
Vc_ALWAYS_INLINE enable_if_mutable<T, MemoryVector &> operator op##=(const T &x) { \
const V v = value() op x; \
v.store(&m_data[0], Flags()); \
return *this; \
}
/*dox{{{*/
/**
* Helper class for the Memory::vector(size_t) class of functions.
*
* You will never need to directly make use of this class. It is an implementation detail of the
* Memory API.
*
* \headerfile memorybase.h <Vc/Memory>
*//*}}}*/
template<typename _V, typename Flags> class MemoryVector/*{{{*/
{
typedef typename std::remove_cv<_V>::type V;
template<typename T, typename R> using enable_if_mutable =
typename std::enable_if<std::is_same<T, T>::value && !std::is_const<_V>::value, R>::type;
using EntryType =
typename std::conditional<std::is_const<_V>::value, const typename V::EntryType,
typename V::EntryType>::type;
typedef typename V::Mask Mask;
EntryType m_data[V::Size];
public:
// It is important that neither initialization nor cleanup is done as MemoryVector aliases
// other memory
Vc_INTRINSIC MemoryVector() = default;
// disable copies because this type is supposed to alias the data in a Memory object,
// nothing else
MemoryVector(const MemoryVector &) = delete;
MemoryVector(MemoryVector &&) = delete;
// Do not disable MemoryVector &operator=(const MemoryVector &) = delete; because it is
// covered nicely by the operator= below.
//! \internal
Vc_ALWAYS_INLINE Vc_PURE V value() const { return V(&m_data[0], Flags()); }
/**
* Cast to \p V operator.
*
* This function allows assigning this object to any object of type \p V.
*/
Vc_ALWAYS_INLINE Vc_PURE operator V() const { return value(); }
template<typename T>
Vc_ALWAYS_INLINE enable_if_mutable<T, MemoryVector &> operator=(const T &x) {
V v;
v = x;
v.store(&m_data[0], Flags());
return *this;
}
Vc_ALL_BINARY(Vc_MEM_OPERATOR_EQ);
Vc_ALL_ARITHMETICS(Vc_MEM_OPERATOR_EQ);
Vc_ALWAYS_INLINE EntryType &operator[](size_t i) { return m_data[i]; }
Vc_ALWAYS_INLINE const EntryType &operator[](size_t i) const { return m_data[i]; }
};
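// Sketch of how MemoryVector behaves at a call site (mem is any Vc::Memory object):
//   float_v a = mem.vector(i);   // operator V(): aligned load from the wrapped entries
//   mem.vector(i) += a;          // compound assignment: load, add, aligned store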
template<typename _V, typename Flags> class MemoryVectorIterator
{
typedef typename std::remove_cv<_V>::type V;
template<typename T, typename R> using enable_if_mutable =
typename std::enable_if<std::is_same<T, T>::value && !std::is_const<_V>::value, R>::type;
using iterator_traits = std::iterator_traits<MemoryVector<_V, Flags> *>;
MemoryVector<_V, Flags> *d;
public:
typedef typename iterator_traits::difference_type difference_type;
typedef typename iterator_traits::value_type value_type;
typedef typename iterator_traits::pointer pointer;
typedef typename iterator_traits::reference reference;
typedef typename iterator_traits::iterator_category iterator_category;
constexpr MemoryVectorIterator(MemoryVector<_V, Flags> *dd) : d(dd) {}
constexpr MemoryVectorIterator(const MemoryVectorIterator &) = default;
constexpr MemoryVectorIterator(MemoryVectorIterator &&) = default;
Vc_ALWAYS_INLINE MemoryVectorIterator &operator=(const MemoryVectorIterator &) = default;
Vc_ALWAYS_INLINE void *orderBy() const { return d; }
Vc_ALWAYS_INLINE difference_type operator-(const MemoryVectorIterator &rhs) const { return d - rhs.d; }
Vc_ALWAYS_INLINE reference operator[](size_t i) const { return d[i]; }
Vc_ALWAYS_INLINE reference operator*() const { return *d; }
Vc_ALWAYS_INLINE pointer operator->() const { return d; }
Vc_ALWAYS_INLINE MemoryVectorIterator &operator++() { ++d; return *this; }
Vc_ALWAYS_INLINE MemoryVectorIterator operator++(int) { MemoryVectorIterator r(*this); ++d; return r; }
Vc_ALWAYS_INLINE MemoryVectorIterator &operator--() { --d; return *this; }
Vc_ALWAYS_INLINE MemoryVectorIterator operator--(int) { MemoryVectorIterator r(*this); --d; return r; }
Vc_ALWAYS_INLINE MemoryVectorIterator &operator+=(size_t n) { d += n; return *this; }
Vc_ALWAYS_INLINE MemoryVectorIterator &operator-=(size_t n) { d -= n; return *this; }
Vc_ALWAYS_INLINE MemoryVectorIterator operator+(size_t n) const { return MemoryVectorIterator(d + n); }
Vc_ALWAYS_INLINE MemoryVectorIterator operator-(size_t n) const { return MemoryVectorIterator(d - n); }
};
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator==(const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() == r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator!=(const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() != r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator>=(const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() >= r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator<=(const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() <= r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator> (const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() > r.orderBy();
}
template<typename V, typename FlagsL, typename FlagsR>
Vc_ALWAYS_INLINE bool operator< (const MemoryVectorIterator<V, FlagsL> &l, const MemoryVectorIterator<V, FlagsR> &r)
{
return l.orderBy() < r.orderBy();
}
/*}}}*/
#undef Vc_MEM_OPERATOR_EQ
#define Vc_VPH_OPERATOR(op) \
template <typename V1, typename Flags1, typename V2, typename Flags2> \
decltype(std::declval<V1>() op std::declval<V2>()) operator op( \
const MemoryVector<V1, Flags1> &x, const MemoryVector<V2, Flags2> &y) \
{ \
return x.value() op y.value(); \
}
Vc_ALL_ARITHMETICS(Vc_VPH_OPERATOR);
Vc_ALL_BINARY (Vc_VPH_OPERATOR);
Vc_ALL_COMPARES (Vc_VPH_OPERATOR);
#undef Vc_VPH_OPERATOR
template<typename V, typename Parent, typename Flags = Prefetch<>> class MemoryRange/*{{{*/
{
Parent *m_parent;
size_t m_first;
size_t m_last;
public:
MemoryRange(Parent *p, size_t firstIndex, size_t lastIndex)
: m_parent(p), m_first(firstIndex), m_last(lastIndex)
{}
MemoryVectorIterator<V, Flags> begin() const { return &m_parent->vector(m_first , Flags()); }
MemoryVectorIterator<V, Flags> end() const { return &m_parent->vector(m_last + 1, Flags()); }
};/*}}}*/
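// Sketch: MemoryRange allows range-based iteration over an interval of vectors
// (indexes are illustrative):
//   for (auto &&v : mem.range(0, mem.vectorsCount() - 1)) {
//       v += float_v::One();
//   }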
template<typename V, typename Parent, int Dimension, typename RowMemory> class MemoryDimensionBase;
template<typename V, typename Parent, typename RowMemory> class MemoryDimensionBase<V, Parent, 1, RowMemory> // {{{1
{
private:
Parent *p() { return static_cast<Parent *>(this); }
const Parent *p() const { return static_cast<const Parent *>(this); }
public:
/**
* The type of the scalar entries in the array.
*/
typedef typename V::EntryType EntryType;
/**
* Returns a pointer to the start of the allocated memory.
*/
Vc_ALWAYS_INLINE Vc_PURE EntryType *entries() { return &p()->m_mem[0]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries() const { return &p()->m_mem[0]; }
/**
* Returns the \p i-th scalar value in the memory.
*/
Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i) { return entries()[i]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i) const { return entries()[i]; }
#ifdef DOXYGEN
/**
* Cast operator to the scalar type. This allows using the object very much like a standard
* C array.
*/
Vc_ALWAYS_INLINE Vc_PURE operator EntryType*() { return entries(); }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE operator const EntryType*() const { return entries(); }
#else
// The above conversion operator allows implicit conversion to bool. To prohibit this
// conversion we use SFINAE to allow only conversion to EntryType* and void*.
template <typename T,
typename std::enable_if<
std::is_same<typename std::remove_const<T>::type, EntryType *>::value ||
std::is_same<typename std::remove_const<T>::type, void *>::value,
int>::type = 0>
Vc_ALWAYS_INLINE Vc_PURE operator T()
{
return entries();
}
template <typename T,
typename std::enable_if<std::is_same<T, const EntryType *>::value ||
std::is_same<T, const void *>::value,
int>::type = 0>
Vc_ALWAYS_INLINE Vc_PURE operator T() const
{
return entries();
}
#endif
/**
* Returns an iterable range covering the vectors with indexes \p firstIndex through
* \p lastIndex (inclusive).
*/
template<typename Flags>
Vc_ALWAYS_INLINE MemoryRange<V, Parent, Flags> range(size_t firstIndex, size_t lastIndex, Flags) {
return MemoryRange<V, Parent, Flags>(p(), firstIndex, lastIndex);
}
Vc_ALWAYS_INLINE MemoryRange<V, Parent> range(size_t firstIndex, size_t lastIndex) {
return MemoryRange<V, Parent>(p(), firstIndex, lastIndex);
}
template<typename Flags>
Vc_ALWAYS_INLINE MemoryRange<const V, Parent, Flags> range(size_t firstIndex, size_t lastIndex, Flags) const {
return MemoryRange<const V, Parent, Flags>(p(), firstIndex, lastIndex);
}
Vc_ALWAYS_INLINE MemoryRange<const V, Parent> range(size_t firstIndex, size_t lastIndex) const {
return MemoryRange<const V, Parent>(p(), firstIndex, lastIndex);
}
/**
* Returns the \p i-th scalar value in the memory.
*/
Vc_ALWAYS_INLINE EntryType &operator[](size_t i) { return entries()[i]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE const EntryType &operator[](size_t i) const { return entries()[i]; }
/**
* Uses a vector gather to combine the entries at the indexes in \p i into the returned
* vector object.
*
* \param i An integer vector. It determines the entries to be gathered.
* \returns A vector object. Modification of this object will not modify the values in
* memory.
*
* \warning The API of this function might change in future versions of Vc to additionally
* support scatters.
*/
template<typename IndexT> Vc_ALWAYS_INLINE Vc_PURE V operator[](Vector<IndexT> i) const
{
return V(entries(), i);
}
};
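// Gather sketch for the operator[](Vector<IndexT>) overload above (indexes are
// illustrative; mem is a one-dimensional Memory<float_v, N> with enough entries):
//   int_v idx = int_v::IndexesFromZero() * 2;  // {0, 2, 4, ...}
//   float_v gathered = mem[idx];               // vector gather; memory is not modified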
template<typename V, typename Parent, typename RowMemory> class MemoryDimensionBase<V, Parent, 2, RowMemory> // {{{1
{
private:
Parent *p() { return static_cast<Parent *>(this); }
const Parent *p() const { return static_cast<const Parent *>(this); }
public:
/**
* The type of the scalar entries in the array.
*/
typedef typename V::EntryType EntryType;
static constexpr size_t rowCount() { return Parent::RowCount; }
/**
* Returns a pointer to the start of the allocated memory.
*/
Vc_ALWAYS_INLINE Vc_PURE EntryType *entries(size_t x = 0) { return &p()->m_mem[x][0]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const EntryType *entries(size_t x = 0) const { return &p()->m_mem[x][0]; }
/**
* Returns the \p i,j-th scalar value in the memory.
*/
Vc_ALWAYS_INLINE Vc_PURE EntryType &scalar(size_t i, size_t j) { return entries(i)[j]; }
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const EntryType scalar(size_t i, size_t j) const { return entries(i)[j]; }
/**
* Returns the \p i-th row in the memory.
*/
Vc_ALWAYS_INLINE Vc_PURE RowMemory &operator[](size_t i) {
return p()->m_mem[i];
}
/// Const overload of the above function.
Vc_ALWAYS_INLINE Vc_PURE const RowMemory &operator[](size_t i) const {
return p()->m_mem[i];
}
/**
* \return the number of rows in the array.
*
* \note This function can be eliminated by an optimizing compiler.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t rowsCount() const { return p()->rowsCount(); }
};
//dox{{{1
/**
* \headerfile memorybase.h <Vc/Memory>
*
* Common interface to all Memory classes, independent of allocation on the stack or heap.
*
* \param V The vector type you want to operate on. (e.g. float_v or uint_v)
* \param Parent This type is the complete type of the class that derives from MemoryBase.
* \param Dimension The number of dimensions the implementation provides.
* \param RowMemory Class to be used to work on a single row.
*/
template<typename V, typename Parent, int Dimension, typename RowMemory> class MemoryBase : public MemoryDimensionBase<V, Parent, Dimension, RowMemory> //{{{1
{
static_assert((V::size() * sizeof(typename V::EntryType)) % V::MemoryAlignment == 0,
"Vc::Memory can only be used for data-parallel types storing a number "
"of values that's a multiple of the memory alignment.");
private:
Parent *p() { return static_cast<Parent *>(this); }
const Parent *p() const { return static_cast<const Parent *>(this); }
template <class Flags>
using vector_reference = MayAlias<MemoryVector<V, Flags>> &;
template <class Flags>
using const_vector_reference = const MayAlias<MemoryVector<const V, Flags>> &;
public:
/**
* The type of the scalar entries in the array.
*/
typedef typename V::EntryType EntryType;
/**
* \return the number of scalar entries in the array. This function is optimized away
* if a constant size array is used.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t entriesCount() const { return p()->entriesCount(); }
/**
* \return the number of vector entries that span the array. This function is optimized away
* if a constant size array is used.
*/
Vc_ALWAYS_INLINE Vc_PURE size_t vectorsCount() const { return p()->vectorsCount(); }
using MemoryDimensionBase<V, Parent, Dimension, RowMemory>::entries;
using MemoryDimensionBase<V, Parent, Dimension, RowMemory>::scalar;
/**
* Return a (vectorized) iterator to the start of this memory object.
*/
template<typename Flags = AlignedTag>
Vc_ALWAYS_INLINE MemoryVectorIterator< V, Flags> begin(Flags flags = Flags()) { return &firstVector(flags); }
//! const overload of the above
template<typename Flags = AlignedTag>
Vc_ALWAYS_INLINE MemoryVectorIterator<const V, Flags> begin(Flags flags = Flags()) const { return &firstVector(flags); }
/**
* Return a (vectorized) iterator to the end of this memory object.
*/
template<typename Flags = AlignedTag>
Vc_ALWAYS_INLINE MemoryVectorIterator< V, Flags> end(Flags flags = Flags()) { return &lastVector(flags) + 1; }
//! const overload of the above
template<typename Flags = AlignedTag>
Vc_ALWAYS_INLINE MemoryVectorIterator<const V, Flags> end(Flags flags = Flags()) const { return &lastVector(flags) + 1; }
/**
* \param i Selects the offset, where the vector should be read.
*
* \return a smart object to wrap the \p i-th vector in the memory.
*
* The return value can be used as any other vector object. I.e. you can substitute
* something like
* \code
* float_v a = ..., b = ...;
* a += b;
* \endcode
* with
* \code
* mem.vector(i) += b;
* \endcode
*
* This function ensures that only \em aligned loads and stores are used. Thus it only allows
* accessing memory at fixed strides. If access to known offsets from the aligned vectors is
* needed the vector(size_t, int) function can be used.
*/
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE
typename std::enable_if<!std::is_convertible<Flags, int>::value,
vector_reference<Flags>>::type
vector(size_t i, Flags = Flags())
{
return *aliasing_cast<MemoryVector<V, Flags>>(&entries()[i * V::Size]);
}
/** \brief Const overload of the above function
*
* \param i Selects the offset, where the vector should be read.
*
* \return a smart object to wrap the \p i-th vector in the memory.
*/
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE
typename std::enable_if<!std::is_convertible<Flags, int>::value,
const_vector_reference<Flags>>::type
vector(size_t i, Flags = Flags()) const
{
return *aliasing_cast<MemoryVector<const V, Flags>>(&entries()[i * V::Size]);
}
/**
* \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory.
*
* Example:
* \code
* Memory<float_v, N> mem;
* mem.setZero();
* for (int i = 0; i < mem.entriesCount(); i += float_v::Size) {
* mem.vectorAt(i) += b;
* }
* \endcode
*
* \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the
* values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten.
*
* \param flags You must take care to determine whether an unaligned load/store is
* required. Per default an unaligned load/store is used. If \p i is a multiple of \c V::Size
* you may want to pass Vc::Aligned here.
*/
template <typename Flags = UnalignedTag>
Vc_ALWAYS_INLINE Vc_PURE vector_reference<Flags> vectorAt(size_t i,
Flags flags = Flags())
{
return *aliasing_cast<MemoryVector<V, Flags>>(&entries()[i]);
}
/** \brief Const overload of the above function
*
* \return a smart object to wrap the vector starting from the \p i-th scalar entry in the memory.
*
* \param i Specifies the scalar entry from where the vector will be loaded/stored. I.e. the
* values scalar(i), scalar(i + 1), ..., scalar(i + V::Size - 1) will be read/overwritten.
*
* \param flags You must take care to determine whether an unaligned load/store is
* required. Per default an unaligned load/store is used. If \p i is a multiple of \c V::Size
* you may want to pass Vc::Aligned here.
*/
template <typename Flags = UnalignedTag>
Vc_ALWAYS_INLINE Vc_PURE const_vector_reference<Flags> vectorAt(
size_t i, Flags flags = Flags()) const
{
return *aliasing_cast<MemoryVector<const V, Flags>>(&entries()[i]);
}
/**
* \return a smart object to wrap the \p i-th vector + \p shift in the memory.
*
* This function ensures that only \em unaligned loads and stores are used.
* It allows accessing memory at any location aligned to the entry type.
*
* \param i Selects the memory location of the i-th vector. Thus if \p V::Size == 4 and
* \p i is set to 3 the base address for the load/store will be the 12th entry
* (same as \p &mem[12]).
* \param shift Shifts the base address determined by parameter \p i by \p shift many
* entries. Thus \p vector(3, 1) for \p V::Size == 4 will load/store the
* 13th - 16th entries (same as \p &mem[13]).
*
* \note Any shift value is allowed as long as you make sure it stays within bounds of the
* allocated memory. Shift values that are a multiple of \p V::Size will \em not result in
* aligned loads. You have to use the above vector(size_t) function for aligned loads
* instead.
*
* \note Thus a simple way to access vectors randomly is to set \p i to 0 and use \p shift as the
* parameter to select the memory address:
* \code
* // don't use:
* mem.vector(i / V::Size, i % V::Size) += 1;
* // instead use:
* mem.vector(0, i) += 1;
* \endcode
*/
template <typename ShiftT, typename Flags = decltype(Unaligned)>
Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if<
std::is_convertible<ShiftT, int>::value,
vector_reference<decltype(std::declval<Flags>() | Unaligned)>>::type
vector(size_t i, ShiftT shift, Flags = Flags())
{
return *aliasing_cast<
MemoryVector<V, decltype(std::declval<Flags>() | Unaligned)>>(
&entries()[i * V::Size + shift]);
}
/// Const overload of the above function.
template <typename ShiftT, typename Flags = decltype(Unaligned)>
Vc_ALWAYS_INLINE Vc_PURE typename std::enable_if<
std::is_convertible<ShiftT, int>::value,
const_vector_reference<decltype(std::declval<Flags>() | Unaligned)>>::type
vector(size_t i, ShiftT shift, Flags = Flags()) const
{
return *aliasing_cast<
MemoryVector<const V, decltype(std::declval<Flags>() | Unaligned)>>(
&entries()[i * V::Size + shift]);
}
/**
* \return the first vector in the allocated memory.
*
* This function is simply a shorthand for vector(0).
*/
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE vector_reference<Flags> firstVector(Flags f = Flags())
{
return vector(0, f);
}
/// Const overload of the above function.
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE const_vector_reference<Flags> firstVector(
Flags f = Flags()) const
{
return vector(0, f);
}
/**
* \return the last vector in the allocated memory.
*
* This function is simply a shorthand for vector(vectorsCount() - 1).
*/
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE vector_reference<Flags> lastVector(Flags f = Flags())
{
return vector(vectorsCount() - 1, f);
}
/// Const overload of the above function.
template <typename Flags = AlignedTag>
Vc_ALWAYS_INLINE Vc_PURE const_vector_reference<Flags> lastVector(
Flags f = Flags()) const
{
return vector(vectorsCount() - 1, f);
}
Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned char *indexes) const { return V(entries(), typename V::IndexType(indexes, Vc::Unaligned)); }
Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned short *indexes) const { return V(entries(), typename V::IndexType(indexes, Vc::Unaligned)); }
Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned int *indexes) const { return V(entries(), typename V::IndexType(indexes, Vc::Unaligned)); }
Vc_ALWAYS_INLINE Vc_PURE V gather(const unsigned long *indexes) const { return V(entries(), typename V::IndexType(indexes, Vc::Unaligned)); }
/**
* Zero the whole memory area.
*/
Vc_ALWAYS_INLINE void setZero() {
V zero(Vc::Zero);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) = zero;
}
}
/**
* Assign a value to all vectors in the array.
*/
template<typename U>
Vc_ALWAYS_INLINE Parent &operator=(U &&x) {
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) = std::forward<U>(x);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to add up two arrays.
*/
template<typename P2, typename RM>
inline Parent &operator+=(const MemoryBase<V, P2, Dimension, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) += rhs.vector(i);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to subtract two arrays.
*/
template<typename P2, typename RM>
inline Parent &operator-=(const MemoryBase<V, P2, Dimension, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) -= rhs.vector(i);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to multiply two arrays.
*/
template<typename P2, typename RM>
inline Parent &operator*=(const MemoryBase<V, P2, Dimension, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) *= rhs.vector(i);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to divide two arrays.
*/
template<typename P2, typename RM>
inline Parent &operator/=(const MemoryBase<V, P2, Dimension, RM> &rhs) {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) /= rhs.vector(i);
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to add a value to an array.
*/
inline Parent &operator+=(EntryType rhs) {
V v(rhs);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) += v;
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to subtract a value from an array.
*/
inline Parent &operator-=(EntryType rhs) {
V v(rhs);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) -= v;
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to multiply a value to an array.
*/
inline Parent &operator*=(EntryType rhs) {
V v(rhs);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) *= v;
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to divide an array with a value.
*/
inline Parent &operator/=(EntryType rhs) {
V v(rhs);
for (size_t i = 0; i < vectorsCount(); ++i) {
vector(i) /= v;
}
return static_cast<Parent &>(*this);
}
/**
* (Inefficient) shorthand to compare two arrays for equality.
*/
template<typename P2, typename RM>
inline bool operator==(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) == V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays for inequality.
*/
template<typename P2, typename RM>
inline bool operator!=(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) == V(rhs.vector(i))).isEmpty()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays (less than).
*/
template<typename P2, typename RM>
inline bool operator<(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) < V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays (less than or equal).
*/
template<typename P2, typename RM>
inline bool operator<=(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) <= V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays (greater than).
*/
template<typename P2, typename RM>
inline bool operator>(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) > V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
/**
* (Inefficient) shorthand to compare two arrays (greater than or equal).
*/
template<typename P2, typename RM>
inline bool operator>=(const MemoryBase<V, P2, Dimension, RM> &rhs) const {
assert(vectorsCount() == rhs.vectorsCount());
for (size_t i = 0; i < vectorsCount(); ++i) {
if (!(V(vector(i)) >= V(rhs.vector(i))).isFull()) {
return false;
}
}
return true;
}
};
namespace Detail
{
template <typename V,
typename ParentL,
typename ParentR,
int Dimension,
typename RowMemoryL,
typename RowMemoryR>
inline void copyVectors(MemoryBase<V, ParentL, Dimension, RowMemoryL> &dst,
const MemoryBase<V, ParentR, Dimension, RowMemoryR> &src)
{
const size_t vectorsCount = dst.vectorsCount();
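// Unrolled main loop: copy four vectors per iteration, reading all four before
// storing so the compiler can schedule independent loads and stores.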
size_t i = 3;
for (; i < vectorsCount; i += 4) {
const V tmp3 = src.vector(i - 3);
const V tmp2 = src.vector(i - 2);
const V tmp1 = src.vector(i - 1);
const V tmp0 = src.vector(i - 0);
dst.vector(i - 3) = tmp3;
dst.vector(i - 2) = tmp2;
dst.vector(i - 1) = tmp1;
dst.vector(i - 0) = tmp0;
}
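// Tail loop: i - 3 is the first vector not yet copied; handle the remaining 0-3 vectors.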
for (i -= 3; i < vectorsCount; ++i) {
dst.vector(i) = src.vector(i);
}
}
} // namespace Detail
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_MEMORYBASE_H_
// vim: foldmethod=marker

46
Vc/common/memoryfwd.h Normal file
View File

@ -0,0 +1,46 @@
/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_MEMORYFWD_H_
#define VC_COMMON_MEMORYFWD_H_
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename V, std::size_t Size1 = 0, std::size_t Size2 = 0,
bool InitPadding = true>
class Memory;
template <typename V, typename Parent, int Dimension, typename RowMemory>
class MemoryBase;
} // namespace Common
using Common::Memory;
} // namespace Vc
#endif // VC_COMMON_MEMORYFWD_H_

258
Vc/common/operators.h Normal file
View File

@ -0,0 +1,258 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2016 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef COMMON_OPERATORS_H_
#define COMMON_OPERATORS_H_
#include "simdarray.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename T, typename Abi, typename U>
enable_if<!std::is_same<T, U>::value, U> is_convertible_to_any_vector(Vector<U, Abi>);
template <typename T, typename Abi> T is_convertible_to_any_vector(Vector<T, Abi>);
template <typename T, typename U, bool = std::is_integral<T>::value,
bool = std::is_integral<U>::value>
struct FundamentalReturnType;
template <class T, class U>
using fundamental_return_t = typename FundamentalReturnType<T, U>::type;
template <typename T, typename U> struct FundamentalReturnType<T, U, false, false> {
using type = typename std::conditional<
std::is_arithmetic<U>::value,
typename std::conditional<(sizeof(T) < sizeof(U)), U, T>::type,
// U is not arithmetic, e.g. an enum or a type with e.g. operator int()
T>::type;
};
template <typename T, typename U> struct FundamentalReturnType<T, U, true, false> {
using type = typename std::conditional<
std::is_arithmetic<U>::value, U,
// U is not arithmetic, e.g. an enum or a type with e.g. operator int()
T>::type;
};
template <typename T, typename U> struct FundamentalReturnType<T, U, false, true> {
using type = T;
};
template <typename T> struct my_make_signed : public std::make_signed<T> {
};
template <> struct my_make_signed<bool> {
using type = bool;
};
template <typename TT, typename UU>
struct higher_conversion_rank {
template <typename A>
using fix_sign =
typename std::conditional<(std::is_unsigned<TT>::value ||
std::is_unsigned<UU>::value),
typename std::make_unsigned<A>::type, A>::type;
using T = typename my_make_signed<TT>::type;
using U = typename my_make_signed<UU>::type;
template <typename Test, typename Otherwise>
using c = typename std::conditional<std::is_same<T, Test>::value ||
std::is_same<U, Test>::value,
Test, Otherwise>::type;
using type = fix_sign<c<long long, c<long, c<int, c<short, c<signed char, void>>>>>>;
};
template <typename T, typename U> struct FundamentalReturnType<T, U, true, true> {
template <bool B, class Then, class E>
using c = typename std::conditional<B, Then, E>::type;
using type =
c<(sizeof(T) > sizeof(U)), T,
c<(sizeof(T) < sizeof(U)), U, typename higher_conversion_rank<T, U>::type>>;
};
template <class V, class T, class Tq, class = void> struct ReturnTypeImpl {
// no type => SFINAE
};
// 1. Vector × Vector
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, Vector<U, Abi>, Uq, void> {
using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
};
// 2. Vector × int
template <class T, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, int, Uq, void> {
// conversion from int is always allowed (because it's the default when you hardcode a
// number)
using type = Vc::Vector<T, Abi>;
};
// 3. Vector × unsigned
template <class T, class Abi, class Uq>
struct ReturnTypeImpl<Vector<T, Abi>, uint, Uq, void> {
// conversion from unsigned int is allowed for all integral Vector<T>, but ensures
// unsigned result
using type = Vc::Vector<
typename std::conditional<std::is_integral<T>::value, std::make_unsigned<T>,
std::enable_if<true, T>>::type::type,
Abi>;
};
// 4. Vector × {enum, arithmetic}
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<
Vector<T, Abi>, U, Uq,
enable_if<!std::is_class<U>::value && !std::is_same<U, int>::value &&
!std::is_same<U, uint>::value &&
Traits::is_valid_vector_argument<fundamental_return_t<T, U>>::value,
void>> {
using type = Vc::Vector<fundamental_return_t<T, U>, Abi>;
};
// 5. Vector × UDT
template <class T, class U, class Abi, class Uq>
struct ReturnTypeImpl<
Vector<T, Abi>, U, Uq,
enable_if<std::is_class<U>::value && !Traits::is_simd_vector<U>::value &&
Traits::is_valid_vector_argument<decltype(
is_convertible_to_any_vector<T, Abi>(std::declval<Uq>()))>::value,
void>> {
using type =
Vc::Vector<fundamental_return_t<T, decltype(is_convertible_to_any_vector<T, Abi>(
std::declval<Uq>()))>,
Abi>;
};
template <class V, class Tq, class T = remove_cvref_t<Tq>>
using ReturnType = typename ReturnTypeImpl<V, T, Tq>::type;
template <class T> struct is_a_type : public std::true_type {
};
#ifdef Vc_ENABLE_FLOAT_BIT_OPERATORS
#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) true
#else
#define Vc_TEST_FOR_BUILTIN_OPERATOR(op_) \
Detail::is_a_type<decltype(std::declval<typename R::value_type>() \
op_ std::declval<typename R::value_type>())>::value
#endif
} // namespace Detail
#define Vc_GENERIC_OPERATOR(op_) \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
R> \
operator op_(Vector<T, Abi> x, U &&y) \
{ \
return Detail::operator op_(R(x), R(std::forward<U>(y))); \
} \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
!Traits::is_simd_vector<U>::value && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
R> \
operator op_(U &&x, Vector<T, Abi> y) \
{ \
return Detail::operator op_(R(std::forward<U>(x)), R(y)); \
} \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<Vc_TEST_FOR_BUILTIN_OPERATOR(op_) && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
Vector<T, Abi> &> \
operator op_##=(Vector<T, Abi> &x, U &&y) \
{ \
x = Detail::operator op_(R(x), R(std::forward<U>(y))); \
return x; \
}
#define Vc_LOGICAL_OPERATOR(op_) \
template <class T, class Abi> \
Vc_ALWAYS_INLINE typename Vector<T, Abi>::Mask operator op_(Vector<T, Abi> x, \
Vector<T, Abi> y) \
{ \
return !!x op_ !!y; \
} \
template <class T, class Abi, class U> \
Vc_ALWAYS_INLINE \
enable_if<std::is_convertible<Vector<T, Abi>, Vector<U, Abi>>::value && \
std::is_convertible<Vector<U, Abi>, Vector<T, Abi>>::value, \
typename Detail::ReturnType<Vector<T, Abi>, Vector<U, Abi>>::Mask> \
operator op_(Vector<T, Abi> x, Vector<U, Abi> y) \
{ \
return !!x op_ !!y; \
} \
template <class T, class Abi, class U> \
Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
typename Vector<T, Abi>::Mask> \
operator op_(Vector<T, Abi> x, U &&y) \
{ \
using M = typename Vector<T, Abi>::Mask; \
return !!x op_ M(!!std::forward<U>(y)); \
} \
template <class T, class Abi, class U> \
Vc_ALWAYS_INLINE enable_if<std::is_same<bool, decltype(!std::declval<U>())>::value, \
typename Vector<T, Abi>::Mask> \
operator op_(U &&x, Vector<T, Abi> y) \
{ \
using M = typename Vector<T, Abi>::Mask; \
return M(!!std::forward<U>(x)) op_ !!y; \
}
#define Vc_COMPARE_OPERATOR(op_) \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE enable_if<std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
typename R::Mask> \
operator op_(Vector<T, Abi> x, U &&y) \
{ \
return Detail::operator op_(R(x), R(std::forward<U>(y))); \
} \
template <class T, class Abi, class U, \
class R = Detail::ReturnType<Vector<T, Abi>, U>> \
Vc_ALWAYS_INLINE \
enable_if<!Traits::is_simd_vector_internal<remove_cvref_t<U>>::value && \
std::is_convertible<Vector<T, Abi>, R>::value && \
std::is_convertible<U, R>::value, \
typename R::Mask> \
operator op_(U &&x, Vector<T, Abi> y) \
{ \
return Detail::operator op_(R(std::forward<U>(x)), R(y)); \
}
Vc_ALL_LOGICAL (Vc_LOGICAL_OPERATOR);
Vc_ALL_BINARY (Vc_GENERIC_OPERATOR);
Vc_ALL_ARITHMETICS(Vc_GENERIC_OPERATOR);
Vc_ALL_COMPARES (Vc_COMPARE_OPERATOR);
#undef Vc_LOGICAL_OPERATOR
#undef Vc_GENERIC_OPERATOR
#undef Vc_COMPARE_OPERATOR
#undef Vc_INVALID_OPERATOR
} // namespace Vc
#endif // COMMON_OPERATORS_H_
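The ReturnTypeImpl cases above decide which Vector type a mixed-operand expression yields. Below is a minimal result-type sketch, assuming a Vc 1.x installation; <Vc/Vc> and the float_v/int_v/uint_v aliases are the usual public names, everything else is only there to exercise the rules and is not part of the header above.
#include <Vc/Vc>
#include <type_traits>

int main()
{
    using Vc::float_v;
    using Vc::int_v;
    using Vc::uint_v;

    float_v f = 1.5f;
    int_v i = 2;

    // case 2: Vector x int keeps the Vector type (int literals are always accepted)
    static_assert(std::is_same<decltype(f * 2), float_v>::value, "Vector x int");
    static_assert(std::is_same<decltype(i + 1), int_v>::value, "Vector x int");

    // case 3: Vector x unsigned forces an unsigned result for integral Vectors only
    static_assert(std::is_same<decltype(i + 1u), uint_v>::value, "Vector x unsigned");
    static_assert(std::is_same<decltype(f + 1u), float_v>::value, "float Vector stays float");

    // the compare operators return the Mask of the deduced result type
    static_assert(std::is_same<decltype(f < 2), float_v::Mask>::value, "compare yields a Mask");

    (void)f;
    (void)i;
    return 0;
}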

44
Vc/common/permutation.h Normal file
View File

@ -0,0 +1,44 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_PERMUTATION_H_
#define VC_COMMON_PERMUTATION_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Permutation
{
struct ReversedTag {};
constexpr ReversedTag Reversed{};
} // namespace Permutation
}
#endif // VC_COMMON_PERMUTATION_H_
// vim: foldmethod=marker

270
Vc/common/scatterimplementation.h Normal file
View File

@ -0,0 +1,270 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_
#define VC_COMMON_SCATTERIMPLEMENTATION_H_
#include "gatherimplementation.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT,
V &v,
MT *mem,
IT indexes,
typename V::MaskArgument mask)
{
indexes.setZeroInverted(static_cast<typename IT::Mask>(mask));
// NOTE: this mirrors the SetIndexZero gather strategy. The inactive indexes are
// zeroed, mem is read at the resulting indexes, and the result is blended into v;
// no store to mem happens here.
const V tmp(mem, indexes);
where(mask) | v = tmp;
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask)
{
if (Vc_IS_UNLIKELY(mask.isEmpty())) {
return;
}
Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
if (mask[i])
mem[indexes[i]] = v[i];
});
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask)
{
size_t bits = mask.toInt();
while (Vc_IS_LIKELY(bits > 0)) {
size_t i, j;
asm("bsf %[bits],%[i]\n\t"
"bsr %[bits],%[j]\n\t"
"btr %[i],%[bits]\n\t"
"btr %[j],%[bits]\n\t"
: [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
mem[indexes[i]] = v[i];
mem[indexes[j]] = v[j];
}
/* Alternative from Vc::SSE (0.7)
int bits = mask.toInt();
while (bits) {
const int i = _bit_scan_forward(bits);
bits ^= (1 << i); // btr?
mem[indexes[i]] = v[i];
}
*/
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 16> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt16(bits)) {
case 16:
v.scatter(mem, indexes);
break;
case 15:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
mem[indexes[low]] = v[low];
case 14:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 13:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 12:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 11:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 10:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 9:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 8:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 7:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 6:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 4:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 2:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 8> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt8(bits)) {
case 8:
v.scatter(mem, indexes);
break;
case 7:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
mem[indexes[low]] = v[low];
case 6:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 5:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 4:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
high = (1 << high);
case 3:
low = _bit_scan_forward(bits);
bits ^= high | (1 << low);
mem[indexes[low]] = v[low];
case 2:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 4> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low, high = 0;
switch (Vc::Detail::popcnt4(bits)) {
case 4:
v.scatter(mem, indexes);
break;
case 3:
low = _bit_scan_forward(bits);
bits ^= 1 << low;
mem[indexes[low]] = v[low];
case 2:
high = _bit_scan_reverse(bits);
mem[indexes[high]] = v[high];
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
template <typename V, typename MT, typename IT>
Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
V &v,
MT *mem,
const IT &indexes,
typename V::MaskArgument mask,
enable_if<V::Size == 2> = nullarg)
{
unsigned int bits = mask.toInt();
unsigned int low;
switch (Vc::Detail::popcnt4(bits)) {
case 2:
v.scatter(mem, indexes);
break;
case 1:
low = _bit_scan_forward(bits);
mem[indexes[low]] = v[low];
case 0:
break;
}
}
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_SCATTERIMPLEMENTATION_H_
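The BitScanLoop strategy above peels the lowest and the highest active mask bit per iteration with bsf/bsr/btr inline assembly. The sketch below expresses the same idea in portable C++; lowest_bit and highest_bit are hypothetical stand-ins for the compiler's bit-scan intrinsics, and the container types are deliberately generic.
#include <cstddef>
#include <cstdint>

// hypothetical stand-ins for _bit_scan_forward / _bit_scan_reverse; bits must be non-zero
static inline std::size_t lowest_bit(std::uint32_t bits)
{
    std::size_t i = 0;
    while (!(bits & 1u)) { bits >>= 1; ++i; }
    return i;
}
static inline std::size_t highest_bit(std::uint32_t bits)
{
    std::size_t i = 31;
    while (!(bits & 0x80000000u)) { bits <<= 1; --i; }
    return i;
}

template <class Vec, class Mem, class Idx>
void scatter_bitscan(const Vec &v, Mem *mem, const Idx &indexes, std::uint32_t maskBits)
{
    while (maskBits != 0) {
        const std::size_t i = lowest_bit(maskBits);  // bsf
        const std::size_t j = highest_bit(maskBits); // bsr
        maskBits &= ~((1u << i) | (1u << j));        // two btr; when i == j only one bit clears
        mem[indexes[i]] = v[i];                      // store the lowest active lane
        mem[indexes[j]] = v[j];                      // store the highest active lane
    }
}

int main()
{
    const float v[4] = {10, 11, 12, 13};
    const int idx[4] = {3, 2, 1, 0};
    float mem[4] = {};
    scatter_bitscan(v, mem, idx, 0x0Bu); // lanes 0, 1 and 3 are active
    return (mem[3] == 10 && mem[2] == 11 && mem[0] == 13 && mem[1] == 0) ? 0 : 1;
}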

136
Vc/common/scatterinterface.h Normal file
View File

@ -0,0 +1,136 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
///////////////////////////////////////////////////////////////////////////////////////////
// scatters
// A scatter takes the following arguments:
// 1. A pointer to memory of any type that EntryType can convert to.
// 2. An indexes “vector”. The requirement is that the type implements the subscript operator,
//    stores «Size» valid index values, and each offset to the pointer above yields a valid
//    memory location for writing.
// 3. Optionally the third argument may be a mask. The mask disables several memory stores and
//    thus removes the requirements in (2.) for the disabled entries.
private:
/**\internal
* This function implements a scatter given a pointer to memory \p mem and some
* container object storing the scatter \p indexes.
*
* \param mem This pointer must be aligned correctly for the type \p MT. This is the
* natural behavior of C++, so this is typically the case.
* \param indexes This object contains at least \VSize{T} indexes that denote the
* offset in \p mem where the components for the current vector should be copied to.
* The offset is not in Bytes, but in multiples of `sizeof(MT)`.
*/
// enable_if<std::can_convert<MT, EntryType>::value && has_subscript_operator<IT>::value>
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes) const;
/**\internal
* This overload of the above function adds a \p mask argument to disable memory
* accesses at the \p indexes offsets where \p mask is \c false.
*/
template <typename MT, typename IT>
inline void scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const;
public:
#define Vc_ASSERT_SCATTER_PARAMETER_TYPES_ \
static_assert( \
std::is_convertible<EntryType, MT>::value, \
"The memory pointer needs to point to a type that the EntryType of this " \
"SIMD vector type can be converted to."); \
static_assert( \
Vc::Traits::has_subscript_operator<IT>::value, \
"The indexes argument must be a type that implements the subscript operator."); \
static_assert( \
!Traits::is_simd_vector<IT>::value || \
Traits::simd_vector_size<IT>::value >= Size, \
"If you use a SIMD vector for the indexes parameter, the index vector must " \
"have at least as many entries as this SIMD vector."); \
static_assert( \
!std::is_array<T>::value || \
(std::rank<T>::value == 1 && \
(std::extent<T>::value == 0 || std::extent<T>::value >= Size)), \
"If you use a simple array for the indexes parameter, the array must have " \
"at least as many entries as this SIMD vector.")
/**
* \name Scatter functions
*
* Stores a vector to the objects at `mem[indexes[0]]`, `mem[indexes[1]]`,
* `mem[indexes[2]]`, ...
*
* \param mem A pointer to memory which contains objects of type \p MT at the offsets
* given by \p indexes.
* \param indexes
* \param mask
*/
///@{
/// Scatter function
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes));
}
/// Masked scatter function
template <typename MT,
typename IT,
typename = enable_if<Vc::Traits::has_subscript_operator<IT>::value>>
Vc_INTRINSIC void scatter(MT *mem, IT &&indexes, MaskArgument mask) const
{
Vc_ASSERT_SCATTER_PARAMETER_TYPES_;
scatterImplementation(mem, std::forward<IT>(indexes), mask);
}
///@}
#include "scatterinterface_deprecated.h"
/**\internal
* \name Scatter function to use from Vc::Common::subscript_operator
*
* \param args
* \param mask
*/
///@{
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args) const
{
scatter(args.address, args.indexes);
}
template <typename MT, typename IT>
Vc_INTRINSIC void scatter(const Common::ScatterArguments<MT, IT> &args, MaskArgument mask) const
{
scatter(args.address, args.indexes, mask);
}
///@}
#undef Vc_ASSERT_SCATTER_PARAMETER_TYPES_
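A minimal usage sketch for the scatter interface documented above, assuming a Vc 1.x installation; float_v, its IndexType, the generator constructor, and the masked scatter overload are the public API shown in this file, while the array size and the values are arbitrary.
#include <Vc/Vc>
#include <cstddef>
#include <cstdio>

int main()
{
    using Vc::float_v;
    float mem[4 * float_v::Size] = {};

    const float_v v([](int n) { return 100.f + n; });          // 100, 101, 102, ...
    const float_v::IndexType idx([](int n) { return 2 * n; }); // 0, 2, 4, ...

    v.scatter(mem, idx);              // unmasked: mem[0], mem[2], ... receive v[0], v[1], ...
    v.scatter(mem, idx + 1, v > 100); // masked: only lanes with v > 100 store to the odd slots

    for (std::size_t i = 0; i < 4 * float_v::Size; ++i) {
        std::printf("%g ", mem[i]);
    }
    std::printf("\n");
    return 0;
}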

147
Vc/common/scatterinterface_deprecated.h Normal file
View File

@ -0,0 +1,147 @@
/// \name Deprecated Members
///@{
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, EntryType S1::*member1,
IT indexes) const
{
scatter(Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.scatterArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, EntryType S1::*member1,
IT indexes, MaskArgument mask) const
{
scatter(Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1]
.scatterArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, S2 S1::*member1,
EntryType S2::*member2,
IT indexes) const
{
scatter(Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.scatterArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param member1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param member2 If \p member1 is a struct then \p member2 selects the member to be read from that
* struct (i.e. array[i].*member1.*member2 is read).
* \param indexes Determines the offsets into \p array where the values are gathered from/scattered
* to. The type of indexes can either be an integer vector or a type that supports
* operator[] access.
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename S2, typename IT>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, S2 S1::*member1,
EntryType S2::*member2, IT indexes,
MaskArgument mask) const
{
scatter(Common::SubscriptOperation<S1, IT, std::ratio<1, 1>, true>(
array, indexes)[member1][member2]
.scatterArguments(),
mask);
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param outerIndexes
* \param innerIndexes
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1,
IT1 outerIndexes,
IT2 innerIndexes) const
{
scatter(Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.scatterArguments());
}
/**
* \deprecated Use Vc::array or Vc::vector subscripting instead.
*
* \param array A pointer into memory (without alignment restrictions).
* \param ptrMember1 If \p array points to a struct, \p member1 determines the member in the struct to
* be read. Thus the offsets in \p indexes are relative to the \p array and not to
* the size of the gathered type (i.e. array[i].*member1 is accessed instead of
* (&(array->*member1))[i])
* \param outerIndexes
* \param innerIndexes
* \param mask If a mask is given only the active entries will be gathered/scattered.
*/
template <typename S1, typename IT1, typename IT2>
Vc_DEPRECATED("use the subscript operator to Vc::array or Vc::vector "
"instead.") inline void scatter(S1 *array, EntryType *S1::*ptrMember1,
IT1 outerIndexes, IT2 innerIndexes,
MaskArgument mask) const
{
scatter(Common::SubscriptOperation<S1, IT1, std::ratio<1, 1>, true>(
array, outerIndexes)[ptrMember1][innerIndexes]
.scatterArguments(),
mask);
}
///@}
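For reference, this is how a call to the first deprecated struct-member scatter above looks. Point and store_x are made-up names; the snippet assumes a Vc 1.x installation and only illustrates the signature documented in this file.
#include <Vc/Vc>

struct Point { float x, y; };

void store_x(Point *points, Vc::float_v values, Vc::float_v::IndexType indexes)
{
    // writes values[i] to points[indexes[i]].x for every lane i
    // (deprecated: the replacement is subscripting a Vc::array / Vc::vector with the
    // index vector, as the deprecation messages above suggest)
    values.scatter(points, &Point::x, indexes);
}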

92
Vc/common/set.h Normal file
View File

@ -0,0 +1,92 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SET_H_
#define VC_COMMON_SET_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace
{
static Vc_INTRINSIC Vc_CONST __m128i set(unsigned short x0, unsigned short x1, unsigned short x2, unsigned short x3,
unsigned short x4, unsigned short x5, unsigned short x6, unsigned short x7)
{
#if defined(Vc_GNU_ASM)
#if 0 // defined(__x86_64__)
// it appears that the 32bit variant is always faster
__m128i r;
unsigned long long tmp0 = x3; tmp0 = (tmp0 << 16) | x2;
unsigned long long tmp1 = x1; tmp1 = (tmp1 << 16) | x0;
asm("vmovq %1,%0" : "=x"(r) : "r"((tmp0 << 32) | tmp1));
unsigned long long tmp2 = x7; tmp2 = (tmp2 << 16) | x6;
unsigned long long tmp3 = x5; tmp3 = (tmp3 << 16) | x4;
asm("vpinsrq $1,%1,%0,%0" : "+x"(r) : "r"((tmp2 << 32) | tmp3));
return r;
#elif defined(Vc_USE_VEX_CODING)
__m128i r0, r1;
unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
asm("vmovd %1,%0" : "=x"(r0) : "r"(tmp0));
asm("vpinsrd $1,%1,%0,%0" : "+x"(r0) : "r"(tmp1));
asm("vmovd %1,%0" : "=x"(r1) : "r"(tmp2));
asm("vpinsrd $1,%1,%0,%0" : "+x"(r1) : "r"(tmp3));
asm("vpunpcklqdq %1,%0,%0" : "+x"(r0) : "x"(r1));
return r0;
#else
__m128i r0, r1;
unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
asm("movd %1,%0" : "=x"(r0) : "r"(tmp0));
asm("pinsrd $1,%1,%0" : "+x"(r0) : "r"(tmp1));
asm("movd %1,%0" : "=x"(r1) : "r"(tmp2));
asm("pinsrd $1,%1,%0" : "+x"(r1) : "r"(tmp3));
asm("punpcklqdq %1,%0" : "+x"(r0) : "x"(r1));
return r0;
#endif
#else
unsigned int tmp0 = x1; tmp0 = (tmp0 << 16) | x0;
unsigned int tmp1 = x3; tmp1 = (tmp1 << 16) | x2;
unsigned int tmp2 = x5; tmp2 = (tmp2 << 16) | x4;
unsigned int tmp3 = x7; tmp3 = (tmp3 << 16) | x6;
return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
#endif
}
static Vc_INTRINSIC Vc_CONST __m128i set(short x0, short x1, short x2, short x3, short x4, short x5, short x6, short x7)
{
return set(static_cast<unsigned short>(x0), static_cast<unsigned short>(x1), static_cast<unsigned short>(x2),
static_cast<unsigned short>(x3), static_cast<unsigned short>(x4), static_cast<unsigned short>(x5),
static_cast<unsigned short>(x6), static_cast<unsigned short>(x7));
}
} // anonymous namespace
} // namespace Vc
#endif // VC_COMMON_SET_H_
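The fallback branch of set() above packs pairs of 16-bit values into 32-bit words and hands them to _mm_setr_epi32. The standalone check below uses only SSE2 intrinsics (no Vc headers) and confirms that this packing produces the same register contents as _mm_setr_epi16 on a little-endian x86 target.
#include <emmintrin.h>
#include <cstring>

int main()
{
    const unsigned short x[8] = {1, 2, 3, 4, 5, 6, 7, 0x7fff};

    const unsigned int tmp0 = (static_cast<unsigned int>(x[1]) << 16) | x[0];
    const unsigned int tmp1 = (static_cast<unsigned int>(x[3]) << 16) | x[2];
    const unsigned int tmp2 = (static_cast<unsigned int>(x[5]) << 16) | x[4];
    const unsigned int tmp3 = (static_cast<unsigned int>(x[7]) << 16) | x[6];
    const __m128i packed = _mm_setr_epi32(static_cast<int>(tmp0), static_cast<int>(tmp1),
                                          static_cast<int>(tmp2), static_cast<int>(tmp3));

    const __m128i reference =
        _mm_setr_epi16(static_cast<short>(x[0]), static_cast<short>(x[1]),
                       static_cast<short>(x[2]), static_cast<short>(x[3]),
                       static_cast<short>(x[4]), static_cast<short>(x[5]),
                       static_cast<short>(x[6]), static_cast<short>(x[7]));

    return std::memcmp(&packed, &reference, sizeof(__m128i)) == 0 ? 0 : 1;
}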

68
Vc/common/simd_cast.h Normal file
View File

@ -0,0 +1,68 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMD_CAST_H_
#define VC_COMMON_SIMD_CAST_H_
#include <type_traits>
#include "macros.h"
// declare a bogus simd_cast function template in the global namespace to enable ADL for
// simd_cast<T>
template <class> void simd_cast();
namespace Vc_VERSIONED_NAMESPACE
{
/**
* Casts the argument \p x from type \p From to type \p To.
*
* This function implements the trivial case where \p To and \p From are the same type.
*
* \param x The object of type \p From to be converted to type \p To.
* \returns An object of type \p To with all vector components converted according to
* standard conversion behavior as mandated by the C++ standard for the
* underlying arithmetic types.
*/
template <typename To, typename From>
Vc_INTRINSIC Vc_CONST To
simd_cast(From &&x, enable_if<std::is_same<To, Traits::decay<From>>::value> = nullarg)
{
return std::forward<From>(x);
}
/**
* A cast from nothing results in a value-initialized (zero-initialized) \p To.
*
* This function can be useful in generic code where a parameter pack expands to nothing.
*
* \returns A zero-initialized object of type \p To.
*/
template <typename To> Vc_INTRINSIC Vc_CONST To simd_cast() { return To(); }
} // namespace Vc
#endif // VC_COMMON_SIMD_CAST_H_
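A short sketch of the two overloads defined above, assuming a Vc installation: the first call hits the trivial same-type case, the second the nullary case used when a parameter pack expands to nothing. The function names are invented for illustration.
#include <Vc/Vc>

Vc::float_v passthrough(Vc::float_v x)
{
    // trivial case: To equals the decayed From, x is forwarded unchanged
    return Vc::simd_cast<Vc::float_v>(x);
}

Vc::int_v nothing_to_cast()
{
    // nullary case: handy when a parameter pack expands to zero arguments
    return Vc::simd_cast<Vc::int_v>(); // value-initialized, i.e. all zeros
}

int main()
{
    const int sum = int(passthrough(Vc::float_v(1.f))[0]) + int(nothing_to_cast()[0]);
    return sum == 1 ? 0 : 1;
}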

79
Vc/common/simd_cast_caller.tcc Normal file
View File

@ -0,0 +1,79 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMD_CAST_CALLER_TCC_
#define VC_COMMON_SIMD_CAST_CALLER_TCC_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE {
template <class T, std::size_t N, class VectorType>
template <class U, class V, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
const SimdMaskArray<U, N, V> &x)
: data(simd_cast<mask_type>(internal_data(x)))
{
}
template <class T, std::size_t N, class VectorType>
template <class U, class V, class, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
const SimdMaskArray<U, N, V> &x)
: data(simd_cast<mask_type>(internal_data(internal_data0(x)),
internal_data(internal_data1(x))))
{
}
template <class T, std::size_t N, class VectorType>
template <class U, class V, class, class, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
const SimdMaskArray<U, N, V> &x)
: data(simd_cast<mask_type>(internal_data(internal_data0(internal_data0(x))),
internal_data(internal_data1(internal_data0(x))),
internal_data(internal_data0(internal_data1(x))),
internal_data(internal_data1(internal_data1(x)))))
{
}
// conversion from any Segment object (could be SimdMaskArray or Mask<T>)
template <class T, std::size_t N, class VectorType>
template <class M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(
Common::Segment<M, Pieces, Index> &&x,
enable_if<Traits::simd_vector_size<M>::value == Size * Pieces>)
: data(simd_cast<mask_type, Index>(x.data))
{
}
// conversion from Mask<T>
template <class T, std::size_t N, class VectorType>
template <class M, class>
Vc_INTRINSIC SimdMaskArray<T, N, VectorType, N>::SimdMaskArray(M k)
: data(simd_cast<mask_type>(k))
{
}
} // namespace Vc_VERSIONED_NAMESPACE
#endif // VC_COMMON_SIMD_CAST_CALLER_TCC_
// vim: foldmethod=marker
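A sketch of what the converting constructors above enable, assuming a Vc installation: masks with the same entry count but different value types interconvert, with the constructor picking the simd_cast overload that matches how many native mask registers back each side. The width 8 and the function name are arbitrary.
#include <Vc/Vc>

Vc::SimdMaskArray<float, 8> to_float_mask(const Vc::SimdMaskArray<int, 8> &k)
{
    return Vc::SimdMaskArray<float, 8>(k);
}

int main()
{
    const Vc::SimdArray<int, 8> a([](int n) { return n; });
    const auto k = a > 3; // mask with 4 active entries
    return to_float_mask(k).count() == 4 ? 0 : 1;
}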

2778
Vc/common/simdarray.h Normal file

File diff suppressed because it is too large

210
Vc/common/simdarrayfwd.h Normal file
View File

@ -0,0 +1,210 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMDARRAYFWD_H_
#define VC_COMMON_SIMDARRAYFWD_H_
#include "../scalar/types.h"
#include "../sse/types.h"
#include "../avx/types.h"
#include "utility.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
// specialization of Vector for fixed_size<N> {{{
template <class T, int N>
class Vector<T, simd_abi::fixed_size<N>> : public SimdArray<T, N>
{
using SimdArray<T, N>::SimdArray;
public:
// overload copy to force argument passing via the stack. This makes the type more
// usable on ABI boundaries
Vc_INTRINSIC Vector(const Vector &x) : SimdArray<T, N>(x) {}
Vc_INTRINSIC Vector &operator=(const Vector &x)
{
SimdArray<T, N>::operator=(x);
return *this;
}
Vector() = default;
using abi_type = simd_abi::fixed_size<N>;
using abi = abi_type;
Vc_DEPRECATED("use Vector([](int n) { return n; }) instead of "
"Vector::IndexesFromZero()") static Vector IndexesFromZero()
{
return Vector([](size_t i) -> T { return i; });
}
Vc_DEPRECATED("use 0 instead of Vector::Zero()") static Vector Zero() { return 0; }
Vc_DEPRECATED("use 1 instead of Vector::One()") static Vector One() { return 1; }
};
template <class T, int N>
class Mask<T, simd_abi::fixed_size<N>> : public SimdMaskArray<T, N>
{
using SimdMaskArray<T, N>::SimdMaskArray;
public:
// overload copy to force argument passing via the stack. This makes the type more
// usable on ABI boundaries
Vc_INTRINSIC Mask(const Mask &x) : SimdMaskArray<T, N>(x) {}
Vc_INTRINSIC Mask &operator=(const Mask &x)
{
SimdMaskArray<T, N>::operator=(x);
return *this;
}
Mask() = default;
using abi_type = simd_abi::fixed_size<N>;
using abi = abi_type;
};
// }}}
/** \internal
* Simple traits for SimdArray to easily access internal types of non-atomic SimdArray
* types.
*/
template <typename T, std::size_t N> struct SimdArrayTraits {
static constexpr std::size_t N0 = Common::left_size<N>();
static constexpr std::size_t N1 = Common::right_size<N>();
using storage_type0 = fixed_size_simd<T, N0>;
using storage_type1 = fixed_size_simd<T, N1>;
};
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type0 &internal_data0(
const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename VectorType, std::size_t VectorSize>
Vc_INTRINSIC_L const typename SimdArrayTraits<T, N>::storage_type1 &internal_data1(
const SimdArray<T, N, VectorType, VectorSize> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L V &internal_data(SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
template <typename T, std::size_t N, typename V>
Vc_INTRINSIC_L const V &internal_data(const SimdArray<T, N, V, N> &x) Vc_INTRINSIC_R;
namespace Traits
{
// is_fixed_size_simd {{{1
template <class T> struct is_fixed_size_simd : std::false_type {
};
template <class T, int N>
struct is_fixed_size_simd<fixed_size_simd<T, N>> : std::true_type {
};
template <class T, int N>
struct is_fixed_size_simd<fixed_size_simd_mask<T, N>> : std::true_type {
};
// is_simd_vector_internal {{{1
template <class T, int N>
struct is_simd_vector_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {};
// is_simd_mask_internal {{{1
template <class T, int N>
struct is_simd_mask_internal<fixed_size_simd_mask<T, N>> : is_valid_vector_argument<T> {};
// is_atomic_simdarray_internal {{{1
template <typename T, std::size_t N, typename V>
struct is_atomic_simdarray_internal<SimdArray<T, N, V, N>> : is_valid_vector_argument<T> {};
template <typename T, int N>
struct is_atomic_simdarray_internal<fixed_size_simd<T, N>>
: is_atomic_simdarray_internal<SimdArray<T, N>> {
};
// is_atomic_simd_mask_array_internal {{{1
template <typename T, std::size_t N, typename V>
struct is_atomic_simd_mask_array_internal<SimdMaskArray<T, N, V, N>>
: is_valid_vector_argument<T> {
};
template <typename T, int N>
struct is_atomic_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
: is_atomic_simd_mask_array_internal<SimdMaskArray<T, N>> {
};
// is_simdarray_internal {{{1
template <typename T, std::size_t N, typename VectorType, std::size_t M>
struct is_simdarray_internal<SimdArray<T, N, VectorType, M>>
: is_valid_vector_argument<T> {
};
template <typename T, int N>
struct is_simdarray_internal<fixed_size_simd<T, N>> : is_valid_vector_argument<T> {
};
// is_simd_mask_array_internal {{{1
template <typename T, std::size_t N, typename VectorType, std::size_t M>
struct is_simd_mask_array_internal<SimdMaskArray<T, N, VectorType, M>>
: is_valid_vector_argument<T> {
};
template <typename T, int N>
struct is_simd_mask_array_internal<fixed_size_simd_mask<T, N>>
: is_valid_vector_argument<T> {
};
// is_integral_internal {{{1
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_integral_internal<SimdArray<T, N, V, M>, false> : std::is_integral<T> {
};
// is_floating_point_internal {{{1
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_floating_point_internal<SimdArray<T, N, V, M>, false>
: std::is_floating_point<T> {
};
// is_signed_internal {{{1
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_signed_internal<SimdArray<T, N, V, M>, false> : std::is_signed<T> {
};
// is_unsigned_internal {{{1
template <typename T, std::size_t N, typename V, std::size_t M>
struct is_unsigned_internal<SimdArray<T, N, V, M>, false> : std::is_unsigned<T> {
};
// has_no_allocated_data_impl {{{1
template <typename T, std::size_t N>
struct has_no_allocated_data_impl<Vc::SimdArray<T, N>> : std::true_type {
};
// }}}1
} // namespace Traits
} // namespace Vc
#endif // VC_COMMON_SIMDARRAYFWD_H_
// vim: foldmethod=marker
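A small sketch of the fixed_size specialization declared above, assuming a Vc installation; f8 and iota are made-up names, and the generator constructor is the replacement the deprecation message suggests for IndexesFromZero().
#include <Vc/Vc>

using f8 = Vc::Vector<float, Vc::simd_abi::fixed_size<8>>;
static_assert(f8::size() == 8, "fixed_size<8> always has exactly 8 entries");

f8 iota()
{
    // generator constructor instead of the deprecated IndexesFromZero()
    return f8([](int n) { return float(n); });
}

int main() { return iota()[7] == 7.f ? 0 : 1; }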

593
Vc/common/simdarrayhelper.h Normal file
View File

@ -0,0 +1,593 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMDARRAYHELPER_H_
#define VC_COMMON_SIMDARRAYHELPER_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
// private_init {{{
namespace
{
static constexpr struct private_init_t {} private_init = {};
} // unnamed namespace
// }}}
namespace Common
{
/// \addtogroup SimdArray
/// @{
namespace Operations/*{{{*/
{
struct tag {};
#define Vc_DEFINE_OPERATION(name_) \
struct name_ : public tag { \
template <typename V, typename... Args> \
Vc_INTRINSIC void operator()(V &v, Args &&... args) \
{ \
v.name_(std::forward<Args>(args)...); \
} \
}
Vc_DEFINE_OPERATION(gather);
Vc_DEFINE_OPERATION(scatter);
Vc_DEFINE_OPERATION(load);
Vc_DEFINE_OPERATION(store);
Vc_DEFINE_OPERATION(setZero);
Vc_DEFINE_OPERATION(setZeroInverted);
Vc_DEFINE_OPERATION(assign);
#undef Vc_DEFINE_OPERATION
#define Vc_DEFINE_OPERATION(name_, code_) \
struct name_ : public tag { \
template <typename V> Vc_INTRINSIC void operator()(V &v) { code_; } \
}
Vc_DEFINE_OPERATION(increment, ++(v));
Vc_DEFINE_OPERATION(decrement, --(v));
Vc_DEFINE_OPERATION(random, v = V::Random());
#undef Vc_DEFINE_OPERATION
#define Vc_DEFINE_OPERATION_FORWARD(name_) \
struct Forward_##name_ : public tag \
{ \
template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
Vc_INTRINSIC void operator()(decltype(name_(std::declval<Args>()...)) &v, \
Args &&... args) \
{ \
v = name_(std::forward<Args>(args)...); \
} \
template <typename... Args, typename = decltype(name_(std::declval<Args>()...))> \
Vc_INTRINSIC void operator()(std::nullptr_t, Args && ... args) \
{ \
name_(std::forward<Args>(args)...); \
} \
}
Vc_DEFINE_OPERATION_FORWARD(abs);
Vc_DEFINE_OPERATION_FORWARD(asin);
Vc_DEFINE_OPERATION_FORWARD(atan);
Vc_DEFINE_OPERATION_FORWARD(atan2);
Vc_DEFINE_OPERATION_FORWARD(cos);
Vc_DEFINE_OPERATION_FORWARD(ceil);
Vc_DEFINE_OPERATION_FORWARD(copysign);
Vc_DEFINE_OPERATION_FORWARD(exp);
Vc_DEFINE_OPERATION_FORWARD(exponent);
Vc_DEFINE_OPERATION_FORWARD(fma);
Vc_DEFINE_OPERATION_FORWARD(floor);
Vc_DEFINE_OPERATION_FORWARD(frexp);
Vc_DEFINE_OPERATION_FORWARD(isfinite);
Vc_DEFINE_OPERATION_FORWARD(isinf);
Vc_DEFINE_OPERATION_FORWARD(isnan);
Vc_DEFINE_OPERATION_FORWARD(isnegative);
Vc_DEFINE_OPERATION_FORWARD(ldexp);
Vc_DEFINE_OPERATION_FORWARD(log);
Vc_DEFINE_OPERATION_FORWARD(log10);
Vc_DEFINE_OPERATION_FORWARD(log2);
Vc_DEFINE_OPERATION_FORWARD(reciprocal);
Vc_DEFINE_OPERATION_FORWARD(round);
Vc_DEFINE_OPERATION_FORWARD(rsqrt);
Vc_DEFINE_OPERATION_FORWARD(sin);
Vc_DEFINE_OPERATION_FORWARD(sincos);
Vc_DEFINE_OPERATION_FORWARD(sqrt);
Vc_DEFINE_OPERATION_FORWARD(trunc);
Vc_DEFINE_OPERATION_FORWARD(min);
Vc_DEFINE_OPERATION_FORWARD(max);
#undef Vc_DEFINE_OPERATION_FORWARD
template<typename T> using is_operation = std::is_base_of<tag, T>;
} // namespace Operations }}}
/**
* \internal
* Helper type to statically communicate segmentation of one vector register into 2^n parts
* (Pieces).
*
* Forward declaration in common/types.h.
*/
template <typename T_, std::size_t Pieces_, std::size_t Index_> struct Segment/*{{{*/
{
static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
using type = T_;
using type_decayed = typename std::decay<type>::type;
static constexpr std::size_t Pieces = Pieces_;
static constexpr std::size_t Index = Index_;
using fixed_size_type =
fixed_size_simd<conditional_t<Traits::is_simd_vector<type_decayed>::value,
typename type_decayed::EntryType, float>,
type_decayed::Size / Pieces>;
type data;
static constexpr std::size_t EntryOffset = Index * type_decayed::Size / Pieces;
// no non-const operator[] needed
decltype(std::declval<const type &>()[0]) operator[](size_t i) const { return data[i + EntryOffset]; }
fixed_size_type to_fixed_size() const
{
return simd_cast<fixed_size_type, Index>(data);
}
};/*}}}*/
//Segment<T *, ...> specialization {{{
template <typename T_, std::size_t Pieces_, std::size_t Index_>
struct Segment<T_ *, Pieces_, Index_> {
static_assert(Index_ < Pieces_, "You found a bug in Vc. Please report.");
using type = T_ *;
using type_decayed = typename std::decay<T_>::type;
static constexpr size_t Pieces = Pieces_;
static constexpr size_t Index = Index_;
using fixed_size_type = fixed_size_simd<
typename std::conditional<Traits::is_simd_vector<type_decayed>::value,
typename type_decayed::VectorEntryType, float>::type,
type_decayed::Size / Pieces> *;
type data;
static constexpr std::size_t EntryOffset = Index * type_decayed::size() / Pieces;
fixed_size_type to_fixed_size() const
{
return reinterpret_cast<
#ifdef Vc_GCC
// GCC might ICE if this type is declared with may_alias. If it doesn't
// ICE it warns about ignoring the attribute.
typename std::remove_pointer<fixed_size_type>::type
#else
MayAlias<typename std::remove_pointer<fixed_size_type>::type>
#endif
*>(data) +
Index;
}
//decltype(std::declval<type>()[0]) operator[](size_t i) { return data[i + EntryOffset]; }
//decltype(std::declval<type>()[0]) operator[](size_t i) const { return data[i + EntryOffset]; }
};/*}}}*/
/** \internal
Template class that is used to attach an offset value to an existing type. It is used
for IndexesFromZero construction in SimdArray. The \c data1 constructor needs to know
that the IndexesFromZero constructor requires an offset so that the whole data is
constructed as a correct sequence from `0` to `Size - 1`.
\tparam T The original type that needs the offset attached.
\tparam Offset An integral value that determines the offset in the complete SimdArray.
*/
template <typename T, std::size_t Offset> struct AddOffset
{
constexpr AddOffset() = default;
};
// class Split {{{1
/** \internal
Helper type with static functions to generically adjust arguments for the \c data0 and
\c data1 members of SimdArray and SimdMaskArray.
\tparam secondOffset The offset in number of elements that \c data1 has in the SimdArray
/ SimdMaskArray. This is essentially equal to the number of
elements in \c data0.
*/
template <std::size_t secondOffset> class Split
{
// split composite SimdArray
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto loImpl(const SimdArray<U, N, V, M> &x)
-> decltype(internal_data0(x))
{
return internal_data0(x);
}
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto hiImpl(const SimdArray<U, N, V, M> &x)
-> decltype(internal_data1(x))
{
return internal_data1(x);
}
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto loImpl(SimdArray<U, N, V, M> *x)
-> decltype(&internal_data0(*x))
{
return &internal_data0(*x);
}
template <typename U, std::size_t N, typename V, std::size_t M,
typename = enable_if<N != M>>
static Vc_INTRINSIC auto hiImpl(SimdArray<U, N, V, M> *x)
-> decltype(&internal_data1(*x))
{
return &internal_data1(*x);
}
// split atomic SimdArray
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V, 2, 0> loImpl(const SimdArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(const SimdArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V *, 2, 0> loImpl(SimdArray<U, N, V, N> *x)
{
return {&internal_data(*x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<V *, 2, 1> hiImpl(SimdArray<U, N, V, N> *x)
{
return {&internal_data(*x)};
}
// split composite SimdMaskArray
template <typename U, std::size_t N, typename V, std::size_t M>
static Vc_INTRINSIC auto loImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data0(x))
{
return internal_data0(x);
}
template <typename U, std::size_t N, typename V, std::size_t M>
static Vc_INTRINSIC auto hiImpl(const SimdMaskArray<U, N, V, M> &x) -> decltype(internal_data1(x))
{
return internal_data1(x);
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 0> loImpl(
const SimdMaskArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
template <typename U, std::size_t N, typename V>
static Vc_INTRINSIC Segment<typename SimdMaskArray<U, N, V, N>::mask_type, 2, 1> hiImpl(
const SimdMaskArray<U, N, V, N> &x)
{
return {internal_data(x)};
}
// split Vector<T> and Mask<T>
#ifdef Vc_IMPL_AVX
template <class T>
static Vc_INTRINSIC SSE::Vector<T> loImpl(Vector<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Vector<T>, 0>(x);
}
template <class T>
static Vc_INTRINSIC SSE::Vector<T> hiImpl(Vector<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Vector<T>, 1>(x);
}
template <class T>
static Vc_INTRINSIC SSE::Mask<T> loImpl(Mask<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Mask<T>, 0>(x);
}
template <class T>
static Vc_INTRINSIC SSE::Mask<T> hiImpl(Mask<T, VectorAbi::Avx> &&x)
{
return simd_cast<SSE::Mask<T>, 1>(x);
}
#endif // Vc_IMPL_AVX
template <typename T>
static constexpr bool is_vector_or_mask(){
return (Traits::is_simd_vector<T>::value && !Traits::isSimdArray<T>::value) ||
(Traits::is_simd_mask<T>::value && !Traits::isSimdMaskArray<T>::value);
}
template <typename V>
static Vc_INTRINSIC Segment<V, 2, 0> loImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
{
return {std::forward<V>(x)};
}
template <typename V>
static Vc_INTRINSIC Segment<V, 2, 1> hiImpl(V &&x, enable_if<is_vector_or_mask<V>()> = nullarg)
{
return {std::forward<V>(x)};
}
// split std::vector<T>
template <class T, class A>
static Vc_INTRINSIC const T *loImpl(const std::vector<T, A> &x)
{
return x.data();
}
template <class T, class A>
static Vc_INTRINSIC const T *hiImpl(const std::vector<T, A> &x)
{
return x.data() + secondOffset;
}
// generically split Segments
template <typename V, std::size_t Pieces, std::size_t Index>
static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index> loImpl(
const Segment<V, Pieces, Index> &x)
{
return {x.data};
}
template <typename V, std::size_t Pieces, std::size_t Index>
static Vc_INTRINSIC Segment<V, 2 * Pieces, 2 * Index + 1> hiImpl(
const Segment<V, Pieces, Index> &x)
{
return {x.data};
}
/** \internal
* \name Checks for existence of \c loImpl / \c hiImpl
*/
//@{
template <typename T, typename = decltype(loImpl(std::declval<T>()))>
static std::true_type have_lo_impl(int);
template <typename T> static std::false_type have_lo_impl(float);
template <typename T> static constexpr bool have_lo_impl()
{
return decltype(have_lo_impl<T>(1))::value;
}
template <typename T, typename = decltype(hiImpl(std::declval<T>()))>
static std::true_type have_hi_impl(int);
template <typename T> static std::false_type have_hi_impl(float);
template <typename T> static constexpr bool have_hi_impl()
{
return decltype(have_hi_impl<T>(1))::value;
}
//@}
public:
/** \internal
* \name with Operations tag
*
* These functions don't overload on the data parameter. The first parameter (the tag) clearly
* identifies the intended function.
*/
//@{
template <typename U>
static Vc_INTRINSIC const U *lo(Operations::gather, const U *ptr)
{
return ptr;
}
template <typename U>
static Vc_INTRINSIC const U *hi(Operations::gather, const U *ptr)
{
return ptr + secondOffset;
}
template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>()))
lo(Operations::gather, U &&x)
{
return loImpl(std::forward<U>(x));
}
template <typename U, typename = enable_if<!std::is_pointer<U>::value>>
static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>()))
hi(Operations::gather, U &&x)
{
return hiImpl(std::forward<U>(x));
}
template <typename U>
static Vc_INTRINSIC const U *lo(Operations::scatter, const U *ptr)
{
return ptr;
}
template <typename U>
static Vc_INTRINSIC const U *hi(Operations::scatter, const U *ptr)
{
return ptr + secondOffset;
}
//@}
/** \internal
\name without Operations tag
These functions are not clearly tagged as to where they are used and therefore
behave differently depending on the type of the parameter. Different behavior is
implemented via overloads of \c loImpl and \c hiImpl. They are not overloads of \c
lo and \c hi directly because it's hard to compete against a universal reference
(i.e. an overload for `int` requires overloads for `int &`, `const int &`, and `int
&&`. If one of them were missing `U &&` would win in overload resolution).
*/
//@{
template <typename U>
static Vc_ALWAYS_INLINE decltype(loImpl(std::declval<U>())) lo(U &&x)
{
return loImpl(std::forward<U>(x));
}
template <typename U>
static Vc_ALWAYS_INLINE decltype(hiImpl(std::declval<U>())) hi(U &&x)
{
return hiImpl(std::forward<U>(x));
}
template <typename U>
static Vc_ALWAYS_INLINE enable_if<!have_lo_impl<U>(), U> lo(U &&x)
{
return std::forward<U>(x);
}
template <typename U>
static Vc_ALWAYS_INLINE enable_if<!have_hi_impl<U>(), U> hi(U &&x)
{
return std::forward<U>(x);
}
//@}
};
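// Illustrative sketch (not part of the library): the generic SimdArray/SimdMaskArray
// code below uses Split roughly like this to divide an object into its two halves,
// e.g. an 8-wide mask stored as two 4-wide pieces:
//   using Split = Common::Split<4>;  // N0 == 4, i.e. the offset of the second half
//   data0 = Split::lo(rhs);          // lower half (elements 0..3)
//   data1 = Split::hi(rhs);          // upper half (elements 4..7)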
// actual_value {{{1
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC const V &actual_value(Op, const SimdArray<U, M, V, M> &x)
{
return internal_data(x);
}
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC V *actual_value(Op, SimdArray<U, M, V, M> *x)
{
return &internal_data(*x);
}
template <typename Op, typename T, size_t Pieces, size_t Index>
static Vc_INTRINSIC typename Segment<T, Pieces, Index>::fixed_size_type actual_value(
Op, Segment<T, Pieces, Index> &&seg)
{
return seg.to_fixed_size();
}
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC const typename V::Mask &actual_value(Op, const SimdMaskArray<U, M, V, M> &x)
{
return internal_data(x);
}
template <typename Op, typename U, std::size_t M, typename V>
static Vc_INTRINSIC typename V::Mask *actual_value(Op, SimdMaskArray<U, M, V, M> *x)
{
return &internal_data(*x);
}
// unpackArgumentsAuto {{{1
/**\internal
* \name unpackArgumentsAuto
*
* Search for the right amount of SimdArray "unpacking" (via actual_value) to match the
* interface of the function to be called.
*
* The compiler can figure this out for us thanks to SFINAE. The approach is to have a
* number \c I that determines the indexes of the arguments to be transformed via
* actual_value. Each bit of \c I identifies an argument. unpackArgumentsAuto starts the
* recursion with `I = 0`, i.e. no actual_value transformations. If the overload calling
* \c op is unavailable due to a substitution failure \c I is incremented and the function
* recurses. Otherwise there are two unpackArgumentsAutoImpl functions in the overload
* set. The first argument (\c int / \c float) leads to a preference of the function
* calling \c op, thus ending the recursion.
*/
///@{
///\internal transforms \p arg via actual_value
template <typename Op, typename Arg>
Vc_INTRINSIC decltype(actual_value(std::declval<Op &>(), std::declval<Arg>()))
conditionalUnpack(std::true_type, Op op, Arg &&arg)
{
return actual_value(op, std::forward<Arg>(arg));
}
///\internal forwards \p arg to its return value
template <typename Op, typename Arg>
Vc_INTRINSIC Arg conditionalUnpack(std::false_type, Op, Arg &&arg)
{
return std::forward<Arg>(arg);
}
///\internal true-/false_type that selects whether the argument with index B should be unpacked
template <size_t A, size_t B>
struct selectorType : public std::integral_constant<bool, !((A & (size_t(1) << B)) != 0)> {
};
///\internal ends the recursion, transforms arguments, and calls \p op
template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
Vc_INTRINSIC decltype(std::declval<Op &>()(std::declval<R &>(),
conditionalUnpack(selectorType<I, Indexes>(),
std::declval<Op &>(),
std::declval<Args>())...))
unpackArgumentsAutoImpl(int, index_sequence<Indexes...>, Op op, R &&r, Args &&... args)
{
op(std::forward<R>(r),
conditionalUnpack(selectorType<I, Indexes>(), op, std::forward<Args>(args))...);
}
///\internal the current actual_value calls don't work: recurse to I + 1
template <size_t I, typename Op, typename R, typename... Args, size_t... Indexes>
Vc_INTRINSIC enable_if<(I <= (size_t(1) << sizeof...(Args))), void> unpackArgumentsAutoImpl(
float, index_sequence<Indexes...> is, Op op, R &&r, Args &&... args)
{
// if R is nullptr_t then the return type cannot enforce that actually any unwrapping
// of the SimdArray types happens. Thus, you could get an endless loop of the
// SimdArray function overload calling itself, if the index goes up to (1 <<
// sizeof...(Args)) - 1 (which means no argument transformations via actual_value).
static_assert(
I < (1 << sizeof...(Args)) - (std::is_same<R, std::nullptr_t>::value ? 1 : 0),
"Vc or compiler bug. Please report. Failed to find a combination of "
"actual_value(arg) transformations that allows calling Op.");
unpackArgumentsAutoImpl<I + 1, Op, R, Args...>(int(), is, op, std::forward<R>(r),
std::forward<Args>(args)...);
}
#ifdef Vc_ICC
template <size_t, typename... Ts> struct IccWorkaround {
using type = void;
};
template <typename... Ts> struct IccWorkaround<2, Ts...> {
using type = typename std::remove_pointer<typename std::decay<
typename std::tuple_element<1, std::tuple<Ts...>>::type>::type>::type;
};
#endif
///\internal The interface to start the machinery.
template <typename Op, typename R, typename... Args>
Vc_INTRINSIC void unpackArgumentsAuto(Op op, R &&r, Args &&... args)
{
#ifdef Vc_ICC
// ugly hacky workaround for ICC:
// The compiler fails to do SFINAE right on recursion. We have to hit the right
// recursionStart number from the start.
const int recursionStart =
Traits::isSimdArray<
typename IccWorkaround<sizeof...(Args), Args...>::type>::value &&
(std::is_same<Op, Common::Operations::Forward_frexp>::value ||
std::is_same<Op, Common::Operations::Forward_ldexp>::value)
? 2
: 0;
#else
const int recursionStart = 0;
#endif
unpackArgumentsAutoImpl<recursionStart>(
int(), make_index_sequence<sizeof...(Args)>(), op, std::forward<R>(r),
std::forward<Args>(args)...);
}
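// Summary (illustrative, not normative): for a call like op(r, a, b) the recursion
// tries I = 0, 1, 2, ... in turn; each bit of I toggles whether the corresponding
// argument is passed through actual_value. The first I for which the resulting call to
// op is well-formed selects the int overload above and ends the recursion; until then
// the float overload increments I and recurses.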
///@}
//}}}1
///@}
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_SIMDARRAYHELPER_H_
// vim: foldmethod=marker

1956
Vc/common/simdize.h Normal file

File diff suppressed because it is too large

719
Vc/common/simdmaskarray.h Normal file

@ -0,0 +1,719 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SIMDMASKARRAY_H_
#define VC_COMMON_SIMDMASKARRAY_H_
#include <type_traits>
#include <array>
#include "simdarrayhelper.h"
#include "utility.h"
#include "maskbool.h"
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
/// \addtogroup SimdArray
/// @{
// atomic SimdMaskArray {{{1
/**\internal
* Specialization of `SimdMaskArray<T, N, VectorType, VectorSize>` for the case where `N
* == VectorSize`.
*
* This is specialized for implementation purposes: Since the general implementation uses
* two SimdMaskArray data members it recurses over different SimdMaskArray instantiations.
* The recursion is ended by this specialization, which has a single \p storage_type data
* member to which all functions are forwarded more or less directly.
*/
template <typename T, std::size_t N, typename VectorType_>
class SimdMaskArray<T, N, VectorType_, N>
{
public:
using VectorType = VectorType_;
using vector_type = VectorType;
using mask_type = typename vector_type::Mask;
using storage_type = mask_type;
friend storage_type &internal_data(SimdMaskArray &m) { return m.data; }
friend const storage_type &internal_data(const SimdMaskArray &m) { return m.data; }
static constexpr std::size_t size() { return N; }
static constexpr std::size_t Size = size();
static constexpr std::size_t MemoryAlignment = storage_type::MemoryAlignment;
static_assert(Size == vector_type::Size, "size mismatch");
using vectorentry_type = typename mask_type::VectorEntryType;
using value_type = typename mask_type::EntryType;
using Mask = mask_type;
using VectorEntryType = vectorentry_type;
using EntryType = value_type;
using EntryReference = Vc::Detail::ElementReference<storage_type, SimdMaskArray>;
using reference = EntryReference;
using Vector = fixed_size_simd<T, N>;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
// zero init
SimdMaskArray() = default;
// default copy ctor/operator
SimdMaskArray(const SimdMaskArray &) = default;
SimdMaskArray(SimdMaskArray &&) = default;
SimdMaskArray &operator=(const SimdMaskArray &) = default;
SimdMaskArray &operator=(SimdMaskArray &&) = default;
// broadcasts
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one) : data(one) {}
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero) : data(zero) {}
Vc_INTRINSIC explicit SimdMaskArray(bool b) : data(b) {}
Vc_INTRINSIC static SimdMaskArray Zero() { return {private_init, storage_type::Zero()}; }
Vc_INTRINSIC static SimdMaskArray One() { return {private_init, storage_type::One()}; }
// conversion (casts); implemented in simd_cast_caller.tcc
template <class U, class V, class = enable_if<N == V::Size>>
Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
template <class U, class V, class = enable_if<(N > V::Size && N <= 2 * V::Size)>,
class = U>
Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
template <class U, class V, class = enable_if<(N > 2 * V::Size && N <= 4 * V::Size)>,
class = U, class = U>
Vc_INTRINSIC_L SimdMaskArray(const SimdMaskArray<U, N, V> &x) Vc_INTRINSIC_R;
// conversion from any Segment object (could be SimdMaskArray or Mask<T>)
template <typename M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC_L SimdMaskArray(
Common::Segment<M, Pieces, Index> &&x,
enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg) Vc_INTRINSIC_R;
// conversion from Mask<T>
template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
!Traits::isSimdMaskArray<M>::value &&
Traits::simd_vector_size<M>::value == Size)>>
Vc_INTRINSIC_L SimdMaskArray(M k) Vc_INTRINSIC_R;
// implicit conversion to Mask<U, AnyAbi> if Mask<U, AnyAbi>::size() == N
template <class U, class A,
class = enable_if<Vc::Mask<U, A>::Size == N &&
!detail::is_fixed_size_abi<A>::value>>
operator Vc::Mask<U, A>() const
{
return simd_cast<Vc::Mask<U, A>>(data);
}
operator fixed_size_simd_mask<T, N> &()
{
return static_cast<fixed_size_simd_mask<T, N> &>(*this);
}
operator const fixed_size_simd_mask<T, N> &() const
{
return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
}
// load/store (from/to bool arrays)
template <typename Flags = DefaultLoadTag>
Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
: data(mem, f)
{
}
Vc_INTRINSIC void load(const bool *mem) { data.load(mem); }
template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
{
data.load(mem, f);
}
Vc_INTRINSIC void store(bool *mem) const { data.store(mem); }
template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
{
data.store(mem, f);
}
// compares
Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &rhs) const
{
return data == rhs.data;
}
Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &rhs) const
{
return data != rhs.data;
}
// inversion
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
{
return {private_init, !data};
}
// binary operators
Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
{
data &= rhs.data;
return *this;
}
Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
{
data |= rhs.data;
return *this;
}
Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
{
data ^= rhs.data;
return *this;
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
const SimdMaskArray &rhs) const
{
return {private_init, data & rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
const SimdMaskArray &rhs) const
{
return {private_init, data | rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
const SimdMaskArray &rhs) const
{
return {private_init, data ^ rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
const SimdMaskArray &rhs) const
{
return {private_init, data && rhs.data};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
const SimdMaskArray &rhs) const
{
return {private_init, data || rhs.data};
}
Vc_INTRINSIC Vc_PURE bool isFull() const { return data.isFull(); }
Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data.isNotEmpty(); }
Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data.isEmpty(); }
Vc_INTRINSIC Vc_PURE bool isMix() const { return data.isMix(); }
Vc_INTRINSIC Vc_PURE int shiftMask() const { return data.shiftMask(); }
Vc_INTRINSIC Vc_PURE int toInt() const { return data.toInt(); }
private:
friend reference;
static Vc_INTRINSIC value_type get(const storage_type &k, int i) noexcept
{
return k[i];
}
template <typename U>
static Vc_INTRINSIC void set(storage_type &k, int i, U &&v) noexcept(
noexcept(std::declval<storage_type &>()[0] = std::declval<U>()))
{
k[i] = std::forward<U>(v);
}
public:
/**
* \note the returned object models the concept of a reference and
* as such it can exist longer than the data it is referencing.
 * \note to avoid lifetime issues, we strongly advise not to store
* any reference objects.
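 *
 * A minimal sketch of the pitfall (illustrative; \c mask stands for any mask object):
 * \code
 * auto ref = mask[0];  // ref is a smart-reference object, not a plain bool
 * // if mask is destroyed here, any further use of ref is undefined behaviour
 * \endcode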
*/
Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
{
return {data, int(index)};
}
Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
{
return data[index];
}
Vc_INTRINSIC Vc_PURE int count() const { return data.count(); }
/**
* Returns the index of the first one in the mask.
*
* The return value is undefined if the mask is empty.
*/
Vc_INTRINSIC Vc_PURE int firstOne() const { return data.firstOne(); }
template <typename G>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
{
return {private_init, mask_type::generate(gen)};
}
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
{
return {private_init, data.shifted(amount)};
}
/// \internal execute specified Operation
template <typename Op, typename... Args>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
{
fixed_size_simd_mask<T, N> r;
Common::unpackArgumentsAuto(op, r.data, std::forward<Args>(args)...);
return r;
}
/// \internal
Vc_INTRINSIC SimdMaskArray(private_init_t, mask_type &&x) : data(std::move(x)) {}
private:
// The alignas attribute attached to the class declaration above is ignored by ICC
// 17.0.0 (at least). So just move the alignas attribute down here where it works for
// all compilers.
alignas(static_cast<std::size_t>(
Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(VectorType_) /
VectorType_::size()>::value)) storage_type data;
};
template <typename T, std::size_t N, typename VectorType>
constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::Size;
template <typename T, std::size_t N, typename VectorType>
constexpr std::size_t SimdMaskArray<T, N, VectorType, N>::MemoryAlignment;
// generic SimdMaskArray {{{1
/**
* Data-parallel mask type with user-defined number of boolean elements.
*
* \tparam T The value type of the corresponding SimdArray. Depending on the target
* platform this type determines a different bit representation to work most
 * efficiently with SimdArray types instantiated for \p T.
*
* \tparam N The number of boolean elements to store and process concurrently. You can
* choose an arbitrary number, though not every number is a good idea.
* Generally, a power of two value or the sum of two power of two values might
* work efficiently, though this depends a lot on the target system.
*
* \tparam V Don't change the default value unless you really know what you are doing.
* This type is set to the underlying native Vc::Vector type used in the
* implementation of the type.
* Having it as part of the type name guards against some cases of ODR
* violations (i.e. linking incompatible translation units / libraries).
*
* \tparam Wt Don't ever change the default value.
* This parameter is an unfortunate implementation detail shining through.
*
* \headerfile simdmaskarray.h <Vc/SimdArray>
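 *
 * A minimal usage sketch (illustrative; the full interface follows below):
 * \code
 * Vc::SimdMaskArray<float, 8> m(false);  // 8 boolean elements, all false
 * m[3] = true;
 * int n = m.count();                     // n == 1
 * bool any = m.isNotEmpty();             // any == true
 * \endcode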
*/
template <typename T, size_t N, typename V, size_t Wt>
class SimdMaskArray
{
static constexpr std::size_t N0 = Common::left_size<N>();
using Split = Common::Split<N0>;
public:
using storage_type0 = fixed_size_simd_mask<T, N0>;
using storage_type1 = fixed_size_simd_mask<T, N - N0>;
static_assert(storage_type0::size() == N0, "");
using vector_type = fixed_size_simd<T, N>;
friend storage_type0 &internal_data0(SimdMaskArray &m) { return m.data0; }
friend storage_type1 &internal_data1(SimdMaskArray &m) { return m.data1; }
friend const storage_type0 &internal_data0(const SimdMaskArray &m) { return m.data0; }
friend const storage_type1 &internal_data1(const SimdMaskArray &m) { return m.data1; }
using mask_type = SimdMaskArray;
///\copydoc Mask::size()
static constexpr std::size_t size() { return N; }
///\copydoc Mask::Size
static constexpr std::size_t Size = size();
///\copydoc Mask::MemoryAlignment
static constexpr std::size_t MemoryAlignment =
storage_type0::MemoryAlignment > storage_type1::MemoryAlignment
? storage_type0::MemoryAlignment
: storage_type1::MemoryAlignment;
static_assert(Size == vector_type::Size, "size mismatch");
///\internal
using vectorentry_type = typename storage_type0::VectorEntryType;
///\copydoc Mask::value_type
using value_type = typename storage_type0::EntryType;
///\copydoc Mask::Mask
using MaskType = mask_type;
///\copydoc Mask::VectorEntryType
using VectorEntryType = vectorentry_type;
///\copydoc Mask::EntryType
using EntryType = value_type;
///\copydoc Mask::EntryReference
using EntryReference = Vc::Detail::ElementReference<SimdMaskArray>;
using reference = EntryReference;
/// An alias for the corresponding SimdArray type.
using Vector = fixed_size_simd<T, N>;
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(mask_type));
// zero init
///\copydoc Mask::Mask()
SimdMaskArray() = default;
// default copy ctor/operator
SimdMaskArray(const SimdMaskArray &) = default;
SimdMaskArray(SimdMaskArray &&) = default;
SimdMaskArray &operator=(const SimdMaskArray &) = default;
SimdMaskArray &operator=(SimdMaskArray &&) = default;
// implicit conversion from SimdMaskArray with same N
template <typename U, typename W>
Vc_INTRINSIC SimdMaskArray(const SimdMaskArray<U, N, W> &rhs)
: data0(Split::lo(rhs)), data1(Split::hi(rhs))
{
}
// conversion from any Segment object (could be SimdMaskArray or Mask<T>)
template <typename M, std::size_t Pieces, std::size_t Index>
Vc_INTRINSIC SimdMaskArray(
Common::Segment<M, Pieces, Index> &&rhs,
enable_if<Traits::simd_vector_size<M>::value == Size * Pieces> = nullarg)
: data0(Split::lo(rhs)), data1(Split::hi(rhs))
{
}
// conversion from Mask<T>
template <class M, class = enable_if<(Traits::is_simd_mask<M>::value &&
!Traits::isSimdMaskArray<M>::value &&
Traits::simd_vector_size<M>::value == Size)>>
Vc_INTRINSIC SimdMaskArray(M k) : data0(Split::lo(k)), data1(Split::hi(k))
{
}
// implicit conversion to Mask<U, AnyAbi> if Mask<U, AnyAbi>::size() == N
template <class U, class A,
class = enable_if<Vc::Mask<U, A>::Size == N &&
!detail::is_fixed_size_abi<A>::value>>
operator Vc::Mask<U, A>() const
{
return simd_cast<Vc::Mask<U, A>>(data0, data1);
}
Vc_INTRINSIC operator fixed_size_simd_mask<T, N> &()
{
return static_cast<fixed_size_simd_mask<T, N> &>(*this);
}
Vc_INTRINSIC operator const fixed_size_simd_mask<T, N> &() const
{
return static_cast<const fixed_size_simd_mask<T, N> &>(*this);
}
///\copybrief Mask::Mask(VectorSpecialInitializerOne)
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerOne one)
: data0(one), data1(one)
{
}
///\copybrief Mask::Mask(VectorSpecialInitializerZero)
Vc_INTRINSIC explicit SimdMaskArray(VectorSpecialInitializerZero zero)
: data0(zero), data1(zero)
{
}
///\copydoc Mask::Mask(bool)
Vc_INTRINSIC explicit SimdMaskArray(bool b) : data0(b), data1(b) {}
///\copydoc Mask::Zero()
Vc_INTRINSIC static fixed_size_simd_mask<T, N> Zero()
{
return {storage_type0::Zero(), storage_type1::Zero()};
}
///\copydoc Mask::One()
Vc_INTRINSIC static fixed_size_simd_mask<T, N> One()
{
return {storage_type0::One(), storage_type1::One()};
}
///\name Loads & Stores
///@{
/**
* Load N boolean values from the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
* \param f A combination of flags to modify specific behavior of the load.
*/
template <typename Flags = DefaultLoadTag>
Vc_INTRINSIC explicit SimdMaskArray(const bool *mem, Flags f = Flags())
: data0(mem, f), data1(mem + storage_type0::size(), f)
{
}
/**
* Load N boolean values from the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
*/
Vc_INTRINSIC void load(const bool *mem)
{
data0.load(mem);
data1.load(mem + storage_type0::size());
}
/**
* Load N boolean values from the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
* \param f A combination of flags to modify specific behavior of the load.
*/
template <typename Flags> Vc_INTRINSIC void load(const bool *mem, Flags f)
{
data0.load(mem, f);
data1.load(mem + storage_type0::size(), f);
}
/**
* Store N boolean values to the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
*/
Vc_INTRINSIC void store(bool *mem) const
{
data0.store(mem);
data1.store(mem + storage_type0::size());
}
/**
* Store N boolean values to the consecutive addresses starting at \p mem.
*
* \param mem A pointer to an array of booleans.
* \param f A combination of flags to modify specific behavior of the load.
*/
template <typename Flags> Vc_INTRINSIC void store(bool *mem, Flags f) const
{
data0.store(mem, f);
data1.store(mem + storage_type0::size(), f);
}
///@}
///\copydoc Mask::operator==
Vc_INTRINSIC Vc_PURE bool operator==(const SimdMaskArray &mask) const
{
return data0 == mask.data0 && data1 == mask.data1;
}
///\copydoc Mask::operator!=
Vc_INTRINSIC Vc_PURE bool operator!=(const SimdMaskArray &mask) const
{
return data0 != mask.data0 || data1 != mask.data1;
}
///\copybrief Mask::operator!
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator!() const
{
return {!data0, !data1};
}
///\copybrief Mask::operator&=
Vc_INTRINSIC SimdMaskArray &operator&=(const SimdMaskArray &rhs)
{
data0 &= rhs.data0;
data1 &= rhs.data1;
return *this;
}
///\copybrief Mask::operator|=
Vc_INTRINSIC SimdMaskArray &operator|=(const SimdMaskArray &rhs)
{
data0 |= rhs.data0;
data1 |= rhs.data1;
return *this;
}
///\copybrief Mask::operator^=
Vc_INTRINSIC SimdMaskArray &operator^=(const SimdMaskArray &rhs)
{
data0 ^= rhs.data0;
data1 ^= rhs.data1;
return *this;
}
///\copybrief Mask::operator&
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&(
const SimdMaskArray &rhs) const
{
return {data0 & rhs.data0, data1 & rhs.data1};
}
///\copybrief Mask::operator|
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator|(
const SimdMaskArray &rhs) const
{
return {data0 | rhs.data0, data1 | rhs.data1};
}
///\copybrief Mask::operator^
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator^(
const SimdMaskArray &rhs) const
{
return {data0 ^ rhs.data0, data1 ^ rhs.data1};
}
///\copybrief Mask::operator&&
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator&&(
const SimdMaskArray &rhs) const
{
return {data0 && rhs.data0, data1 && rhs.data1};
}
///\copybrief Mask::operator||
Vc_INTRINSIC Vc_PURE fixed_size_simd_mask<T, N> operator||(
const SimdMaskArray &rhs) const
{
return {data0 || rhs.data0, data1 || rhs.data1};
}
///\copybrief Mask::isFull
Vc_INTRINSIC Vc_PURE bool isFull() const { return data0.isFull() && data1.isFull(); }
///\copybrief Mask::isNotEmpty
Vc_INTRINSIC Vc_PURE bool isNotEmpty() const { return data0.isNotEmpty() || data1.isNotEmpty(); }
///\copybrief Mask::isEmpty
Vc_INTRINSIC Vc_PURE bool isEmpty() const { return data0.isEmpty() && data1.isEmpty(); }
///\copybrief Mask::isMix
Vc_INTRINSIC Vc_PURE bool isMix() const { return !isFull() && !isEmpty(); }
///\copydoc Mask::toInt
Vc_INTRINSIC Vc_PURE int toInt() const
{
return data0.toInt() | (data1.toInt() << data0.size());
}
private:
friend reference;
static Vc_INTRINSIC value_type get(const SimdMaskArray &o, int i) noexcept
{
if (i < int(o.data0.size())) {
return o.data0[i];
} else {
return o.data1[i - o.data0.size()];
}
}
template <typename U>
static Vc_INTRINSIC void set(SimdMaskArray &o, int i, U &&v) noexcept(
noexcept(std::declval<storage_type0 &>()[0] = std::declval<U>()) &&
noexcept(std::declval<storage_type1 &>()[0] = std::declval<U>()))
{
if (i < int(o.data0.size())) {
o.data0[i] = std::forward<U>(v);
} else {
o.data1[i - o.data0.size()] = std::forward<U>(v);
}
}
public:
/**
* Return a smart reference to the boolean element at index \p index.
*
* \param index The element index to be accessed.
*
* \returns A temporary smart reference object which acts as much as an lvalue
* reference as possible.
*/
Vc_INTRINSIC Vc_PURE reference operator[](size_t index) noexcept
{
return {*this, int(index)};
}
/**
* Return a copy of the boolean element at index \p index.
*
* \param index The element index to be accessed.
*
* \returns A temporary boolean object with the value of the element at index \p
* index.
*/
Vc_INTRINSIC Vc_PURE value_type operator[](size_t index) const noexcept
{
return get(*this, index);
}
///\copybrief Mask::count
Vc_INTRINSIC Vc_PURE int count() const { return data0.count() + data1.count(); }
///\copydoc Mask::firstOne
Vc_INTRINSIC Vc_PURE int firstOne() const {
if (data0.isEmpty()) {
return data1.firstOne() + storage_type0::size();
}
return data0.firstOne();
}
///\copybrief Mask::generate
template <typename G>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> generate(const G &gen)
{
return {storage_type0::generate(gen),
storage_type1::generate([&](std::size_t i) { return gen(i + N0); })};
}
///\copybrief Mask::shifted
inline Vc_PURE fixed_size_simd_mask<T, N> shifted(int amount) const
{
if (Vc_IS_UNLIKELY(amount == 0)) {
return *this;
}
return generate([&](unsigned i) {
// modulo arithmetic of unsigned makes the check for j >= 0 unnecessary
const unsigned j = i + amount;
return j < size() ? get(*this, j) : false;
});
}
/// \internal execute specified Operation
template <typename Op, typename... Args>
static Vc_INTRINSIC fixed_size_simd_mask<T, N> fromOperation(Op op, Args &&... args)
{
fixed_size_simd_mask<T, N> r = {
storage_type0::fromOperation(op, Split::lo(args)...), // no forward here - it
// could move and thus
// break the next line
storage_type1::fromOperation(op, Split::hi(std::forward<Args>(args))...)};
return r;
}
/// \internal
Vc_INTRINSIC SimdMaskArray(storage_type0 &&x, storage_type1 &&y)
: data0(std::move(x)), data1(std::move(y))
{
}
private:
// The alignas attribute attached to the class declaration above is ignored by ICC
// 17.0.0 (at least). So just move the alignas attribute down here where it works for
// all compilers.
alignas(static_cast<std::size_t>(
Common::BoundedAlignment<Common::NextPowerOfTwo<N>::value * sizeof(V) /
V::size()>::value)) storage_type0 data0;
storage_type1 data1;
};
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdMaskArray<T, N, V, M>::Size;
template <typename T, std::size_t N, typename V, std::size_t M>
constexpr std::size_t SimdMaskArray<T, N, V, M>::MemoryAlignment;
///}}}1
/// @}
} // namespace Vc
// XXX: this include should be in <Vc/vector.h>. But at least clang 3.4 then fails to compile the
// code. Not sure yet what is going on, but it looks a lot like a bug in clang.
#include "simd_cast_caller.tcc"
#endif // VC_COMMON_SIMDMASKARRAY_H_
// vim: foldmethod=marker

653
Vc/common/span.h Normal file

@ -0,0 +1,653 @@
// -*- C++ -*-
//===------------------------------ span ---------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
// Adapted for use with Vc:
// Copyright © 2018 Matthias Kretz <kretz@kde.org>
//===---------------------------------------------------------------------===//
#ifndef VC_COMMON_SPAN_H_
#define VC_COMMON_SPAN_H_
#include <array> // for array
#include <cstddef> // for ptrdiff_t
#include <cstddef> // for std::byte
#include <iterator> // for iterators
#include <type_traits> // for remove_cv, etc
#include "subscript.h" // for AdaptSubscriptOperator
namespace Vc_VERSIONED_NAMESPACE
{
#ifdef __cpp_inline_variables
inline
#endif
constexpr ptrdiff_t dynamic_extent = -1;
namespace Common
{
template <typename T, ptrdiff_t Extent = dynamic_extent> class span;
template <typename T, ptrdiff_t Extent>
constexpr auto begin(const span<T, Extent>& s) noexcept -> decltype(s.begin())
{
return s.begin();
}
template <typename T, ptrdiff_t Extent>
constexpr auto end(const span<T, Extent>& s) noexcept -> decltype(s.end())
{
return s.end();
}
template <class T> struct _is_span_impl : public std::false_type {
};
template <class T, ptrdiff_t Extent>
struct _is_span_impl<span<T, Extent>> : public std::true_type {
};
template <class T>
struct _is_span : public _is_span_impl<typename std::remove_cv<T>::type> {
};
template <class T> struct _is_std_array_impl : public std::false_type {
};
template <class T, size_t Sz>
struct _is_std_array_impl<array<T, Sz>> : public std::true_type {
};
template <class T>
struct _is_std_array : public _is_std_array_impl<typename std::remove_cv<T>::type> {
};
template <class T, class ElementType, class = void>
struct _is_span_compatible_container : public std::false_type {
};
template <class... Ts> using _void_t = void;
template <class C> constexpr auto _std_data(C& c) -> decltype(c.data())
{
return c.data();
}
template <class C> constexpr auto _std_data(const C& c) -> decltype(c.data())
{
return c.data();
}
template <class T, std::size_t N> constexpr T* _std_data(T (&array)[N]) noexcept
{
return array;
}
template <class E> constexpr const E* _std_data(std::initializer_list<E> il) noexcept
{
return il.begin();
}
template <class C> constexpr auto _std_size(const C& c) -> decltype(c.size())
{
return c.size();
}
template <class T, std::size_t N>
constexpr std::size_t _std_size(const T (&array)[N]) noexcept
{
return N;
}
template <class T, class ElementType>
struct _is_span_compatible_container<
T, ElementType,
_void_t<
// is not a specialization of span
typename std::enable_if<!_is_span<T>::value, std::nullptr_t>::type,
// is not a specialization of array
typename std::enable_if<!_is_std_array<T>::value, std::nullptr_t>::type,
// is_array_v<Container> is false,
typename std::enable_if<!std::is_array<T>::value, std::nullptr_t>::type,
// data(cont) and size(cont) are well formed
decltype(data(std::declval<T>())), decltype(size(std::declval<T>())),
// remove_pointer_t<decltype(data(cont))>(*)[] is convertible to ElementType(*)[]
typename std::enable_if<
std::is_convertible<typename std::remove_pointer<decltype(
data(std::declval<T&>()))>::type (*)[],
ElementType (*)[]>::value,
std::nullptr_t>::type>> : public std::true_type {
};
#if defined Vc_MSVC || (defined Vc_GCC && Vc_GCC < 0x50100) || defined Vc_ICC || !defined __cpp_constexpr || __cpp_constexpr < 201304
#define Vc_CONSTEXPR
#else
#define Vc_CONSTEXPR constexpr
#endif
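// Vc_CONSTEXPR expands to `constexpr` only where C++14 relaxed constexpr is available
// (see the condition above); on the remaining compilers it expands to nothing and the
// functions are plain inline member functions.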
template <typename T, ptrdiff_t Extent> class span
{
public:
// constants and types
using element_type = T;
using value_type = typename std::remove_cv<T>::type;
using index_type = ptrdiff_t;
using difference_type = ptrdiff_t;
using pointer = T*;
using const_pointer = const T*; // not in standard
using reference = T&;
using const_reference = const T&; // not in standard
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
static constexpr index_type extent = Extent;
static_assert(Extent >= 0, "Can't have a span with an extent < 0");
// [span.cons], span constructors, copy, assignment, and destructor
Vc_CONSTEXPR span() noexcept : data_{nullptr}
{
static_assert(Extent == 0,
"Can't default construct a statically sized span with size > 0");
}
Vc_CONSTEXPR span(const span&) noexcept = default;
Vc_CONSTEXPR span& operator=(const span&) noexcept = default;
Vc_CONSTEXPR span(pointer _ptr, index_type _count) : data_{_ptr}
{
(void)_count;
Vc_ASSERT(((void)"size mismatch in span's constructor (ptr, len)", Extent == _count));
}
Vc_CONSTEXPR span(pointer _f, pointer _l) : data_{_f}
{
(void)_l;
Vc_ASSERT(((void)"size mismatch in span's constructor (ptr, ptr)",
Extent == distance(_f, _l)));
}
Vc_CONSTEXPR span(element_type (&_arr)[Extent]) noexcept : data_{_arr} {}
Vc_CONSTEXPR span(array<value_type, Extent>& _arr) noexcept : data_{_arr.data()} {}
Vc_CONSTEXPR span(const array<value_type, Extent>& _arr) noexcept : data_{_arr.data()} {}
template <class Container>
inline Vc_CONSTEXPR span(
Container& _c,
typename std::enable_if<_is_span_compatible_container<Container, T>::value,
std::nullptr_t>::type = nullptr)
: data_{_std_data(_c)}
{
Vc_ASSERT(("size mismatch in span's constructor (container)",
Extent == _std_size(_c)));
}
template <class Container>
inline Vc_CONSTEXPR span(
const Container& _c,
typename std::enable_if<_is_span_compatible_container<const Container, T>::value,
std::nullptr_t>::type = nullptr)
: data_{_std_data(_c)}
{
Vc_ASSERT(("size mismatch in span's constructor (const container)",
Extent == _std_size(_c)));
}
template <class OtherElementType>
inline Vc_CONSTEXPR span(
const span<OtherElementType, Extent>& _other,
typename std::enable_if<
std::is_convertible<OtherElementType (*)[], element_type (*)[]>::value,
std::nullptr_t>::type = nullptr)
: data_{_other.data()}
{
}
template <class OtherElementType>
inline Vc_CONSTEXPR span(
const span<OtherElementType, dynamic_extent>& _other,
typename std::enable_if<
std::is_convertible<OtherElementType (*)[], element_type (*)[]>::value,
std::nullptr_t>::type = nullptr) noexcept
: data_{_other.data()}
{
Vc_ASSERT(("size mismatch in span's constructor (other span)",
Extent == _other.size()));
}
// ~span() noexcept = default;
template <ptrdiff_t Count>
inline Vc_CONSTEXPR span<element_type, Count> first() const noexcept
{
static_assert(Count >= 0, "Count must be >= 0 in span::first()");
static_assert(Count <= Extent, "Count out of range in span::first()");
return {data(), Count};
}
template <ptrdiff_t Count>
inline Vc_CONSTEXPR span<element_type, Count> last() const noexcept
{
static_assert(Count >= 0, "Count must be >= 0 in span::last()");
static_assert(Count <= Extent, "Count out of range in span::last()");
return {data() + size() - Count, Count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> first(index_type _count) const noexcept
{
Vc_ASSERT(("Count out of range in span::first(count)",
_count >= 0 && _count <= size()));
return {data(), _count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> last(index_type _count) const noexcept
{
Vc_ASSERT(
("Count out of range in span::last(count)", _count >= 0 && _count <= size()));
return {data() + size() - _count, _count};
}
#ifndef Vc_MSVC
// MSVC 190024215 fails with "error C2059: syntax error: '<end Parse>'" somewhere in
// this file. Unless someone needs this function on MSVC, I don't see a reason to
// invest time into working around their bugs.
template <ptrdiff_t Offset, ptrdiff_t Count = dynamic_extent>
inline Vc_CONSTEXPR auto subspan() const noexcept
-> span<element_type, Count != dynamic_extent ? Count : Extent - Offset>
{
Vc_ASSERT(
("Offset out of range in span::subspan()", Offset >= 0 && Offset <= size()));
return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count};
}
inline Vc_CONSTEXPR span<element_type, dynamic_extent> subspan(
index_type offset, index_type count = dynamic_extent) const noexcept
{
Vc_ASSERT(("Offset out of range in span::subspan(offset, count)",
offset >= 0 && offset <= size()));
Vc_ASSERT(("Count out of range in span::subspan(offset, count)",
(count >= 0 && count <= size()) || count == dynamic_extent));
if (count == dynamic_extent) {
return {data() + offset, size() - offset};
}
Vc_ASSERT(("count + offset out of range in span::subspan(offset, count)",
offset + count <= size()));
return {data() + offset, count};
}
#endif // Vc_MSVC
Vc_CONSTEXPR index_type size() const noexcept { return Extent; }
Vc_CONSTEXPR index_type size_bytes() const noexcept
{
return Extent * sizeof(element_type);
}
Vc_CONSTEXPR bool empty() const noexcept { return Extent == 0; }
Vc_CONSTEXPR reference operator[](index_type _idx) const noexcept
{
Vc_ASSERT(("span<T,N>[] index out of bounds", _idx >= 0 && _idx < size()));
return data_[_idx];
}
Vc_CONSTEXPR reference operator()(index_type _idx) const noexcept
{
Vc_ASSERT(("span<T,N>() index out of bounds", _idx >= 0 && _idx < size()));
return data_[_idx];
}
Vc_CONSTEXPR pointer data() const noexcept { return data_; }
// [span.iter], span iterator support
Vc_CONSTEXPR iterator begin() const noexcept { return iterator(data()); }
Vc_CONSTEXPR iterator end() const noexcept { return iterator(data() + size()); }
Vc_CONSTEXPR const_iterator cbegin() const noexcept { return const_iterator(data()); }
Vc_CONSTEXPR const_iterator cend() const noexcept
{
return const_iterator(data() + size());
}
Vc_CONSTEXPR reverse_iterator rbegin() const noexcept { return reverse_iterator(end()); }
Vc_CONSTEXPR reverse_iterator rend() const noexcept { return reverse_iterator(begin()); }
Vc_CONSTEXPR const_reverse_iterator crbegin() const noexcept
{
return const_reverse_iterator(cend());
}
Vc_CONSTEXPR const_reverse_iterator crend() const noexcept
{
return const_reverse_iterator(cbegin());
}
Vc_CONSTEXPR void swap(span& _other) noexcept
{
pointer _p = data_;
data_ = _other.data_;
_other.data_ = _p;
}
#ifdef __cpp_lib_byte
span<const std::byte, Extent * sizeof(element_type)> _as_bytes() const noexcept
{
return {reinterpret_cast<const std::byte*>(data()), size_bytes()};
}
span<std::byte, Extent * sizeof(element_type)> _as_writeable_bytes() const noexcept
{
return {reinterpret_cast<std::byte*>(data()), size_bytes()};
}
#endif // __cpp_lib_byte
private:
pointer data_;
};
template <typename T> class span<T, dynamic_extent>
{
private:
public:
// constants and types
using element_type = T;
using value_type = typename std::remove_cv<T>::type;
using index_type = ptrdiff_t;
using difference_type = ptrdiff_t;
using pointer = T*;
using const_pointer = const T*; // not in standard
using reference = T&;
using const_reference = const T&; // not in standard
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
static constexpr index_type extent = dynamic_extent;
// [span.cons], span constructors, copy, assignment, and destructor
Vc_CONSTEXPR span() noexcept : data_{nullptr}, size_{0} {}
Vc_CONSTEXPR span(const span&) noexcept = default;
Vc_CONSTEXPR span& operator=(const span&) noexcept = default;
Vc_CONSTEXPR span(pointer _ptr, index_type _count) : data_{_ptr}, size_{_count} {}
Vc_CONSTEXPR span(pointer _f, pointer _l) : data_{_f}, size_{distance(_f, _l)} {}
template <size_t Sz>
inline Vc_CONSTEXPR span(element_type (&_arr)[Sz]) noexcept : data_{_arr}, size_{Sz}
{
}
template <size_t Sz>
inline Vc_CONSTEXPR span(array<value_type, Sz>& _arr) noexcept
: data_{_arr.data()}, size_{Sz}
{
}
template <size_t Sz>
inline Vc_CONSTEXPR span(const array<value_type, Sz>& _arr) noexcept
: data_{_arr.data()}, size_{Sz}
{
}
template <class Container>
inline Vc_CONSTEXPR span(
Container& _c,
typename std::enable_if<_is_span_compatible_container<Container, T>::value,
std::nullptr_t>::type = nullptr)
: data_{_std_data(_c)}, size_{index_type(_std_size(_c))}
{
}
template <class Container>
inline Vc_CONSTEXPR span(
const Container& _c,
typename std::enable_if<_is_span_compatible_container<const Container, T>::value,
std::nullptr_t>::type = nullptr)
: data_{_std_data(_c)}, size_{index_type(_std_size(_c))}
{
}
template <class OtherElementType, ptrdiff_t OtherExtent>
inline Vc_CONSTEXPR span(
const span<OtherElementType, OtherExtent>& _other,
typename std::enable_if<
std::is_convertible<OtherElementType (*)[], element_type (*)[]>::value,
std::nullptr_t>::type = nullptr) noexcept
: data_{_other.data()}, size_{_other.size()}
{
}
// ~span() noexcept = default;
template <ptrdiff_t Count>
inline Vc_CONSTEXPR span<element_type, Count> first() const noexcept
{
static_assert(Count >= 0, "");
Vc_ASSERT(("Count out of range in span::first()", Count <= size()));
return {data(), Count};
}
template <ptrdiff_t Count>
inline Vc_CONSTEXPR span<element_type, Count> last() const noexcept
{
static_assert(Count >= 0, "");
Vc_ASSERT(("Count out of range in span::last()", Count <= size()));
return {data() + size() - Count, Count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> first(index_type _count) const noexcept
{
Vc_ASSERT(("Count out of range in span::first(count)",
_count >= 0 && _count <= size()));
return {data(), _count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> last(index_type _count) const noexcept
{
Vc_ASSERT(
("Count out of range in span::last(count)", _count >= 0 && _count <= size()));
return {data() + size() - _count, _count};
}
template <ptrdiff_t Offset, ptrdiff_t Count = dynamic_extent>
inline Vc_CONSTEXPR span<T, dynamic_extent> subspan() const noexcept
{
Vc_ASSERT(
("Offset out of range in span::subspan()", Offset >= 0 && Offset <= size()));
Vc_ASSERT(("Count out of range in span::subspan()",
Count == dynamic_extent || Offset + Count <= size()));
return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count};
}
Vc_CONSTEXPR span<element_type, dynamic_extent> inline subspan(
index_type _offset, index_type _count = dynamic_extent) const noexcept
{
Vc_ASSERT(("Offset out of range in span::subspan(offset, count)",
_offset >= 0 && _offset <= size()));
Vc_ASSERT(("count out of range in span::subspan(offset, count)",
(_count >= 0 && _count <= size()) || _count == dynamic_extent));
if (_count == dynamic_extent)
return {data() + _offset, size() - _offset};
Vc_ASSERT(("Offset + count out of range in span::subspan(offset, count)",
_offset + _count <= size()));
return {data() + _offset, _count};
}
Vc_CONSTEXPR index_type size() const noexcept { return size_; }
Vc_CONSTEXPR index_type size_bytes() const noexcept
{
return size_ * sizeof(element_type);
}
Vc_CONSTEXPR bool empty() const noexcept { return size_ == 0; }
Vc_CONSTEXPR reference operator[](index_type _idx) const noexcept
{
Vc_ASSERT(("span<T>[] index out of bounds", _idx >= 0 && _idx < size()));
return data_[_idx];
}
Vc_CONSTEXPR reference operator()(index_type _idx) const noexcept
{
Vc_ASSERT(("span<T>() index out of bounds", _idx >= 0 && _idx < size()));
return data_[_idx];
}
Vc_CONSTEXPR pointer data() const noexcept { return data_; }
// [span.iter], span iterator support
Vc_CONSTEXPR iterator begin() const noexcept { return iterator(data()); }
Vc_CONSTEXPR iterator end() const noexcept { return iterator(data() + size()); }
Vc_CONSTEXPR const_iterator cbegin() const noexcept { return const_iterator(data()); }
Vc_CONSTEXPR const_iterator cend() const noexcept
{
return const_iterator(data() + size());
}
Vc_CONSTEXPR reverse_iterator rbegin() const noexcept { return reverse_iterator(end()); }
Vc_CONSTEXPR reverse_iterator rend() const noexcept { return reverse_iterator(begin()); }
Vc_CONSTEXPR const_reverse_iterator crbegin() const noexcept
{
return const_reverse_iterator(cend());
}
Vc_CONSTEXPR const_reverse_iterator crend() const noexcept
{
return const_reverse_iterator(cbegin());
}
Vc_CONSTEXPR void swap(span& _other) noexcept
{
pointer _p = data_;
data_ = _other.data_;
_other.data_ = _p;
index_type _sz = size_;
size_ = _other.size_;
_other.size_ = _sz;
}
#ifdef __cpp_lib_byte
// Disable _as_bytes() for older MSVC versions: it leads to a compilation error due to a
// compiler bug. When parsing the return type, MSVC instantiates the primary template of
// span<>, which trips its static_assert().
#if _MSC_VER > 1928
span<const std::byte, dynamic_extent> _as_bytes() const noexcept
{
return {reinterpret_cast<const std::byte*>(data()), size_bytes()};
}
span<std::byte, dynamic_extent> _as_writeable_bytes() const noexcept
{
return {reinterpret_cast<std::byte*>(data()), size_bytes()};
}
#endif
#endif // __cpp_lib_byte
private:
pointer data_;
index_type size_;
};
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator==(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return equal(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator!=(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return !(rhs == lhs);
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator<(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator<=(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return !(rhs < lhs);
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator>(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return rhs < lhs;
}
template <class T1, ptrdiff_t Extent1, class T2, ptrdiff_t Extent2>
Vc_CONSTEXPR bool operator>=(const span<T1, Extent1>& lhs, const span<T2, Extent2>& rhs)
{
return !(lhs < rhs);
}
// as_bytes & as_writeable_bytes
template <class T, ptrdiff_t Extent>
auto as_bytes(span<T, Extent> _s) noexcept -> decltype(_s._as_bytes())
{
return _s._as_bytes();
}
template <class T, ptrdiff_t Extent>
auto as_writeable_bytes(span<T, Extent> _s) noexcept ->
typename std::enable_if<!std::is_const<T>::value,
decltype(_s._as_writeable_bytes())>::type
{
return _s._as_writeable_bytes();
}
template <class T, ptrdiff_t Extent>
Vc_CONSTEXPR void swap(span<T, Extent>& lhs, span<T, Extent>& rhs) noexcept
{
lhs.swap(rhs);
}
#undef Vc_CONSTEXPR
// Deduction guides
#ifdef __cpp_deduction_guides
template <class T, size_t Sz> span(T (&)[Sz])->span<T, Sz>;
template <class T, size_t Sz> span(array<T, Sz>&)->span<T, Sz>;
template <class T, size_t Sz> span(const array<T, Sz>&)->span<const T, Sz>;
template <class Container> span(Container&)->span<typename Container::value_type>;
template <class Container>
span(const Container&)->span<const typename Container::value_type>;
#endif // __cpp_deduction_guides
} // namespace Common
/**
* \ingroup Containers
* \headerfile span.h <Vc/span>
*
* An adapted `std::span` with additional subscript operators supporting gather and scatter operations.
*
* The [std::span](https://en.cppreference.com/w/cpp/container/span) documentation applies.
*
* Example:
* \code
* struct Point {
* float x, y;
* };
* Point data[100];
* // initialize values in data
*
* Vc::span<Point, 100> view(data);
* float_v::IndexType indexes = ...; // values between 0-99
* float_v x = view[indexes][&Point::x];
* float_v y = view[indexes][&Point::y];
* \endcode
*/
template <typename T, ptrdiff_t Extent = dynamic_extent>
using span = Common::AdaptSubscriptOperator<Common::span<T, Extent>>;
namespace Traits
{
template <typename T, ptrdiff_t Extent>
struct has_contiguous_storage_impl<Vc::span<T, Extent>> : public std::true_type {
};
template <typename T, ptrdiff_t Extent>
struct has_contiguous_storage_impl<Vc::Common::span<T, Extent>> : public std::true_type {
};
} // namespace Traits
} // namespace Vc_VERSIONED_NAMESPACE
#endif // VC_COMMON_SPAN_H_

381
Vc/common/storage.h Normal file

@ -0,0 +1,381 @@
/* This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_STORAGE_H_
#define VC_COMMON_STORAGE_H_
#include "aliasingentryhelper.h"
#include "types.h"
#include "maskbool.h"
#ifdef Vc_IMPL_AVX
#include "../avx/intrinsics.h"
#endif
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <typename V> inline V zero();
} // namespace Detail
namespace Common
{
namespace Detail
{
#ifdef Vc_IMPL_AVX
template <typename ValueType, size_t Size> struct IntrinsicType {
using type = typename std::conditional<
std::is_integral<ValueType>::value,
typename std::conditional<sizeof(ValueType) * Size == 16, __m128i, __m256i>::type,
typename std::conditional<
std::is_same<ValueType, double>::value,
typename std::conditional<sizeof(ValueType) * Size == 16, __m128d,
__m256d>::type,
typename std::conditional<sizeof(ValueType) * Size == 16, __m128,
__m256>::type>::type>::type;
};
#elif defined Vc_IMPL_SSE
template <typename ValueType, size_t Size> struct IntrinsicType {
using type = typename std::conditional<
std::is_integral<ValueType>::value, __m128i,
typename std::conditional<std::is_same<ValueType, double>::value, __m128d,
__m128>::type>::type;
};
#else
template <typename ValueType, size_t Size> struct IntrinsicType {
static_assert(Size == 1,
"IntrinsicType without SIMD target support may only have Size = 1");
using type = ValueType;
};
#endif
template <typename ValueType, size_t Size, size_t Bytes = sizeof(ValueType) * Size>
struct BuiltinType;
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
#define Vc_VECBUILTIN __attribute__((__vector_size__(16)))
template <size_t Size> struct BuiltinType< double , Size, 16> { typedef double type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< float , Size, 16> { typedef float type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long long, Size, 16> { typedef long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long long, Size, 16> { typedef unsigned long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long , Size, 16> { typedef long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long , Size, 16> { typedef unsigned long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< int , Size, 16> { typedef int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned int , Size, 16> { typedef unsigned int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< short , Size, 16> { typedef short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned short , Size, 16> { typedef unsigned short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< char , Size, 16> { typedef char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned char , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< signed char , Size, 16> { typedef signed char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< bool , Size, 16> { typedef unsigned char type Vc_VECBUILTIN; };
#undef Vc_VECBUILTIN
#define Vc_VECBUILTIN __attribute__((__vector_size__(32)))
template <size_t Size> struct BuiltinType< double , Size, 32> { typedef double type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< float , Size, 32> { typedef float type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long long, Size, 32> { typedef long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long long, Size, 32> { typedef unsigned long long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< long , Size, 32> { typedef long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned long , Size, 32> { typedef unsigned long type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< int , Size, 32> { typedef int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned int , Size, 32> { typedef unsigned int type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< short , Size, 32> { typedef short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned short , Size, 32> { typedef unsigned short type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< char , Size, 32> { typedef char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType<unsigned char , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< signed char , Size, 32> { typedef signed char type Vc_VECBUILTIN; };
template <size_t Size> struct BuiltinType< bool , Size, 32> { typedef unsigned char type Vc_VECBUILTIN; };
#undef Vc_VECBUILTIN
#endif
} // namespace Detail
template <typename ValueType, size_t Size>
using IntrinsicType = typename Detail::IntrinsicType<ValueType, Size>::type;
template <typename ValueType, size_t Size>
using BuiltinType = typename Detail::BuiltinType<ValueType, Size>::type;
namespace AliasStrategy
{
struct Union {};
struct MayAlias {};
struct VectorBuiltin {};
struct UnionMembers {};
} // namespace AliasStrategy
using DefaultStrategy =
#if defined Vc_USE_BUILTIN_VECTOR_TYPES
AliasStrategy::VectorBuiltin;
#elif defined Vc_MSVC
AliasStrategy::UnionMembers;
#elif defined Vc_ICC
AliasStrategy::Union;
#elif defined __GNUC__
AliasStrategy::MayAlias;
#else
AliasStrategy::Union;
#endif
template <typename ValueType, size_t Size, typename Strategy = DefaultStrategy>
class Storage;
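// Illustrative sketch (not part of the library): regardless of the alias strategy, a
// Storage specialization exposes the same minimal interface, differing only in how
// m()/set() access individual elements of the register. Assuming an SSE build where
// VectorType is __m128:
//   Storage<float, 4> s(_mm_set1_ps(1.f));
//   float x = s.m(2);     // read element 2
//   s.set(2, 3.f);        // overwrite element 2
//   __m128 v = s.v();     // the underlying intrinsic register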
// GCC 6 forbids `EntryType m[]` altogether
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::Union>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
public:
using VectorType = IntrinsicType<ValueType, Size>;
using EntryType = ValueType;
union Alias {
Vc_INTRINSIC Alias(VectorType vv) : v(vv) {}
VectorType v;
EntryType m[Size];
};
Vc_INTRINSIC Storage() : data(Vc::Detail::zero<VectorType>()) {}
Vc_INTRINSIC Storage(const VectorType &x) : data(x) { assertCorrectAlignment(&data); }
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(reinterpret_cast<VectorType>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC operator const VectorType &() const { return data; }
Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return Alias(data).m[i]; }
Vc_INTRINSIC void set(size_t i, EntryType x)
{
Alias a(data);
a.m[i] = x;
data = a.v;
}
private:
VectorType data;
};
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::MayAlias>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
public:
using VectorType = IntrinsicType<ValueType, Size>;
using EntryType = ValueType;
Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
Vc_INTRINSIC Storage(const VectorType &x) : data(x)
{
assertCorrectAlignment(&data);
}
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(reinterpret_cast<const VectorType &>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage &operator=(const VectorType &x)
{
data = x;
return *this;
}
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC operator const VectorType &() const { return v(); }
Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const
{
return aliasing_cast<EntryType>(&data)[i];
}
Vc_INTRINSIC void set(size_t i, EntryType x)
{
aliasing_cast<EntryType>(&data)[i] = x;
}
private:
VectorType data;
};
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::VectorBuiltin>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
using Builtin = BuiltinType<ValueType, Size>;
public:
using VectorType =
#ifdef Vc_TEMPLATES_DROP_ATTRIBUTES
MayAlias<IntrinsicType<ValueType, Size>>;
#else
IntrinsicType<ValueType, Size>;
#endif
using EntryType = ValueType;
Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC Storage(const VectorType &x)
: data(aliasing_cast<Builtin>(x))
{
assertCorrectAlignment(&data);
}
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(aliasing_cast<Builtin>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage &operator=(const VectorType &x)
{
data = aliasing_cast<Builtin>(x);
return *this;
}
Vc_INTRINSIC operator const VectorType &() const { return v(); }
Vc_INTRINSIC Vc_PURE VectorType &v() { return reinterpret_cast<VectorType &>(data); }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return reinterpret_cast<const VectorType &>(data); }
Vc_INTRINSIC Vc_PURE EntryType m(size_t i) const { return data[i]; }
Vc_INTRINSIC void set(size_t i, EntryType x) { data[i] = x; }
Vc_INTRINSIC Builtin &builtin() { return data; }
Vc_INTRINSIC const Builtin &builtin() const { return data; }
private:
Builtin data;
};
template <typename ValueType, size_t Size>
class Storage<ValueType, Size, AliasStrategy::UnionMembers>
{
static_assert(std::is_fundamental<ValueType>::value &&
std::is_arithmetic<ValueType>::value,
"Only works for fundamental arithmetic types.");
public:
using VectorType = IntrinsicType<ValueType, Size>;
using EntryType = ValueType;
Vc_INTRINSIC Storage() : data() { assertCorrectAlignment(&data); }
Vc_INTRINSIC Storage(const VectorType &x) : data(x)
{
assertCorrectAlignment(&data);
}
template <typename U>
Vc_INTRINSIC explicit Storage(const U &x,
enable_if<sizeof(U) == sizeof(VectorType)> = nullarg)
: data(reinterpret_cast<const VectorType &>(x))
{
assertCorrectAlignment(&data);
}
Vc_INTRINSIC Storage &operator=(const VectorType &x)
{
data = x;
return *this;
}
Vc_INTRINSIC Storage(const Storage &) = default;
Vc_INTRINSIC Storage &operator=(const Storage &) = default;
Vc_INTRINSIC Vc_PURE VectorType &v() { return data; }
Vc_INTRINSIC Vc_PURE const VectorType &v() const { return data; }
Vc_INTRINSIC_L Vc_PURE_L EntryType m(size_t i) const Vc_INTRINSIC_R Vc_PURE_R;
Vc_INTRINSIC void set(size_t i, EntryType x) { ref(i) = x; }
private:
Vc_INTRINSIC_L Vc_PURE_L EntryType &ref(size_t i) Vc_INTRINSIC_R Vc_PURE_R;
VectorType data;
};
#ifdef Vc_MSVC
template <> Vc_INTRINSIC Vc_PURE double Storage< double, 2, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float Storage< float , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_i8[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m128i_u8[i]; }
template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 2, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m128i_i8[i]); }
template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m128i_u8[i]; }
#ifdef Vc_IMPL_AVX
template <> Vc_INTRINSIC Vc_PURE double Storage< double, 4, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float Storage< float , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int Storage< signed int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short Storage< signed short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char Storage< signed char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_i8[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned int Storage<unsigned int , 8, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short Storage<unsigned short ,16, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char Storage<unsigned char ,32, AliasStrategy::UnionMembers>::m(size_t i) const { return data.m256i_u8[i]; }
template <> Vc_INTRINSIC Vc_PURE double &Storage< double, 4, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256d_f64[i]; }
template <> Vc_INTRINSIC Vc_PURE float &Storage< float , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256_f32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed int &Storage< signed int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i32[i]; }
template <> Vc_INTRINSIC Vc_PURE signed short &Storage< signed short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_i16[i]; }
template <> Vc_INTRINSIC Vc_PURE signed char &Storage< signed char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return reinterpret_cast<signed char &>(data.m256i_i8[i]); }
template <> Vc_INTRINSIC Vc_PURE unsigned int &Storage<unsigned int , 8, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u32[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned short &Storage<unsigned short ,16, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u16[i]; }
template <> Vc_INTRINSIC Vc_PURE unsigned char &Storage<unsigned char ,32, AliasStrategy::UnionMembers>::ref(size_t i) { return data.m256i_u8[i]; }
#endif
#endif // Vc_MSVC
template <typename VectorType, typename EntryType>
using VectorMemoryUnion = Storage<EntryType, sizeof(VectorType) / sizeof(EntryType)>;
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_STORAGE_H_
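The four alias strategies above differ only in how a single lane of the intrinsic register is read or written. The following is a minimal, self-contained sketch of the Union idea using raw SSE types instead of Vc's internal Storage class (it assumes an SSE-capable x86 compiler; union type punning is a compiler extension, which is exactly why Vc selects this strategy only for compilers known to support it):

#include <immintrin.h>
#include <cstdio>

union Sse4Floats {
    __m128 v;    // the whole SIMD register
    float m[4];  // element-wise view of the same bytes
};

int main()
{
    Sse4Floats s;
    s.v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);  // m[0]=1, m[1]=2, m[2]=3, m[3]=4
    s.m[2] = 42.f;                           // write one lane through the union
    std::printf("%g %g %g %g\n", s.m[0], s.m[1], s.m[2], s.m[3]);  // prints: 1 2 42 4
    return 0;
}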


@ -0,0 +1,92 @@
/* This file is part of the Vc library. {{{
Copyright © 2014 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
///////////////////////////////////////////////////////////////////////////////////////////
// stores
/**
* Store the vector data to \p mem.
*
* \param mem A pointer to memory, where \VSize{T} consecutive values will be stored.
* \param flags The flags parameter can be used to select e.g. the Vc::Aligned,
* Vc::Unaligned, Vc::Streaming, and/or Vc::PrefetchDefault flags.
*/
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void store(U *mem, Flags flags = Flags()) const Vc_INTRINSIC_R;
/**
* Store the vector data to \p mem where \p mask is set.
*
* \param mem A pointer to memory, where \VSize{T} consecutive values will be stored.
* \param mask A mask object that determines which entries of the vector should be stored
* to \p mem.
* \param flags The flags parameter can be used to select e.g. the Vc::Aligned,
* Vc::Unaligned, Vc::Streaming, and/or Vc::PrefetchDefault flags.
*
* \note
* The masked store does not pack the values into memory. I.e. the value at offset \c i
* will be stored to `mem[i]`, independent of whether `mask[j]` for any `j < i` is \c
* false.
*/
template <
typename U,
typename Flags = DefaultStoreTag,
typename = enable_if<std::is_arithmetic<U>::value &&Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC_L void Vc_VDECL store(U *mem, MaskType mask, Flags flags = Flags()) const Vc_INTRINSIC_R;
//@{
/**
* The following store overloads support classes that have a cast operator to `EntryType
* *`.
*/
Vc_INTRINSIC void store(EntryType *mem) const
{
store<EntryType, DefaultStoreTag>(mem, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void store(EntryType *mem, Flags flags) const
{
store<EntryType, Flags>(mem, flags);
}
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask) const
{
store<EntryType, DefaultStoreTag>(mem, mask, DefaultStoreTag());
}
template <typename Flags, typename = enable_if<Traits::is_load_store_flag<Flags>::value>>
Vc_INTRINSIC void Vc_VDECL store(EntryType *mem, MaskType mask, Flags flags) const
{
store<EntryType, Flags>(mem, mask, flags);
}
//@}
// vim: foldmethod=marker
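A minimal usage sketch of the store() overloads declared above (float_v, float_m, and the flag objects Vc::Aligned/Vc::Unaligned/Vc::Streaming are the public Vc names these declarations are documented with; the exact vector width depends on the selected implementation):

#include <Vc/Vc>

void write_back(Vc::float_v v, float *dst)
{
    v.store(dst);                       // store with the default flags
    v.store(dst, Vc::Unaligned);        // dst need not be vector-aligned
    v.store(dst, Vc::Streaming);        // non-temporal store, bypasses the caches
    const Vc::float_m mask = v > 0.f;
    v.store(dst, mask);                 // masked store: only lanes where mask is true are written
}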

526
Vc/common/subscript.h Normal file

@ -0,0 +1,526 @@
/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_SUBSCRIPT_H_
#define VC_COMMON_SUBSCRIPT_H_
#include <initializer_list>
#include <type_traits>
#include <vector>
#include "types.h"
#include "macros.h"
#include <assert.h>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
// AdaptSubscriptOperator {{{
template <typename Base> class AdaptSubscriptOperator : public Base
{
public:
// perfect forward all Base constructors
template <typename... Args>
Vc_ALWAYS_INLINE AdaptSubscriptOperator(Args &&... arguments)
: Base(std::forward<Args>(arguments)...)
{
}
// forward std::initializer_list construction separately (a braced list is not matched by the perfect-forwarding constructor above)
template <typename T>
Vc_ALWAYS_INLINE AdaptSubscriptOperator(std::initializer_list<T> l)
: Base(l)
{
}
// explicitly enable Base::operator[] because the following would hide it
using Base::operator[];
/// \internal forward to non-member subscript_operator function
template <typename I,
typename = enable_if<!std::is_arithmetic<
typename std::decay<I>::type>::value> // arithmetic types
// should always use
// Base::operator[] and
// never match this one
>
Vc_ALWAYS_INLINE auto operator[](I &&arg_)
-> decltype(subscript_operator(*this, std::forward<I>(arg_)))
{
return subscript_operator(*this, std::forward<I>(arg_));
}
// const overload of the above
template <typename I, typename = enable_if<
!std::is_arithmetic<typename std::decay<I>::type>::value>>
Vc_ALWAYS_INLINE auto operator[](I &&arg_) const
-> decltype(subscript_operator(*this, std::forward<I>(arg_)))
{
return subscript_operator(*this, std::forward<I>(arg_));
}
};
// }}}
// is_valid_indexvector {{{
template <class T, class = decltype(convertIndexVector(std::declval<T>()))>
std::true_type is_valid_indexvector(T &&);
std::false_type is_valid_indexvector(...);
template <class IndexVector, class Test = decltype(is_valid_indexvector(
std::declval<const IndexVector &>()))>
struct is_valid_indexvector_ : public std::integral_constant<bool, Test::value> {
};
static_assert(!is_valid_indexvector_<const int *>::value,
"Pointer is incorrectly classified as valid index vector type");
static_assert(is_valid_indexvector_<const int[4]>::value,
"C-Array is incorrectly classified as invalid index vector type");
// }}}
// apply Scale (std::ratio) functions {{{1
template <typename Scale, typename T>
Vc_ALWAYS_INLINE enable_if<Scale::num == Scale::den, Traits::decay<T>> applyScale(T &&x)
{
return std::forward<T>(x);
}
template <typename Scale, typename T>
Vc_ALWAYS_INLINE enable_if<
Scale::num != Scale::den && Traits::has_multiply_operator<T, int>::value,
Traits::decay<T>>
applyScale(T &&x)
{
static_assert(Scale::num % Scale::den == 0,
"Non-integral index scaling requested. This typically happens only for "
"Vc::Scalar on 32-bit for gathers on double. You can work around the "
"issue by ensuring that all doubles in the structure are aligned on 8 "
"Bytes.");
constexpr int value = Scale::num / Scale::den;
Vc_ASSERT(Vc::all_of((x * value) / value == x));
return std::forward<T>(x) * value;
}
template <typename Scale, typename T>
Vc_ALWAYS_INLINE enable_if<
Scale::num != Scale::den && !Traits::has_multiply_operator<T, int>::value,
T>
applyScale(T x)
{
static_assert(Scale::num % Scale::den == 0,
"Non-integral index scaling requested. This typically happens only for "
"Vc::Scalar on 32-bit for gathers on double. You can work around the "
"issue by ensuring that all doubles in the structure are aligned on 8 "
"Bytes.");
constexpr int value = Scale::num / Scale::den;
for (size_t i = 0; i < x.size(); ++i) {
Vc_ASSERT((x[i] * value) / value == x[i]);
x[i] *= value;
}
return x;
}
template <typename Scale, typename T, typename U,
typename = enable_if<Traits::has_multiply_operator<T, int>::value &&
Traits::has_addition_operator<T, U>::value>>
Vc_ALWAYS_INLINE typename std::decay<T>::type applyScaleAndAdd(T &&x, U &&y)
{
constexpr int value = Scale::num / Scale::den;
if (value == 1) { // static evaluation
return std::forward<T>(x) + std::forward<U>(y);
}
return std::forward<T>(x) * value + std::forward<U>(y);
}
template <
typename Scale, typename T, typename U,
typename = enable_if<
!(Traits::has_multiply_operator<T &, int>::value &&
Traits::has_addition_operator<T &, decltype(std::declval<U>()[0])>::value) &&
Traits::has_subscript_operator<U>::value>>
Vc_ALWAYS_INLINE T applyScaleAndAdd(T x, U &&y)
{
constexpr int value = Scale::num / Scale::den;
for (size_t i = 0; i < x.size(); ++i) {
if (value == 1) { // static evaluation
x[i] = x[i] + y[i];
} else {
x[i] = x[i] * value + y[i];
}
}
return x;
}
template <typename Scale, typename T, typename U>
Vc_ALWAYS_INLINE enable_if<!(Traits::has_multiply_operator<T &, int>::value &&
Traits::has_addition_operator<T &, U>::value) &&
!Traits::has_subscript_operator<U>::value,
T>
applyScaleAndAdd(T x, U &&y)
{
constexpr int value = Scale::num / Scale::den;
for (size_t i = 0; i < x.size(); ++i) {
if (value == 1) { // static evaluation
x[i] = x[i] + y;
} else {
x[i] = x[i] * value + y;
}
}
return x;
}
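// Worked example of the scaling above: gathering a double member out of an array
// of structs S with sizeof(S) == 32 uses Scale = std::ratio<32, 8>, i.e. a factor
// of 4. applyScale then turns the index vector {0, 1, 2, 3} into {0, 4, 8, 12},
// the correct double-element offsets from the member's address in the first
// struct; applyScaleAndAdd performs the same multiplication and adds an inner
// index on top.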
// IndexVectorSizeMatches {{{1
template <std::size_t MinSize,
typename IndexT,
bool = Traits::is_simd_vector<IndexT>::value>
struct IndexVectorSizeMatches
    : public std::true_type // you might expect false_type here, but IndexT is a type whose size
                            // is not known at compile time. It may well be large enough; we simply
                            // cannot tell from the type alone. The only possible check would be a
                            // runtime check, so the type itself is accepted.
{
};
template <std::size_t MinSize, typename V>
struct IndexVectorSizeMatches<MinSize,
V,
true> : public std::integral_constant<bool, (MinSize <= V::Size)>
{
};
template <std::size_t MinSize, typename T, std::size_t ArraySize>
struct IndexVectorSizeMatches<MinSize,
T[ArraySize],
false> : public std::integral_constant<bool, (MinSize <= ArraySize)>
{
};
template <std::size_t MinSize, typename T, std::size_t ArraySize>
struct IndexVectorSizeMatches<MinSize,
std::array<T, ArraySize>,
false> : public std::integral_constant<bool, (MinSize <= ArraySize)>
{
};
template <std::size_t MinSize, typename T, std::size_t ArraySize>
struct IndexVectorSizeMatches<MinSize,
Vc::array<T, ArraySize>,
false> : public std::integral_constant<bool, (MinSize <= ArraySize)>
{
};
template <std::size_t MinSize, typename T, std::ptrdiff_t N>
struct IndexVectorSizeMatches<MinSize, Vc::Common::span<T, N>, false>
: public std::integral_constant<bool, (N == -1 || static_cast<std::ptrdiff_t>(MinSize) <= N)> {
};
// SubscriptOperation {{{1
template <
typename T, typename IndexVector, typename Scale = std::ratio<1, 1>,
bool = is_valid_indexvector_<IndexVector>::value>
class SubscriptOperation
{
const IndexVector m_indexes;
T *const m_address;
using ScalarType = typename std::decay<T>::type;
using IndexVectorScaled = Traits::decay<decltype(convertIndexVector(std::declval<const IndexVector &>()))>;
public:
// try to stop the user from forming lvalues of this type
SubscriptOperation &operator=(const SubscriptOperation &) = delete;
SubscriptOperation(const SubscriptOperation &) = delete;
#ifndef __cpp_guaranteed_copy_elision
constexpr SubscriptOperation(SubscriptOperation &&) = default;
#endif
template <typename U,
typename = enable_if<((std::is_convertible<const U &, IndexVector>::value ||
std::is_same<U, IndexVector>::value) &&
std::is_copy_constructible<IndexVector>::value)>>
constexpr Vc_ALWAYS_INLINE SubscriptOperation(T *address, const U &indexes)
: m_indexes(indexes), m_address(address)
{
}
template <std::size_t... Indexes>
constexpr Vc_ALWAYS_INLINE SubscriptOperation(T *address, const IndexVector &indexes,
index_sequence<Indexes...>)
: m_indexes{indexes[Indexes]...}, m_address(address)
{}
template <typename U>
constexpr Vc_ALWAYS_INLINE SubscriptOperation(
T *address, const U &indexes,
enable_if<((std::is_convertible<const U &, IndexVector>::value ||
std::is_same<U, IndexVector>::value) &&
!std::is_copy_constructible<IndexVector>::value &&
std::is_array<IndexVector>::value &&
std::extent<IndexVector>::value > 0)> = nullarg)
: SubscriptOperation(address, indexes,
make_index_sequence<std::extent<IndexVector>::value>())
{
}
static constexpr bool need_explicit_scaling =
Scale::num % Scale::den != 0 || Scale::num / Scale::den * sizeof(T) > 8;
Vc_ALWAYS_INLINE
GatherArguments<typename std::remove_cv<T>::type, IndexVectorScaled,
(need_explicit_scaling ? 1 : Scale::num / Scale::den)>
gatherArguments() &&
{
static_assert(std::is_arithmetic<ScalarType>::value,
"Incorrect type for a SIMD vector gather. Must be an arithmetic type.");
return {applyScale<typename std::conditional<need_explicit_scaling, Scale,
std::ratio<1, 1>>::type>(
convertIndexVector(m_indexes)),
m_address};
}
Vc_ALWAYS_INLINE ScatterArguments<T, IndexVectorScaled> scatterArguments() &&
{
static_assert(std::is_arithmetic<ScalarType>::value,
"Incorrect type for a SIMD vector scatter. Must be an arithmetic type.");
return {applyScale<Scale>(convertIndexVector(m_indexes)), m_address};
}
template <typename V,
typename = enable_if<(std::is_arithmetic<ScalarType>::value &&Traits::is_simd_vector<
V>::value &&IndexVectorSizeMatches<V::Size, IndexVector>::value)>>
Vc_INTRINSIC operator V() &&
{
return V(static_cast<SubscriptOperation &&>(*this).gatherArguments());
}
template <typename V,
typename = enable_if<(std::is_arithmetic<ScalarType>::value &&Traits::is_simd_vector<
V>::value &&IndexVectorSizeMatches<V::Size, IndexVector>::value)>>
Vc_ALWAYS_INLINE SubscriptOperation &operator=(const V &rhs) &&
{
static_assert(std::is_arithmetic<ScalarType>::value,
"Incorrect type for a SIMD vector scatter. Must be an arithmetic type.");
const auto indexes = applyScale<Scale>(convertIndexVector(m_indexes));
rhs.scatter(m_address, indexes);
return *this;
}
// precondition: m_address points to a struct/class/union
template <
typename U,
typename S, // S must be equal to T. Still we require this template parameter -
// otherwise instantiation of SubscriptOperation would only be valid for
// structs/unions.
typename = enable_if<std::is_same<S, typename std::remove_cv<T>::type>::value &&(
std::is_class<T>::value || std::is_union<T>::value)>>
Vc_ALWAYS_INLINE auto operator[](U S::*member) &&
-> SubscriptOperation<
typename std::conditional<std::is_const<T>::value,
const typename std::remove_reference<U>::type,
typename std::remove_reference<U>::type>::type,
IndexVector,
// By passing the scale factor as a fraction of integers in the template
// arguments the value does not lose information if the division yields a
// non-integral value. This could happen e.g. for a struct of struct (S2 {
// S1, char }, with sizeof(S1) = 16, sizeof(S2) = 20. Then scale would be
// 20/16)
std::ratio_multiply<Scale, std::ratio<sizeof(S), sizeof(U)>>>
{
static_assert(std::is_same<Traits::decay<decltype(m_address->*member)>,
Traits::decay<U>>::value,
"Type mismatch that should be impossible.");
// TODO: check whether scale really works for unions correctly
return {&(m_address->*member), m_indexes};
}
/*
* The following functions allow subscripting of nested arrays. But
* there are two cases of containers and only one that we want to support:
* 1. actual arrays (e.g. T[N] or std::array<T, N>)
* 2. dynamically allocated vectors (e.g. std::vector<T>)
*
* For (1.) the offset calculation is straightforward.
* For (2.) the m_address pointer points to memory where pointers are
* stored to the actual data. Meaning the data can be scattered
* freely in memory (and far away from what m_address points to). Supporting this leads to
* serious trouble with the pointer (it does not really point to the start of a memory
* region anymore) and inefficient code. The user is better off to write a loop that assigns the
* scalars to the vector object sequentially.
*/
private:
// The following is a workaround for MSVC 2015 Update 2. Whenever the ratio
// in the return type of the following operator[] is encountered with a sizeof
// expression that fails, MSVC decides to substitute a 0 for the sizeof instead of
// just leaving the ratio instantiation alone via proper SFINAE. The make_ratio helper
// ensures that the 0 from the sizeof failure does not reach the denominator of
// std::ratio where it would hit a static_assert.
template <intmax_t N, intmax_t D> struct make_ratio {
using type = std::ratio<N, D == 0 ? 1 : D>;
};
public:
// precondition: m_address points to a type that implements the subscript operator
template <typename U>
// U is only required to delay name lookup to the 2nd phase (on use).
// This is necessary because m_address[0][index] is only a correct
// expression if has_subscript_operator<T>::value is true.
Vc_ALWAYS_INLINE auto operator[](U index) && -> typename std::enable_if<
#ifndef Vc_IMPROVE_ERROR_MESSAGES
Traits::has_no_allocated_data<T>::value &&
#endif
std::is_convertible<U, size_t>::value,
SubscriptOperation<
// the following decltype expression must depend on index and cannot
// simply use [0][0] because it would yield an invalid expression in
// case m_address[0] returns a struct/union
typename std::remove_reference<decltype(m_address[0][index])>::type,
IndexVector,
std::ratio_multiply<
Scale,
typename make_ratio<sizeof(T), sizeof(m_address[0][index])>::type>>>::type
{
static_assert(Traits::has_subscript_operator<T>::value,
"The subscript operator was called on a type that does not implement it.\n");
static_assert(Traits::has_no_allocated_data<T>::value,
"Invalid container type in gather/scatter operation.\nYou may only use "
"nested containers that store the data inside the object (such as builtin "
"arrays or std::array) but not containers that store data in allocated "
"memory (such as std::vector).\nSince this feature cannot be queried "
"generically at compile time you need to spezialize the "
"Vc::Traits::has_no_allocated_data_impl<T> type-trait for custom types that "
"meet the requirements.\n");
static_assert(std::is_lvalue_reference<decltype(m_address[0][index])>::value,
"The container does not return an lvalue reference to the data at "
"the requested offset. This makes it impossible to execute a "
"gather operation.\n");
return {&(m_address[0][index]), m_indexes};
}
// precondition: m_address points to a type that implements the subscript operator
template <typename IT>
Vc_ALWAYS_INLINE typename std::enable_if<
#ifndef Vc_IMPROVE_ERROR_MESSAGES
Traits::has_no_allocated_data<T>::value &&
Traits::has_subscript_operator<T>::value &&
#endif
Traits::has_subscript_operator<IT>::value,
SubscriptOperation<typename std::remove_reference<decltype(
m_address[0][std::declval<
const IT &>()[0]] // std::declval<IT>()[0] could
// be replaced with 0 if it
// were not for two-phase lookup. We need to make the
// m_address[0][0] expression dependent on IT
)>::type,
IndexVectorScaled,
std::ratio<1, 1> // reset Scale to 1 since it is applied below
>>::type
operator[](const IT &index) &&
{
static_assert(Traits::has_subscript_operator<T>::value,
"The subscript operator was called on a type that does not implement it.\n");
static_assert(Traits::has_no_allocated_data<T>::value,
"Invalid container type in gather/scatter operation.\nYou may only use "
"nested containers that store the data inside the object (such as builtin "
"arrays or std::array) but not containers that store data in allocated "
"memory (such as std::vector).\nSince this feature cannot be queried "
"generically at compile time you need to spezialize the "
"Vc::Traits::has_no_allocated_data_impl<T> type-trait for custom types that "
"meet the requirements.\n");
return {&(m_address[0][0]),
applyScaleAndAdd<std::ratio_multiply<
Scale, std::ratio<sizeof(T), sizeof(m_address[0][0])>>>(
convertIndexVector(m_indexes), index)};
}
};
// specialization for invalid IndexVector type
template <typename T, typename IndexVector, typename Scale>
class SubscriptOperation<T, IndexVector, Scale, false>;
// subscript_operator {{{1
template <
typename Container,
typename IndexVector,
typename = enable_if<
Traits::has_subscript_operator<IndexVector>::value // The index vector must provide [] for
// the implementations of gather/scatter
&&Traits::has_contiguous_storage<Container>::value // Container must use contiguous
// storage, otherwise the index vector
// cannot be used as memory offsets, which is required for efficient
// gather/scatter implementations
&&std::is_lvalue_reference<decltype(*begin(std::declval<
Container>()))>::value // dereferencing the begin iterator must yield an lvalue
// reference (const or non-const). Otherwise it is not possible
// to determine a pointer to the data storage (see above).
>>
Vc_ALWAYS_INLINE SubscriptOperation<
typename std::remove_reference<decltype(*begin(std::declval<Container>()))>::
type, // the type of the first value in the container is what the internal array pointer
// has to point to. But if the subscript operator of the container returns a
// reference we need to drop that part because it's useless information for us. But
// const and volatile, as well as array rank/extent are interesting and need not be
// dropped.
typename std::remove_const<typename std::remove_reference<
IndexVector>::type>::type // keep volatile and possibly the array extent, but the const and
// & parts of the type need to be removed because
// SubscriptOperation explicitly adds them for its member type
> subscript_operator(Container &&c, IndexVector &&indexes)
{
Vc_ASSERT(std::addressof(*begin(c)) + 1 ==
std::addressof(*(begin(c) + 1))); // runtime assertion for contiguous storage, this
// requires a RandomAccessIterator - but that
// should be given for a container with contiguous
// storage
return {std::addressof(*begin(c)), std::forward<IndexVector>(indexes)};
}
/**
* \internal
* Implement subscripts of std::initializer_list. This function must be in the global scope
* because Container arguments may be in any scope. The other argument is in std scope.
*
* -----
* std::initializer_list does not have constexpr member functions in C++11, but from C++14 onwards
* the world is a happier place. :)
*/
template <typename Container, typename I>
Vc_ALWAYS_INLINE Vc::Common::SubscriptOperation<
typename std::remove_reference<decltype(std::declval<Container>()[0])>::type,
const std::initializer_list<I> &> subscript_operator(Container &&vec,
const std::initializer_list<I> &indexes)
{
return {&vec[0], indexes};
}
//}}}1
} // namespace Common
using Common::subscript_operator;
} // namespace Vc
#endif // VC_COMMON_SUBSCRIPT_H_
// vim: foldmethod=marker
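What the subscript machinery above enables at the user level, as a minimal sketch (it assumes Vc's public Vc::vector adapter from <Vc/vector>, i.e. std::vector wrapped in AdaptSubscriptOperator, and the float_v/IndexType names; widths depend on the selected implementation):

#include <Vc/Vc>
#include <Vc/vector>
#include <cstddef>

void gather_scatter_demo()
{
    Vc::vector<float> data(100);
    for (std::size_t i = 0; i < data.size(); ++i) {
        data[i] = float(i);                       // arithmetic index: plain std::vector access
    }
    using IV = Vc::float_v::IndexType;
    const IV idx = IV::IndexesFromZero() * 3;     // {0, 3, 6, ...}
    Vc::float_v gathered = data[idx];             // gather from data[0], data[3], data[6], ...
    data[idx] = gathered * 2.f;                   // scatter the doubled values back
}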

7
Vc/common/support.h Normal file

@ -0,0 +1,7 @@
#ifndef VC_DEPRECATED_COMMON_SUPPORT_H_
#define VC_DEPRECATED_COMMON_SUPPORT_H_
#ifdef __GNUC__
#warning "the <Vc/common/support.h> header is deprecated. Use <Vc/support.h> instead."
#endif
#include <Vc/support.h>
#endif // VC_DEPRECATED_COMMON_SUPPORT_H_

57
Vc/common/transpose.h Normal file

@ -0,0 +1,57 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_TRANSPOSE_H_
#define VC_COMMON_TRANSPOSE_H_
#include "macros.h"
#include <tuple>
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
template <typename... Inputs> struct TransposeProxy
{
TransposeProxy(const Inputs &... inputs) : in{inputs...} {}
std::tuple<const Inputs &...> in;
};
template <int LhsLength, size_t RhsLength> struct TransposeTag {
};
} // namespace Common
template <typename... Vs> Common::TransposeProxy<Vs...> transpose(Vs... vs)
{
return {vs...};
}
} // namespace Vc
#endif // VC_COMMON_TRANSPOSE_H_
// vim: foldmethod=marker

226
Vc/common/trigonometric.h Normal file

@ -0,0 +1,226 @@
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_TRIGONOMETRIC_H_
#define VC_COMMON_TRIGONOMETRIC_H_
#include "macros.h"
#ifdef Vc_HAVE_LIBMVEC
extern "C" {
__m128 _ZGVbN4v_sinf(__m128);
__m128d _ZGVbN2v_sin(__m128d);
__m128 _ZGVbN4v_cosf(__m128);
__m128d _ZGVbN2v_cos(__m128d);
__m256 _ZGVdN8v_sinf(__m256);
__m256d _ZGVdN4v_sin(__m256d);
__m256 _ZGVdN8v_cosf(__m256);
__m256d _ZGVdN4v_cos(__m256d);
}
#endif
namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template<Vc::Implementation Impl> struct MapImpl { enum Dummy { Value = Impl }; };
template<> struct MapImpl<Vc::SSE42Impl> { enum Dummy { Value = MapImpl<Vc::SSE41Impl>::Value }; };
template<Vc::Implementation Impl> using TrigonometricImplementation =
ImplementationT<MapImpl<Impl>::Value
#if defined(Vc_IMPL_XOP) && defined(Vc_IMPL_FMA4)
+ Vc::XopInstructions
+ Vc::Fma4Instructions
#endif
>;
} // namespace Detail
namespace Common
{
template<typename Impl> struct Trigonometric
{
template<typename T> static T Vc_VDECL sin(const T &_x);
template<typename T> static T Vc_VDECL cos(const T &_x);
template<typename T> static void Vc_VDECL sincos(const T &_x, T *_sin, T *_cos);
template<typename T> static T Vc_VDECL asin (const T &_x);
template<typename T> static T Vc_VDECL atan (const T &_x);
template<typename T> static T Vc_VDECL atan2(const T &y, const T &x);
};
} // namespace Common
#if defined Vc_IMPL_SSE || defined DOXYGEN
// this is either SSE, AVX, or AVX2
namespace Detail
{
template <typename T, typename Abi>
using Trig = Common::Trigonometric<Detail::TrigonometricImplementation<
(std::is_same<Abi, VectorAbi::Sse>::value
? SSE42Impl
: std::is_same<Abi, VectorAbi::Avx>::value ? AVXImpl : ScalarImpl)>>;
} // namespace Detail
#ifdef Vc_HAVE_LIBMVEC
Vc_INTRINSIC __m128 sin_dispatch(__m128 x) { return ::_ZGVbN4v_sinf(x); }
Vc_INTRINSIC __m128d sin_dispatch(__m128d x) { return ::_ZGVbN2v_sin (x); }
Vc_INTRINSIC __m128 cos_dispatch(__m128 x) { return ::_ZGVbN4v_cosf(x); }
Vc_INTRINSIC __m128d cos_dispatch(__m128d x) { return ::_ZGVbN2v_cos (x); }
#ifdef Vc_IMPL_AVX
Vc_INTRINSIC __m256 sin_dispatch(__m256 x) { return ::_ZGVdN8v_sinf(x); }
Vc_INTRINSIC __m256d sin_dispatch(__m256d x) { return ::_ZGVdN4v_sin (x); }
Vc_INTRINSIC __m256 cos_dispatch(__m256 x) { return ::_ZGVdN8v_cosf(x); }
Vc_INTRINSIC __m256d cos_dispatch(__m256d x) { return ::_ZGVdN4v_cos (x); }
#endif
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
{
return sin_dispatch(x.data());
}
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
{
return cos_dispatch(x.data());
}
#else
/**
* \ingroup Math
* Returns the sine of all input values in \p x.
*
* \param x The values to apply the sine function on.
*
* \returns the sine of \p x.
*
* \note The single-precision implementation has a precision of max. 2 ulp (mean 0.17 ulp)
* in the range [-8192, 8192].
* (testSin< float_v> with a maximal distance of 2 to the reference (mean: 0.310741))
*
* \note The double-precision implementation has a precision of max. 3 ulp (mean 1040 ulp)
* in the range [-8192, 8192].
* (testSin<double_v> with a maximal distance of 1 to the reference (mean: 0.170621))
*
* \note The precision and execution latency depends on:
* - `Abi` (e.g. Scalar uses the `<cmath>` implementation)
* - whether `Vc_HAVE_LIBMVEC` is defined
* - for the `<cmath>` fallback, the implementations differ (e.g. MacOS vs. Linux
* vs. Windows; fpmath=sse vs. fpmath=387)
*
* \note Vc versions before 1.4 had different precision.
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> sin(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::sin(x);
}
/**
* \ingroup Math
* Returns the cosine of all input values in \p x.
*
* \param x The values to apply the cosine function on.
* \returns the cosine of \p x.
*
* \note The single-precision implementation has a precision of max. 2 ulp (mean 0.18 ulp) in the range [-8192, 8192].
* \note The double-precision implementation has a precision of max. 3 ulp (mean 1160 ulp) in the range [-8192, 8192].
* \note Vc versions before 1.4 had different precision.
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> cos(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::cos(x);
}
#endif
/**
* \ingroup Math
* Returns the arcsine of all input values in \p x.
*
* \param x The values to apply the arcsine function on.
* \returns the arcsine of \p x.
*
* \note The single-precision implementation has an error of max. 2 ulp (mean 0.3 ulp).
* \note The double-precision implementation has an error of max. 36 ulp (mean 0.4 ulp).
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> asin(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::asin(x);
}
/**
* \ingroup Math
* Returns the arctangent of all input values in \p x.
*
* \param x The values to apply the arctangent function on.
* \returns the arctangent of \p x.
* \note The single-precision implementation has an error of max. 3 ulp (mean 0.4 ulp) in the range [-8192, 8192].
* \note The double-precision implementation has an error of max. 2 ulp (mean 0.1 ulp) in the range [-8192, 8192].
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan(const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::atan(x);
}
/**
* \ingroup Math
* Returns the arctangent of all input values in \p x and \p y.
*
* Calculates the angle given the lengths of the opposite and adjacent legs in a right
* triangle.
* \param y The opposite leg.
* \param x The adjacent leg.
* \returns the arctangent of \p y / \p x.
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vector<T, detail::not_fixed_size_abi<Abi>> atan2(const Vector<T, Abi> &y,
const Vector<T, Abi> &x)
{
return Detail::Trig<T, Abi>::atan2(y, x);
}
/**
* \ingroup Math
*
* \param x Input value to both sine and cosine.
* \param sin A non-null pointer to a potentially uninitialized object of type Vector.
* When \c sincos returns, `*sin` contains the result of `sin(x)`.
* \param cos A non-null pointer to a potentially uninitialized object of type Vector.
* When \c sincos returns, `*cos` contains the result of `cos(x)`.
*
* \see sin, cos
*/
template <typename T, typename Abi>
Vc_INTRINSIC void sincos(const Vector<T, Abi> &x,
Vector<T, detail::not_fixed_size_abi<Abi>> *sin,
Vector<T, Abi> *cos)
{
Detail::Trig<T, Abi>::sincos(x, sin, cos);
}
#endif
} // namespace Vc_VERSIONED_NAMESPACE
#endif // VC_COMMON_TRIGONOMETRIC_H_
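A minimal usage sketch of the math functions declared above (float_v is Vc's public single-precision vector type; which implementation is dispatched to depends on the target and on whether Vc_HAVE_LIBMVEC is defined):

#include <Vc/Vc>

Vc::float_v wave_sample(Vc::float_v phase)
{
    Vc::float_v s, c;
    Vc::sincos(phase, &s, &c);                   // computes both sine and cosine of phase
    return 0.75f * s + 0.25f * Vc::cos(2.f * phase);
}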

402
Vc/common/types.h Normal file

@ -0,0 +1,402 @@
/* This file is part of the Vc library. {{{
Copyright © 2012-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_TYPES_H_
#define VC_COMMON_TYPES_H_
#ifdef Vc_CHECK_ALIGNMENT
#include <cstdlib>
#include <cstdio>
#endif
#include <ratio>
#include "../global.h"
#include "../traits/type_traits.h"
#include "permutation.h"
namespace Vc_VERSIONED_NAMESPACE
{
///\addtogroup Utilities
///@{
/// \internal Allow writing \c size_t without the `std::` prefix.
using std::size_t;
/// long long shorthand
using llong = long long;
/// unsigned long long shorthand
using ullong = unsigned long long;
/// unsigned long shorthand
using ulong = unsigned long;
/// unsigned int shorthand
using uint = unsigned int;
/// unsigned short shorthand
using ushort = unsigned short;
/// unsigned char shorthand
using uchar = unsigned char;
/// signed char shorthand
using schar = signed char;
/**\internal
* Tag type for explicit zero-initialization
*/
struct VectorSpecialInitializerZero {};
/**\internal
* Tag type for explicit one-initialization
*/
struct VectorSpecialInitializerOne {};
/**\internal
* Tag type for explicit "iota-initialization"
*/
struct VectorSpecialInitializerIndexesFromZero {};
/**
* The special object \p Vc::Zero can be used to construct Vector and Mask objects
* initialized to zero/\c false.
*/
constexpr VectorSpecialInitializerZero Zero = {};
/**
* The special object \p Vc::One can be used to construct Vector and Mask objects
* initialized to one/\c true.
*/
constexpr VectorSpecialInitializerOne One = {};
/**
* The special object \p Vc::IndexesFromZero can be used to construct Vector objects
* initialized to values 0, 1, 2, 3, 4, ...
*/
constexpr VectorSpecialInitializerIndexesFromZero IndexesFromZero = {};
///@}
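// For example (using the public vector types):
//   Vc::float_v a(Vc::Zero);             // {0, 0, 0, ...}
//   Vc::float_v b(Vc::One);              // {1, 1, 1, ...}
//   Vc::int_v   i(Vc::IndexesFromZero);  // {0, 1, 2, 3, ...}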
namespace Detail
{
template<typename T> struct MayAliasImpl {
#ifdef Vc_ICC
#pragma warning(disable:2621)
#endif
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
#endif
typedef T type Vc_MAY_ALIAS;
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#ifdef Vc_ICC
#pragma warning(enable:2621)
#endif
};
//template<size_t Bytes> struct MayAlias<MaskBool<Bytes>> { typedef MaskBool<Bytes> type; };
} // namespace Detail
/**\internal
* Helper MayAlias<T> that turns T into the type to be used for an aliasing pointer. This
* adds the may_alias attribute to T (with compilers that support it). But for MaskBool this
* attribute is already part of the type and applying it a second time leads to warnings/errors,
* therefore MaskBool is simply forwarded as is.
*/
template <typename T> using MayAlias = typename Detail::MayAliasImpl<T>::type;
template <class To, class From> MayAlias<To> &aliasing_cast(From &x)
{
return *reinterpret_cast<MayAlias<To> *>(&x);
}
template <class To, class From> const MayAlias<To> &aliasing_cast(const From &x)
{
return *reinterpret_cast<const MayAlias<To> *>(&x);
}
template <class To, class From> MayAlias<To> *aliasing_cast(From *x)
{
return reinterpret_cast<MayAlias<To> *>(x);
}
template <class To, class From> const MayAlias<To> *aliasing_cast(const From *x)
{
return reinterpret_cast<const MayAlias<To> *>(x);
}
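// Example of the intended use (this is how Storage<..., AliasStrategy::MayAlias>
// in common/storage.h implements per-element access):
//   __m128 v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
//   float third = aliasing_cast<float>(&v)[2];   // 3.f, read through a may_alias pointer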
/**\internal
* This enumeration lists all possible operators in C++.
*
* The assignment and compound assignment enumerators are used with the conditional_assign
* implementation.
*/
enum class Operator : char {
Assign,
Multiply,
MultiplyAssign,
Divide,
DivideAssign,
Remainder,
RemainderAssign,
Plus,
PlusAssign,
Minus,
MinusAssign,
RightShift,
RightShiftAssign,
LeftShift,
LeftShiftAssign,
And,
AndAssign,
Xor,
XorAssign,
Or,
OrAssign,
PreIncrement,
PostIncrement,
PreDecrement,
PostDecrement,
LogicalAnd,
LogicalOr,
Comma,
UnaryPlus,
UnaryMinus,
UnaryNot,
UnaryOnesComplement,
CompareEqual,
CompareNotEqual,
CompareLess,
CompareGreater,
CompareLessEqual,
CompareGreaterEqual
};
// forward declaration for Vc::array in <Vc/array>
template <typename T, std::size_t N> struct array;
// forward declaration for Vc::span in <Vc/span>
namespace Common {
template <typename T, std::ptrdiff_t N> class span;
}
/* TODO: add type for half-float, something along these lines:
class half_float
{
uint16_t data;
public:
constexpr half_float() : data(0) {}
constexpr half_float(const half_float &) = default;
constexpr half_float(half_float &&) = default;
constexpr half_float &operator=(const half_float &) = default;
constexpr explicit half_float(float);
constexpr explicit half_float(double);
constexpr explicit half_float(int);
constexpr explicit half_float(unsigned int);
explicit operator float () const;
explicit operator double () const;
explicit operator int () const;
explicit operator unsigned int() const;
bool operator==(half_float rhs) const;
bool operator!=(half_float rhs) const;
bool operator>=(half_float rhs) const;
bool operator<=(half_float rhs) const;
bool operator> (half_float rhs) const;
bool operator< (half_float rhs) const;
half_float operator+(half_float rhs) const;
half_float operator-(half_float rhs) const;
half_float operator*(half_float rhs) const;
half_float operator/(half_float rhs) const;
};
*/
// TODO: the following doesn't really belong into the toplevel Vc namespace.
#ifndef Vc_CHECK_ALIGNMENT
template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *){}
#else
template<typename _T> static Vc_ALWAYS_INLINE void assertCorrectAlignment(const _T *ptr)
{
const size_t s = alignof(_T);
if((reinterpret_cast<size_t>(ptr) & ((s ^ (s & (s - 1))) - 1)) != 0) {
fprintf(stderr, "A vector with incorrect alignment has just been created. Look at the stacktrace to find the guilty object.\n");
abort();
}
}
#endif
namespace Common
{
// defined in common/simdarrayhelper.h
template <typename T, std::size_t Pieces, std::size_t Index> struct Segment;
/**
* \internal
*
* Helper interface to make m_indexes in InterleavedMemoryAccessBase behave like an integer vector.
* Only that the entries are successive entries from the given start index.
*/
template<size_t StructSize> class SuccessiveEntries
{
#ifdef Vc_MSVC
// scatterinterleavedmemory fails with garbage values in m_first if size_type is a
// 64-bit integer type. Using a 32-bit type seems to work around the miscompilation.
using size_type = unsigned;
#else
using size_type = size_t;
#endif
const size_type m_first;
public:
typedef SuccessiveEntries AsArg;
Vc_INTRINSIC SuccessiveEntries(size_type first) : m_first(first) {}
Vc_INTRINSIC Vc_PURE size_type operator[](size_type offset) const
{
return m_first + offset * StructSize;
}
Vc_INTRINSIC Vc_PURE size_type data() const { return m_first; }
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator+(const SuccessiveEntries &rhs) const
{
return SuccessiveEntries(m_first + rhs.m_first);
}
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator*(const SuccessiveEntries &rhs) const
{
return SuccessiveEntries(m_first * rhs.m_first);
}
Vc_INTRINSIC Vc_PURE SuccessiveEntries operator<<(size_type x) const
{
return {m_first << x};
}
friend Vc_INTRINSIC SuccessiveEntries &internal_data(SuccessiveEntries &x)
{
return x;
}
friend Vc_INTRINSIC const SuccessiveEntries &internal_data(const SuccessiveEntries &x)
{
return x;
}
};
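// Worked example: SuccessiveEntries<3> starting at 5 behaves like the index
// vector {5, 8, 11, ...}, i.e. operator[](i) returns 5 + i * 3 -- the offsets
// of one member in consecutive structs of three entries each.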
// declaration for functions in common/malloc.h
template <std::size_t alignment>
Vc_INTRINSIC_L void *aligned_malloc(std::size_t n) Vc_INTRINSIC_R;
Vc_ALWAYS_INLINE_L void free(void *p) Vc_ALWAYS_INLINE_R;
/**\internal
* Central definition of the type combinations that convert implicitly.
*/
template <typename Mask, typename T, typename U>
using enable_if_mask_converts_implicitly =
enable_if<(!std::is_same<Mask, Traits::decay<U>>::value && // that'd be the copy ctor
Traits::is_simd_mask<U>::value && !Traits::isSimdMaskArray<U>::value &&
Traits::is_implicit_cast_allowed_mask<
Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value)>;
/**\internal
* Central definition of the type combinations that only convert explicitly.
*/
template <typename T, typename U>
using enable_if_mask_converts_explicitly = enable_if<(
Traits::isSimdMaskArray<U>::value ||
(Traits::is_simd_mask<U>::value &&
!Traits::is_implicit_cast_allowed_mask<
Traits::entry_type_of<typename Traits::decay<U>::Vector>, T>::value))>;
/**\internal
* Tag type for overloading on the width (\VSize{T}) of a vector.
*/
template <typename T> using WidthT = std::integral_constant<std::size_t, sizeof(T)>;
// forward declaration of MaskBool in common/maskbool.h
template <std::size_t Bytes> class MaskBool;
// forward declaration of SubscriptOperation in common/subscript.h
template <typename T, typename IndexVector, typename Scale, bool>
class SubscriptOperation;
/**
* \internal
* Helper type to pass along the two arguments for a gather operation.
*
* \tparam IndexVector Normally an integer SIMD vector, but an array or std::vector also
* works (though often not as efficient).
*/
template <class T, class IndexVector, int Scale = 1>
struct GatherArguments {
static_assert(std::is_same<T, remove_cvref_t<T>>::value && !std::is_pointer<T>::value,
"GatherArguments expects an cv unqualified non-ref/ptr type");
const IndexVector indexes;
const T *const address;
};
template <int Scale, class T, class I>
GatherArguments<T, I, Scale> make_gather(const T *m, const I &i)
{
return {i, m};
}
/**
* \internal
* Helper type to pass along the two arguments for a scatter operation.
*
* \tparam IndexVector Normally an integer SIMD vector, but an array or std::vector also
* works (though often not as efficient).
*/
template <typename T, typename IndexVector> struct ScatterArguments
{
const IndexVector indexes;
T *const address;
};
/**\internal
* Break the recursion of the function below.
*/
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC enable_if<(Begin >= End), void> unrolled_loop(F &&)
{
}
/**\internal
* Force the code in the lambda \p f to be called with indexes starting from \p Begin up
* to (excluding) \p End to be called without compare and jump instructions (i.e. an
* unrolled loop).
*/
template <typename I, I Begin, I End, typename F>
Vc_INTRINSIC Vc_FLATTEN enable_if<(Begin < End), void> unrolled_loop(F &&f)
{
f(Begin);
unrolled_loop<I, Begin + 1, End>(f);
}
/**\internal
* Small simplification of the unrolled_loop call for ranges from 0 to \p Size using
* std::size_t as the index type.
*/
template <std::size_t Size, typename F> Vc_INTRINSIC void for_all_vector_entries(F &&f)
{
unrolled_loop<std::size_t, 0u, Size>(std::forward<F>(f));
}
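// Example: copy Size entries without a compare-and-jump in the generated code:
//   for_all_vector_entries<4>([&](std::size_t i) { dst[i] = src[i]; });
// expands to dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];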
} // namespace Common
} // namespace Vc
#include "vector.h"
#include "mask.h"
#include "memoryfwd.h"
#endif // VC_COMMON_TYPES_H_
// vim: foldmethod=marker

96
Vc/common/utility.h Normal file

@ -0,0 +1,96 @@
/* This file is part of the Vc library. {{{
Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_UTILITY_H_
#define VC_COMMON_UTILITY_H_
#include "macros.h"
namespace Vc_VERSIONED_NAMESPACE
{
namespace Common
{
/**
* \internal
* Returns the next power of 2 larger than or equal to \p x.
*/
template <size_t x, bool = (x & (x - 1)) == 0> struct NextPowerOfTwo;
template <size_t x>
struct NextPowerOfTwo<x, true> : public std::integral_constant<size_t, x> {
};
template <size_t x>
struct NextPowerOfTwo<x, false>
: public std::integral_constant<
size_t, NextPowerOfTwo<(x | (x >> 1) | (x >> 2) | (x >> 5)) + 1>::value> {
};
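/* Expected results (a compile-time sketch, not part of the original header):
 * \code
 * static_assert(Vc::Common::NextPowerOfTwo<3>::value == 4, "");
 * static_assert(Vc::Common::NextPowerOfTwo<4>::value == 4, "");
 * static_assert(Vc::Common::NextPowerOfTwo<5>::value == 8, "");
 * \endcode
 */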
/**
* \internal
* Enforce an upper bound on an alignment value. This is necessary because some compilers
* implement such an upper bound themselves and emit a warning when a larger alignment is requested.
*/
template <size_t A>
struct BoundedAlignment : public std::integral_constant<size_t,
#if defined Vc_MSVC || defined Vc_GCC
((A - 1) &
#ifdef Vc_MSVC
31
#elif defined __AVX__
255
#else
127
#endif
) + 1
#else
A
#endif
> {
};
/**
* \internal
* Returns the size of the left/first SimdArray member.
*/
template <std::size_t N> static constexpr std::size_t left_size()
{
return Common::NextPowerOfTwo<(N + 1) / 2>::value;
}
/**
* \internal
* Returns the size of the right/second SimdArray member.
*/
template <std::size_t N> static constexpr std::size_t right_size()
{
return N - left_size<N>();
}
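/* Expected results (a compile-time sketch): a SimdArray with 7 entries would be split into
 * a left part of 4 and a right part of 3 entries.
 * \code
 * static_assert(Vc::Common::left_size<7>() == 4, "");
 * static_assert(Vc::Common::right_size<7>() == 3, "");
 * \endcode
 */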
} // namespace Common
} // namespace Vc
#endif // VC_COMMON_UTILITY_H_
// vim: foldmethod=marker

857
Vc/common/vector.h Normal file
View File

@ -0,0 +1,857 @@
/* This file is part of the Vc library. {{{
Copyright © 2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_COMMON_VECTOR_H_
#define VC_COMMON_VECTOR_H_
#include <ratio>
#include "elementreference.h"
#include "types.h"
#include "vectorabi.h"
#include "vectortraits.h"
#include "simdarrayfwd.h"
#include "loadstoreflags.h"
#include "writemaskedvector.h"
#include "detail.h"
namespace Vc_VERSIONED_NAMESPACE
{
/**
* \ingroup Math
* Copies the sign(s) of \p sign to the value(s) in \p magnitude and returns the resulting
* vector.
*
* \param magnitude This vector's magnitude will be used in the return vector.
* \param sign This vector's sign bit will be used in the return vector.
*
* \return a value where the sign of the value equals the sign of \p sign. I.e.
* `sign(copysign(v, r)) == sign(r)`.
*/
template <typename T, typename Abi,
typename = enable_if<std::is_floating_point<T>::value &&
!detail::is_fixed_size_abi<Abi>::value>>
inline Vector<T, Abi> copysign(Vector<T, Abi> magnitude, Vector<T, Abi> sign);
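/* A minimal usage sketch (illustration only): combine the magnitude of one vector with the
 * sign of another.
 * \code
 * Vc::float_v mag(2.f);                    // [ 2,  2, ...]
 * Vc::float_v sgn(-1.f);                   // [-1, -1, ...]
 * Vc::float_v r = Vc::copysign(mag, sgn);  // [-2, -2, ...]
 * \endcode
 */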
/**
* \ingroup Math
* Extracts the exponent of each floating-point vector component.
*
* \param x The vector of values to check for the sign.
* \return the exponent to base 2.
*
* This function provides efficient access to the exponent of the floating-point number. The
* returned value is a fast approximation of the logarithm to base 2. The absolute error of that
* approximation is in the range [0, 1[.
*
* Examples:
\verbatim
value | exponent | log2
=======|==========|=======
1.0 | 0 | 0
2.0 | 1 | 1
3.0 | 1 | 1.585
3.9 | 1 | 1.963
4.0 | 2 | 2
4.1 | 2 | 2.036
\endverbatim
*
* \warning This function assumes a positive value (non-zero). If the value is negative the sign bit will
* modify the returned value. An input value of zero will return the bias of the floating-point
* representation. If you compile with Vc runtime checks enabled, the function asserts that
* all values are greater than or equal to zero.
*
* You may use abs to apply this function to negative values:
* \code
* exponent(abs(v))
* \endcode
*/
template <typename T, typename Abi,
typename = enable_if<std::is_floating_point<T>::value &&
!detail::is_fixed_size_abi<Abi>::value>>
inline Vector<T, Abi> exponent(Vector<T, Abi> x);
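/* A minimal usage sketch (illustration only), following the table above:
 * \code
 * Vc::float_v x(8.f);               // 8.0 == 2^3
 * Vc::float_v e = Vc::exponent(x);  // [3, 3, ...]
 * \endcode
 */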
/**
* \ingroup Math
* Returns for each vector component whether it stores a negative value.
*
* \param x The vector of values to check for the sign.
* \returns a mask which is \c true only in those components that are negative in \p x.
*/
template <typename T, typename Abi>
Vc_INTRINSIC Vc_CONST typename Vector<T, detail::not_fixed_size_abi<Abi>>::MaskType
isnegative(Vector<T, Abi> x)
{
return x < Vector<T, Abi>::Zero();
}
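/* A minimal usage sketch (illustration only):
 * \code
 * Vc::float_v v(-1.f);
 * auto m = Vc::isnegative(v);  // a mask that is true in every component
 * \endcode
 */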
/**
* \class Vector types.h <Vc/vector.h>
* \ingroup Vectors
*
* The main vector class for expressing data parallelism.
*
* The commonly used vector types (Vc::float_v, Vc::double_v, Vc::int_v, etc., see below)
* are specializations of this class.
* For most cases there are no API differences for the specializations.
* Make use of Vector<T> for generic programming, otherwise you might prefer to use
* the \p *_v aliases.
*
* \see Vc::float_v, Vc::double_v, Vc::int_v, Vc::uint_v, Vc::short_v, Vc::ushort_v
* \see Mask
*/
template<typename T, typename Abi = VectorAbi::Best<T>> class Vector
{
public:
/**
* Returns the number of scalar components (\VSize{T}) in a vector of this type.
*
* The size of the vector, i.e. the number of scalar entries in the vector. Do not
* make any assumptions about the size of vectors. If you need vectors of \c float and
* \c int types with matching size, use Vector::IndexType or SimdArray.
*
* You can easily use if clauses to compare Vector sizes. The compiler can
* statically evaluate and fully optimize dead code away (very much like \#ifdef, but
* with syntax checking).
*
* \returns The number of components (i.e. \VSize{T}) objects of this vector type
* store and manipulate.
*/
static constexpr size_t size() { return VectorTraits<T, Abi>::size(); }
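// A sketch of the compile-time branching mentioned above (illustration only):
//   if (Vc::float_v::size() >= 4) { /* path that assumes at least 4 entries */ }
//   else                          { /* fallback for narrower vectors */ }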
/**
* Specifies the alignment requirement for aligned load and store calls for objects of
* this vector type.
*/
static constexpr size_t MemoryAlignment = VectorTraits<T, Abi>::memoryAlignment();
/// The ABI tag type of the current template instantiation.
using abi = Abi;
/// The type of the entries in the vector.
using EntryType = typename VectorTraits<T, Abi>::EntryType;
/// \copydoc EntryType
using value_type = EntryType;
using VectorEntryType = typename VectorTraits<T, Abi>::VectorEntryType;
/**\internal
* This type reveals the implementation-specific type used for the data member.
*/
using VectorType = typename VectorTraits<T, Abi>::VectorType;
/**\internal
* \copydoc VectorType
*/
using vector_type = VectorType;
/// The type of the mask used for masked operations and returned from comparisons.
using MaskType = Vc::Mask<T, Abi>;
/// \copydoc MaskType
using mask_type = MaskType;
using MaskArgument = MaskType;
using VectorArgument = Vector;
/// The type of the vector used for indexes in gather and scatter operations.
using IndexType = Vc::fixed_size_simd<int, VectorTraits<T, Abi>::size()>;
/// \copydoc IndexType
using index_type = IndexType;
using reference = Detail::ElementReference<Vector>;
/// \name Generators
///@{
/**
* Returns a vector with the entries initialized to zero.
*/
static inline Vector Zero();
/**
* Returns a vector with the entries initialized to one.
*/
static inline Vector One();
/**
* Returns a vector with the entries initialized to 0, 1, 2, 3, 4, 5, ...
*/
static inline Vector IndexesFromZero();
/**
* Returns a vector with pseudo-random entries.
*
* Currently the state of the random number generator cannot be modified and starts
* off with the same state. Thus you will get the same sequence of numbers for the
* same sequence of calls.
*
* \return a new random vector. Floating-point values will be in the 0-1 range.
* Integers will use the full range the integer representation allows.
*
* \note This function may use a very small amount of state and thus will be a weak
* random number generator.
*/
static inline Vector Random();
/// Generate a vector object from return values of \p gen (static variant of \ref fill).
template <typename G> static inline Vector generate(G gen);
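// A minimal sketch (illustration only): like fill, generate builds the vector from the
// functor's return values, the argument being the entry index, e.g.
//   float_v ramp = float_v::generate([](int i) { return 0.5f * i; }); // [0, 0.5, 1, ...]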
///@}
/// \name Compile-Time Constant Initialization
///@{
/**
* Construct a zero-initialized vector object.
*
* This constructor follows the behavior of the underlying arithmetic type \p T in
* that the expression `T()` zero-initializes the object. On the other hand the
* variable \c x in `T x;` is uninitialized.
* Since, for class types, both expressions call the default constructor, `Vector<T> x`
* must zero-initialize \c x as well.
*/
inline Vector() = default;
/**
* Construct a vector with the entries initialized to zero.
*
* \see Vc::Zero, Zero()
*/
explicit inline Vector(VectorSpecialInitializerZero);
/**
* Construct a vector with the entries initialized to one.
*
* \see Vc::One, One()
*/
explicit inline Vector(VectorSpecialInitializerOne);
/**
* Construct a vector with the entries initialized to 0, 1, 2, 3, 4, 5, ...
*
* \see Vc::IndexesFromZero, IndexesFromZero()
*/
explicit inline Vector(VectorSpecialInitializerIndexesFromZero);
///@}
/// \name Conversion/Broadcast Constructors
///@{
/**
* Implicit conversion from compatible Vector<U, Abi> types.
*/
template <typename U>
inline Vector(Vector<U, abi> x,
enable_if<Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
#if Vc_IS_VERSION_1
/**
* Explicit conversion (i.e. `static_cast`) from the remaining Vector<U, Abi> types.
*
* \param x A vector object to use for initialization of the new vector object. If \p
* x contains more entries than the new object the high components will be
* ignored. If \p x contains fewer entries than the new object the high
* components of the new object will be zero-initialized. Type conversion is
* done according to the standard conversion rules for the underlying
* fundamental arithmetic types.
*/
template <typename U>
Vc_DEPRECATED("use simd_cast instead of explicit type casting to convert between "
"vector types") inline explicit Vector(
Vector<U, abi> x,
enable_if<!Traits::is_implicit_cast_allowed<U, T>::value> = nullarg);
#endif
/**
* Broadcast Constructor.
*
* Constructs a vector with all entries of the vector filled with the given value.
*
* \param a The scalar value to broadcast to all entries of the constructed vector.
*/
inline Vector(EntryType a);
template <typename U>
inline Vector(U a, enable_if<std::is_same<U, int>::value &&
!std::is_same<U, EntryType>::value> = nullarg);
inline explicit Vector(reference a);
///@}
/**
* \name Loads & Stores
*/
///@{
#include "../common/loadinterface.h"
#include "../common/storeinterface.h"
///@}
/**
* Set all entries to zero.
*/
inline void setZero();
/**
* Set all entries to zero where the mask is set.
*
* A 4-vector with a mask of `[0111]` therefore would set the last three entries to 0.
*
* \param mask Selects the entries to be set to zero.
*/
inline void setZero(MaskType mask);
/**
* Set all entries to zero where the mask is not set.
*
* A 4-vector with a mask of `[0111]` therefore would set only the first entry to 0.
*
* \param mask Selects the entries to not be set to zero.
*/
inline void setZeroInverted(MaskType mask);
/**
* Set all entries to the bit representation of a QNaN.
*/
inline void setQnan();
/**
* Set all entries to the bit representation of a QNaN where the mask is set.
*
* \param mask Selects the entries to be set to QNaN.
*/
inline void setQnan(MaskType mask);
#define Vc_CURRENT_CLASS_NAME Vector
#include "../common/gatherinterface.h"
#include "../common/scatterinterface.h"
#undef Vc_CURRENT_CLASS_NAME
/// \name Scalar Subscript Operators
///@{
/**
* This operator can be used to modify scalar entries of the vector.
*
* \param index A value between 0 and Size. This value is not checked internally, so
* you must ensure it is in range.
*
* \return a reference to the vector entry at the given \p index.
*
* \warning The use of this function may result in suboptimal performance. Please
* check whether you can find a more vector-friendly way to do what you
* intended.
* \note the returned object models the concept of a reference and
* as such it can exist longer than the data it is referencing.
* \note to avoid lifetime issues, we strongly advise not to store
* any reference objects.
*/
inline reference operator[](size_t index) noexcept;
/**
* This operator can be used to read scalar entries of the vector.
*
* \param index A value between 0 and Size. This value is not checked internally, so
* you must ensure it is in range.
*
* \return a copy of the vector entry at the given \p index.
*/
inline EntryType operator[](size_t index) const noexcept;
///@}
/// \name Unary Operators
///@{
/**
* Determine where the vector is null.
*
* \returns a mask which denotes the zero entries of this vector object.
*/
inline MaskType operator!() const;
/**
* Inverts all bits.
*
* \returns a new vector which has all bits inverted. I.e. `(v & ~v) == 0`.
*
* \note This operator is only defined for integral types \p T.
*/
inline Vector operator~() const;
/// Returns a new vector object with all entries negated.
inline Vector operator-() const;
/// Returns a copy of the vector object.
inline Vector operator+() const;
///@}
/**
* \name Increment and Decrement Operators
* The increment and decrement operators apply the increment/decrement operation per
* component.
*
* The semantics are equal to the semantics of the fundamental arithmetic type \p T.
*
* \note Over-/Underflow of signed integral types is undefined behavior and may
* actually break your code.
*/
///@{
inline Vector &operator++(); // prefix
inline Vector operator++(int); // postfix
inline Vector &operator--(); // prefix
inline Vector operator--(int); // postfix
///@}
#define Vc_OP(symbol) \
inline Vc_PURE Vector operator symbol(const Vector &x) const;
/**
* \name Arithmetic Operations
*
* The arithmetic operations are implemented as component-wise
* application of the operator on the two vector objects.
*
* Example:
* \code
* void foo(float_v a, float_v b) {
* const float_v product = a * b;
* const float_v difference = a - b;
* a += b;
* auto quotient = a / b;
* auto modulo = static_cast<int_v>(a) % static_cast<int_v>(b);
* }
* \endcode
*
* \param x The vector to add, subtract, multiply, or divide by.
* \returns A vector object of the same type with the components filled according to a
* component-wise application of the operator.
*
* \note If a signed integral vector operation overflows, the result is undefined
* (which is in agreement with the behavior of the fundamental signed integral types
* in C++).
*/
///@{
Vc_ALL_ARITHMETICS(Vc_OP);
///@}
/**
* \name Binary Operations
*
* The binary operations are implemented as component-wise
* application of the operator on the two vector objects.
*
* Example:
* \code
* void foo(int_v a, int_v b) {
* const int_v combined_bits = a | b;
* const int_v masked_bits = a & b;
* a ^= b; // flipped bits
* }
* \endcode
*
* \returns A vector object of the same type with the components filled according to a
* component-wise application of the operator.
*/
///@{
Vc_ALL_BINARY(Vc_OP);
///@}
/**
* \name Shift Operations
*
* The shift operations are implemented as component-wise
* application of the operator on the two vector objects.
*
* Example:
* \code
* void foo(int_v a, int_v b) {
* const int_v right = a >> b;
* a <<= b;
* }
* \endcode
*
* \returns A vector object of the same type with the components filled according to a
* component-wise application of the operator.
*/
///@{
Vc_ALL_SHIFTS(Vc_OP);
///@}
#undef Vc_OP
/**
* \name Comparisons
*
* All comparison operators return a mask object.
*
* Example:
* \code
* void foo(const float_v &a, const float_v &b) {
* const float_m mask = a < b;
* ...
* }
* \endcode
*
* \param x The vector to compare against.
* \returns A mask object. Its components contain the boolean results of the
* component-wise compare operation.
*/
///@{
#define Vc_CMP_OP(symbol) inline Vc_PURE MaskType operator symbol(const Vector &x) const;
Vc_ALL_COMPARES(Vc_CMP_OP);
#undef Vc_CMP_OP
///@}
/**
* Writemask the vector before an assignment.
*
* \param mask The writemask to be used.
*
* \return an object that can be used for any kind of masked assignment.
*
* The returned object is only to be used for assignments and should not be assigned
* to a variable.
*
* Examples:
* \code
* float_v v = float_v::Zero(); // v = [0, 0, 0, 0]
* int_v v2 = int_v::IndexesFromZero(); // v2 = [0, 1, 2, 3]
* v(v2 < 2) = 1.f; // v = [1, 1, 0, 0]
* v(v2 < 3) += 1.f; // v = [2, 2, 1, 0]
* ++v2(v < 1.f); // v2 = [0, 1, 2, 4]
* \endcode
*/
inline Common::WriteMaskedVector<Vector, MaskType> operator()(MaskType mask);
/**
* \name Horizontal Reduction Operations
*
* Horizontal operations can be used to reduce the values of a vector to a scalar
* value.
*
* Example:
* \code
* void foo(const float_v &v) {
* float min = v.min(); // smallest value in v
* float sum = v.sum(); // sum of all values in v
* }
* \endcode
*/
///@{
/// Returns the smallest entry in the vector.
inline EntryType min() const;
/// Returns the largest entry in the vector.
inline EntryType max() const;
/// Returns the product of all entries in the vector.
inline EntryType product() const;
/// Returns the sum of all entries in the vector.
inline EntryType sum() const;
/// Returns a vector containing the sum of all entries with smaller index.
inline Vector partialSum() const;
/// Returns the smallest entry of the vector components selected by \p mask.
inline EntryType min(MaskType mask) const;
/// Returns the largest entry of the vector components selected by \p mask.
inline EntryType max(MaskType mask) const;
/// Returns the product of the vector components selected by \p mask.
inline EntryType product(MaskType mask) const;
/// Returns the sum of the vector components selected by \p mask.
inline EntryType sum(MaskType mask) const;
///@}
/**
* \name Shift and Rotate
*
* These functions allow shifting or rotating the entries in a vector.
*
* All functions with an \p amount parameter support positive and negative numbers for
* the shift/rotate value.
*
* Example:
* \code
* using namespace Vc;
* int_v foo = int_v::IndexesFromZero() + 1; // e.g. [1, 2, 3, 4] with SSE
* int_v x;
* x = foo.shifted( 1); // [2, 3, 4, 0]
* x = foo.shifted( 2); // [3, 4, 0, 0]
* x = foo.shifted( 3); // [4, 0, 0, 0]
* x = foo.shifted( 4); // [0, 0, 0, 0]
* x = foo.shifted(-1); // [0, 1, 2, 3]
* x = foo.shifted(-2); // [0, 0, 1, 2]
* x = foo.shifted(-3); // [0, 0, 0, 1]
* x = foo.shifted(-4); // [0, 0, 0, 0]
*
* x = foo.rotated( 1); // [2, 3, 4, 1]
* x = foo.rotated( 2); // [3, 4, 1, 2]
* x = foo.rotated( 3); // [4, 1, 2, 3]
* x = foo.rotated( 4); // [1, 2, 3, 4]
* x = foo.rotated(-1); // [4, 1, 2, 3]
* x = foo.rotated(-2); // [3, 4, 1, 2]
* x = foo.rotated(-3); // [2, 3, 4, 1]
* x = foo.rotated(-4); // [1, 2, 3, 4]
* \endcode
*
* These functions are slightly related to the above swizzles. In any case, they are
* often useful for communication between SIMD lanes or binary decoding operations.
*
* \warning Use of these functions leads to less portable code. Consider the scalar
* implementation where every vector has only one entry. The shift and rotate
* functions have no useful task to fulfil there and you will almost certainly not get
* any useful results. It is recommended to add a static_assert for the assumed
* minimum vector size.
*/
///@{
/// Shift vector entries to the left by \p amount; shifting in zeros.
inline Vector shifted(int amount) const;
/**
* Shift vector entries to the left by \p amount; shifting in values from shiftIn
* (instead of zeros).
*
* This function can be used to create vectors from unaligned memory locations.
*
* Example:
* \code
* Vc::Memory<int_v, 256> mem;
* for (int i = 0; i < 256; ++i) { mem[i] = i + 1; }
* int_v a = mem.vectorAt(0);
* int_v b = mem.vectorAt(int_v::Size);
* int_v x = a.shifted(1, b);
* // now x == mem.vectorAt(1, Vc::Unaligned)
* \endcode
*
* \param amount The number of entries to shift by. \p amount must be between \c
* -Size and \c Size, otherwise the result is undefined.
* \param shiftIn The vector of values to shift in.
* \return A new vector with values from \p this and \p shiftIn concatenated
* and then shifted by \p amount.
*/
inline Vector shifted(int amount, Vector shiftIn) const;
/// Rotate vector entries to the left by \p amount.
inline Vector rotated(int amount) const;
/// Returns a vector with all components reversed.
inline Vector reversed() const;
///@}
/**
* Return a sorted copy of the vector.
*
* \returns a sorted vector. The returned values are in ascending order:
\verbatim
v[0] <= v[1] <= v[2] <= v[3] ...
\endverbatim
*
* \note If the vector contains NaNs the result is undefined.
*
* Example:
* \code
* int_v v = int_v::Random();
* int_v s = v.sorted();
* std::cout << v << '\n' << s << '\n';
* \endcode
*
* With SSE the output would be:
*
\verbatim
[1513634383, -963914658, 1763536262, -1285037745]
[-1285037745, -963914658, 1513634383, 1763536262]
\endverbatim
*
* With the Scalar implementation:
\verbatim
[1513634383]
[1513634383]
\endverbatim
*/
inline Vector sorted() const;
/*!
* \name Apply/Call/Fill Functions
*
* There are still many situations where the code needs to switch from SIMD operations
* to scalar execution. In this case you can, of course, rely on operator[]. But there
* are also a number of functions that can help with common patterns.
*
* The apply functions expect a function that returns a scalar value, i.e. a function
* of the form "T f(T)". The call functions do not return a value and thus the
* function passed does not need a return value. The fill functions are used to
* serially set the entries of the vector from the return values of a function.
*
* Example:
* \code
* void foo(float_v v) {
* float_v logarithm = v.apply(std::log);
* float_v exponential = v.apply(std::exp);
* }
* \endcode
*
* Of course, you can also use lambdas here:
* \code
* float_v power = v.apply([](float f) { return std::pow(f, 0.6f); });
* \endcode
*
* \param f A functor: this can either be a function or an object that implements
* operator().
*/
///@{
/// Call \p f sequentially with the vector's entries, from the minimum up to the maximum value.
template <typename F> void callWithValuesSorted(F &&f);
/// Call \p f with the scalar entries of the vector.
template <typename F> inline void call(F &&f) const;
/// As above, but skip the entries where \p mask is not set.
template <typename F> inline void call(F &&f, MaskType mask) const;
/// Call \p f on every entry of the vector and return the results as a new vector.
template <typename F> inline Vector apply(F &&f) const;
/// As above, but skip the entries where \p mask is not set.
template <typename F> inline Vector apply(F &&f, MaskType mask) const;
/// Fill the vector with the values [f(0), f(1), f(2), ...].
template <typename IndexT> inline void fill(EntryType(&f)(IndexT));
/// Fill the vector with the values [f(), f(), f(), ...].
inline void fill(EntryType(&f)());
///@}
/**\internal
* Interleaves this vector and \p x and returns the resulting low vector.
* Used to implement Vc::interleave.
*/
inline Vector interleaveLow(Vector x) const;
/**\internal
* Interleaves this vector and \p x and returns the resulting high vector.
* Used to implement Vc::interleave.
*/
inline Vector interleaveHigh(Vector x) const;
/**\internal
* Assigns the components of \p v where \p m is \c true.
*/
inline void assign(const Vector &v, const MaskType &m);
/**
* \internal
* \name Internal Data Access
* Returns a (const) reference to the internal data member storing the vector data.
*/
///@{
inline VectorType &data();
inline const VectorType &data() const;
///@}
/// \name Deprecated Members
///@{
/**
* Returns the exponents of the floating-point values in the vector.
*
* \return A new vector object of the same type containing the exponents.
*
* \deprecated use Vc::exponent instead.
*/
Vc_DEPRECATED("use exponent(x) instead") inline Vector exponent() const;
/**
* Returns whether a value is negative.
*
* \return A new mask object indicating the sign of each vector element.
*
* \deprecated use Vc::isnegative instead.
*/
Vc_DEPRECATED("use isnegative(x) instead") inline MaskType isNegative() const;
///\copydoc size
///\deprecated Use Vc::Vector::size instead.
static constexpr size_t Size = VectorTraits<T, Abi>::size();
/**
* Casts the current object to \p V2.
*
* \returns a converted object of type \p V2.
*
* \deprecated Use Vc::simd_cast instead.
*/
template <typename V2> inline V2 staticCast() const;
/**
* reinterpret_cast the vector components to construct a vector of type \p V2.
*
* \returns An object of type \p V2 with the same bit-representation.
*
* \deprecated use Vc::reinterpret_components_cast instead.
*/
template <typename V2>
Vc_DEPRECATED("use reinterpret_components_cast instead") inline V2
reinterpretCast() const;
/**
* Copies the signs of the components of \p reference to the components of the current
* vector, returning the result.
*
* \param reference A vector object that determines the sign of the result.
* \returns A new vector with sign taken from \p reference and absolute value taken
* from the current vector object.
*
* \deprecated Use Vc::copysign instead.
*/
Vc_DEPRECATED("use copysign(x, y) instead") inline Vector
copySign(Vector reference) const;
///@}
Vc_FREE_STORE_OPERATORS_ALIGNED(alignof(Vector));
private:
VectorType d;
};
/**
* \ingroup Utilities
* Constructs a new Vector object of type \p V from the Vector \p x, reinterpreting the
* bits of \p x for the new type \p V.
*
* This function is only applicable if:
* - the \c sizeof of the input and output types is equal
* - the Vector::size() of the input and output types is equal
* - the \c VectorEntryTypes of input and output have equal \c sizeof
*
* \tparam V The requested type to change \p x into.
* \param x The Vector to reinterpret as an object of type \p V.
* \returns A new object (rvalue) of type \p V.
*
* \warning This cast is non-portable since the applicability (see above) may change
* depending on the default vector types of the target platform. The function is perfectly
* safe to use with fully specified \p Abi, though.
*/
template <typename V, typename T, typename Abi>
Vc_ALWAYS_INLINE Vc_CONST enable_if<
(V::size() == Vector<T, Abi>::size() &&
sizeof(typename V::VectorEntryType) ==
sizeof(typename Vector<T, Abi>::VectorEntryType) &&
sizeof(V) == sizeof(Vector<T, Abi>) && alignof(V) <= alignof(Vector<T, Abi>)),
V>
reinterpret_components_cast(const Vector<T, Abi> &x)
{
return reinterpret_cast<const V &>(x);
}
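/* A minimal usage sketch (illustration only; assumes Vc::uint_v and Vc::float_v have the
 * same number of components and the same sizeof on the target, which holds for the usual
 * ABIs since both entry types are 32 bits wide):
 * \code
 * Vc::float_v x(1.f);
 * auto bits = Vc::reinterpret_components_cast<Vc::uint_v>(x);  // 0x3f800000 in every entry
 * \endcode
 */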
#define Vc_OP(symbol) \
template <typename T, typename Abi> \
inline Vector<T, Abi> &operator symbol##=(Vector<T, Abi> &, \
const Vector<T, Abi> &x);
//Vc_ALL_ARITHMETICS(Vc_OP);
//Vc_ALL_BINARY(Vc_OP);
//Vc_ALL_SHIFTS(Vc_OP);
#undef Vc_OP
} // namespace Vc
#endif // VC_COMMON_VECTOR_H_
// vim: foldmethod=marker

Some files were not shown because too many files have changed in this diff.