forked from openkylin/rust-ppv-lite86
Import Upstream version 0.2.16
This commit is contained in:
commit
e957653932
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"git": {
|
||||
"sha1": "4b1e1d655d05c9da29aa833ce705feedb3da760b"
|
||||
},
|
||||
"path_in_vcs": "utils-simd/ppv-lite86"
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
# Changelog
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [0.2.16]
|
||||
### Added
|
||||
- add [u64; 4] conversion for generic vec256, to support BLAKE on non-x86.
|
||||
- impl `From` (rather than just `Into`) for conversions between `*_storage` types and arrays.
|
|
@ -0,0 +1,31 @@
|
|||
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
|
||||
#
|
||||
# When uploading crates to the registry Cargo will automatically
|
||||
# "normalize" Cargo.toml files for maximal compatibility
|
||||
# with all versions of Cargo and also rewrite `path` dependencies
|
||||
# to registry (e.g., crates.io) dependencies.
|
||||
#
|
||||
# If you are reading this file be aware that the original Cargo.toml
|
||||
# will likely look very different (and much more reasonable).
|
||||
# See Cargo.toml.orig for the original contents.
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.16"
|
||||
authors = ["The CryptoCorrosion Contributors"]
|
||||
description = "Implementation of the crypto-simd API for x86"
|
||||
keywords = ["crypto", "simd", "x86"]
|
||||
categories = ["cryptography", "no-std"]
|
||||
license = "MIT/Apache-2.0"
|
||||
repository = "https://github.com/cryptocorrosion/cryptocorrosion"
|
||||
|
||||
[dependencies]
|
||||
|
||||
[features]
|
||||
default = ["std"]
|
||||
no_simd = []
|
||||
simd = []
|
||||
std = []
|
||||
[badges.travis-ci]
|
||||
repository = "cryptocorrosion/cryptocorrosion"
|
|
@ -0,0 +1,21 @@
|
|||
[package]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.16"
|
||||
authors = ["The CryptoCorrosion Contributors"]
|
||||
edition = "2018"
|
||||
license = "MIT/Apache-2.0"
|
||||
description = "Implementation of the crypto-simd API for x86"
|
||||
repository = "https://github.com/cryptocorrosion/cryptocorrosion"
|
||||
keywords = ["crypto", "simd", "x86"]
|
||||
categories = ["cryptography", "no-std"]
|
||||
|
||||
[dependencies]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "cryptocorrosion/cryptocorrosion" }
|
||||
|
||||
[features]
|
||||
default = ["std"]
|
||||
std = []
|
||||
simd = [] # deprecated
|
||||
no_simd = [] # for weird platforms like "x86_64 without SSE2"
|
|
@ -0,0 +1,201 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright 2019 The CryptoCorrosion Contributors
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -0,0 +1,25 @@
|
|||
Copyright (c) 2019 The CryptoCorrosion Contributors
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
|
@ -0,0 +1,866 @@
|
|||
#![allow(non_camel_case_types)]
|
||||
|
||||
use crate::soft::{x2, x4};
|
||||
use crate::types::*;
|
||||
use core::ops::*;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Copy)]
|
||||
pub union vec128_storage {
|
||||
d: [u32; 4],
|
||||
q: [u64; 2],
|
||||
}
|
||||
impl From<[u32; 4]> for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn from(d: [u32; 4]) -> Self {
|
||||
Self { d }
|
||||
}
|
||||
}
|
||||
impl From<vec128_storage> for [u32; 4] {
|
||||
#[inline(always)]
|
||||
fn from(d: vec128_storage) -> Self {
|
||||
unsafe { d.d }
|
||||
}
|
||||
}
|
||||
impl From<[u64; 2]> for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn from(q: [u64; 2]) -> Self {
|
||||
Self { q }
|
||||
}
|
||||
}
|
||||
impl From<vec128_storage> for [u64; 2] {
|
||||
#[inline(always)]
|
||||
fn from(q: vec128_storage) -> Self {
|
||||
unsafe { q.q }
|
||||
}
|
||||
}
|
||||
impl Default for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn default() -> Self {
|
||||
Self { q: [0, 0] }
|
||||
}
|
||||
}
|
||||
impl Eq for vec128_storage {}
|
||||
impl PartialEq<vec128_storage> for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn eq(&self, rhs: &Self) -> bool {
|
||||
unsafe { self.q == rhs.q }
|
||||
}
|
||||
}
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Default)]
|
||||
pub struct vec256_storage {
|
||||
v128: [vec128_storage; 2],
|
||||
}
|
||||
impl vec256_storage {
|
||||
#[inline(always)]
|
||||
pub fn new128(v128: [vec128_storage; 2]) -> Self {
|
||||
Self { v128 }
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn split128(self) -> [vec128_storage; 2] {
|
||||
self.v128
|
||||
}
|
||||
}
|
||||
impl From<vec256_storage> for [u64; 4] {
|
||||
#[inline(always)]
|
||||
fn from(q: vec256_storage) -> Self {
|
||||
let [a, b]: [u64; 2] = q.v128[0].into();
|
||||
let [c, d]: [u64; 2] = q.v128[1].into();
|
||||
[a, b, c, d]
|
||||
}
|
||||
}
|
||||
impl From<[u64; 4]> for vec256_storage {
|
||||
#[inline(always)]
|
||||
fn from([a, b, c, d]: [u64; 4]) -> Self {
|
||||
Self {
|
||||
v128: [[a, b].into(), [c, d].into()],
|
||||
}
|
||||
}
|
||||
}
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Default)]
|
||||
pub struct vec512_storage {
|
||||
v128: [vec128_storage; 4],
|
||||
}
|
||||
impl vec512_storage {
|
||||
#[inline(always)]
|
||||
pub fn new128(v128: [vec128_storage; 4]) -> Self {
|
||||
Self { v128 }
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn split128(self) -> [vec128_storage; 4] {
|
||||
self.v128
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn dmap<T, F>(t: T, f: F) -> T
|
||||
where
|
||||
T: Store<vec128_storage> + Into<vec128_storage>,
|
||||
F: Fn(u32) -> u32,
|
||||
{
|
||||
let t: vec128_storage = t.into();
|
||||
let d = unsafe { t.d };
|
||||
let d = vec128_storage {
|
||||
d: [f(d[0]), f(d[1]), f(d[2]), f(d[3])],
|
||||
};
|
||||
unsafe { T::unpack(d) }
|
||||
}
|
||||
|
||||
fn dmap2<T, F>(a: T, b: T, f: F) -> T
|
||||
where
|
||||
T: Store<vec128_storage> + Into<vec128_storage>,
|
||||
F: Fn(u32, u32) -> u32,
|
||||
{
|
||||
let a: vec128_storage = a.into();
|
||||
let b: vec128_storage = b.into();
|
||||
let ao = unsafe { a.d };
|
||||
let bo = unsafe { b.d };
|
||||
let d = vec128_storage {
|
||||
d: [
|
||||
f(ao[0], bo[0]),
|
||||
f(ao[1], bo[1]),
|
||||
f(ao[2], bo[2]),
|
||||
f(ao[3], bo[3]),
|
||||
],
|
||||
};
|
||||
unsafe { T::unpack(d) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn qmap<T, F>(t: T, f: F) -> T
|
||||
where
|
||||
T: Store<vec128_storage> + Into<vec128_storage>,
|
||||
F: Fn(u64) -> u64,
|
||||
{
|
||||
let t: vec128_storage = t.into();
|
||||
let q = unsafe { t.q };
|
||||
let q = vec128_storage {
|
||||
q: [f(q[0]), f(q[1])],
|
||||
};
|
||||
unsafe { T::unpack(q) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn qmap2<T, F>(a: T, b: T, f: F) -> T
|
||||
where
|
||||
T: Store<vec128_storage> + Into<vec128_storage>,
|
||||
F: Fn(u64, u64) -> u64,
|
||||
{
|
||||
let a: vec128_storage = a.into();
|
||||
let b: vec128_storage = b.into();
|
||||
let ao = unsafe { a.q };
|
||||
let bo = unsafe { b.q };
|
||||
let q = vec128_storage {
|
||||
q: [f(ao[0], bo[0]), f(ao[1], bo[1])],
|
||||
};
|
||||
unsafe { T::unpack(q) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn o_of_q(q: [u64; 2]) -> u128 {
|
||||
u128::from(q[0]) | (u128::from(q[1]) << 64)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn q_of_o(o: u128) -> [u64; 2] {
|
||||
[o as u64, (o >> 64) as u64]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn omap<T, F>(a: T, f: F) -> T
|
||||
where
|
||||
T: Store<vec128_storage> + Into<vec128_storage>,
|
||||
F: Fn(u128) -> u128,
|
||||
{
|
||||
let a: vec128_storage = a.into();
|
||||
let ao = o_of_q(unsafe { a.q });
|
||||
let o = vec128_storage { q: q_of_o(f(ao)) };
|
||||
unsafe { T::unpack(o) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn omap2<T, F>(a: T, b: T, f: F) -> T
|
||||
where
|
||||
T: Store<vec128_storage> + Into<vec128_storage>,
|
||||
F: Fn(u128, u128) -> u128,
|
||||
{
|
||||
let a: vec128_storage = a.into();
|
||||
let b: vec128_storage = b.into();
|
||||
let ao = o_of_q(unsafe { a.q });
|
||||
let bo = o_of_q(unsafe { b.q });
|
||||
let o = vec128_storage {
|
||||
q: q_of_o(f(ao, bo)),
|
||||
};
|
||||
unsafe { T::unpack(o) }
|
||||
}
|
||||
|
||||
impl RotateEachWord128 for u128x1_generic {}
|
||||
impl BitOps128 for u128x1_generic {}
|
||||
impl BitOps64 for u128x1_generic {}
|
||||
impl BitOps64 for u64x2_generic {}
|
||||
impl BitOps32 for u128x1_generic {}
|
||||
impl BitOps32 for u64x2_generic {}
|
||||
impl BitOps32 for u32x4_generic {}
|
||||
impl BitOps0 for u128x1_generic {}
|
||||
impl BitOps0 for u64x2_generic {}
|
||||
impl BitOps0 for u32x4_generic {}
|
||||
|
||||
macro_rules! impl_bitops {
|
||||
($vec:ident) => {
|
||||
impl Not for $vec {
|
||||
type Output = Self;
|
||||
#[inline(always)]
|
||||
fn not(self) -> Self::Output {
|
||||
omap(self, |x| !x)
|
||||
}
|
||||
}
|
||||
impl BitAnd for $vec {
|
||||
type Output = Self;
|
||||
#[inline(always)]
|
||||
fn bitand(self, rhs: Self) -> Self::Output {
|
||||
omap2(self, rhs, |x, y| x & y)
|
||||
}
|
||||
}
|
||||
impl BitOr for $vec {
|
||||
type Output = Self;
|
||||
#[inline(always)]
|
||||
fn bitor(self, rhs: Self) -> Self::Output {
|
||||
omap2(self, rhs, |x, y| x | y)
|
||||
}
|
||||
}
|
||||
impl BitXor for $vec {
|
||||
type Output = Self;
|
||||
#[inline(always)]
|
||||
fn bitxor(self, rhs: Self) -> Self::Output {
|
||||
omap2(self, rhs, |x, y| x ^ y)
|
||||
}
|
||||
}
|
||||
impl AndNot for $vec {
|
||||
type Output = Self;
|
||||
#[inline(always)]
|
||||
fn andnot(self, rhs: Self) -> Self::Output {
|
||||
omap2(self, rhs, |x, y| !x & y)
|
||||
}
|
||||
}
|
||||
impl BitAndAssign for $vec {
|
||||
#[inline(always)]
|
||||
fn bitand_assign(&mut self, rhs: Self) {
|
||||
*self = *self & rhs
|
||||
}
|
||||
}
|
||||
impl BitOrAssign for $vec {
|
||||
#[inline(always)]
|
||||
fn bitor_assign(&mut self, rhs: Self) {
|
||||
*self = *self | rhs
|
||||
}
|
||||
}
|
||||
impl BitXorAssign for $vec {
|
||||
#[inline(always)]
|
||||
fn bitxor_assign(&mut self, rhs: Self) {
|
||||
*self = *self ^ rhs
|
||||
}
|
||||
}
|
||||
|
||||
impl Swap64 for $vec {
|
||||
#[inline(always)]
|
||||
fn swap1(self) -> Self {
|
||||
qmap(self, |x| {
|
||||
((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1)
|
||||
})
|
||||
}
|
||||
#[inline(always)]
|
||||
fn swap2(self) -> Self {
|
||||
qmap(self, |x| {
|
||||
((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2)
|
||||
})
|
||||
}
|
||||
#[inline(always)]
|
||||
fn swap4(self) -> Self {
|
||||
qmap(self, |x| {
|
||||
((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4)
|
||||
})
|
||||
}
|
||||
#[inline(always)]
|
||||
fn swap8(self) -> Self {
|
||||
qmap(self, |x| {
|
||||
((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8)
|
||||
})
|
||||
}
|
||||
#[inline(always)]
|
||||
fn swap16(self) -> Self {
|
||||
dmap(self, |x| x.rotate_left(16))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn swap32(self) -> Self {
|
||||
qmap(self, |x| x.rotate_left(32))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn swap64(self) -> Self {
|
||||
omap(self, |x| (x << 64) | (x >> 64))
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
impl_bitops!(u32x4_generic);
|
||||
impl_bitops!(u64x2_generic);
|
||||
impl_bitops!(u128x1_generic);
|
||||
|
||||
impl RotateEachWord32 for u32x4_generic {
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right7(self) -> Self {
|
||||
dmap(self, |x| x.rotate_right(7))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right8(self) -> Self {
|
||||
dmap(self, |x| x.rotate_right(8))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right11(self) -> Self {
|
||||
dmap(self, |x| x.rotate_right(11))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right12(self) -> Self {
|
||||
dmap(self, |x| x.rotate_right(12))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right16(self) -> Self {
|
||||
dmap(self, |x| x.rotate_right(16))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right20(self) -> Self {
|
||||
dmap(self, |x| x.rotate_right(20))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right24(self) -> Self {
|
||||
dmap(self, |x| x.rotate_right(24))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right25(self) -> Self {
|
||||
dmap(self, |x| x.rotate_right(25))
|
||||
}
|
||||
}
|
||||
|
||||
impl RotateEachWord32 for u64x2_generic {
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right7(self) -> Self {
|
||||
qmap(self, |x| x.rotate_right(7))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right8(self) -> Self {
|
||||
qmap(self, |x| x.rotate_right(8))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right11(self) -> Self {
|
||||
qmap(self, |x| x.rotate_right(11))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right12(self) -> Self {
|
||||
qmap(self, |x| x.rotate_right(12))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right16(self) -> Self {
|
||||
qmap(self, |x| x.rotate_right(16))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right20(self) -> Self {
|
||||
qmap(self, |x| x.rotate_right(20))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right24(self) -> Self {
|
||||
qmap(self, |x| x.rotate_right(24))
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right25(self) -> Self {
|
||||
qmap(self, |x| x.rotate_right(25))
|
||||
}
|
||||
}
|
||||
impl RotateEachWord64 for u64x2_generic {
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right32(self) -> Self {
|
||||
qmap(self, |x| x.rotate_right(32))
|
||||
}
|
||||
}
|
||||
|
||||
// workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web)
|
||||
#[inline(always)]
|
||||
fn rotate_u128_right(x: u128, i: u32) -> u128 {
|
||||
(x >> i) | (x << (128 - i))
|
||||
}
|
||||
#[test]
|
||||
fn test_rotate_u128() {
|
||||
const X: u128 = 0x0001_0203_0405_0607_0809_0a0b_0c0d_0e0f;
|
||||
const R: u128 = X.rotate_right(17);
|
||||
assert_eq!(rotate_u128_right(X, 17), R);
|
||||
}
|
||||
|
||||
impl RotateEachWord32 for u128x1_generic {
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right7(self) -> Self {
|
||||
Self([rotate_u128_right(self.0[0], 7)])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right8(self) -> Self {
|
||||
Self([rotate_u128_right(self.0[0], 8)])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right11(self) -> Self {
|
||||
Self([rotate_u128_right(self.0[0], 11)])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right12(self) -> Self {
|
||||
Self([rotate_u128_right(self.0[0], 12)])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right16(self) -> Self {
|
||||
Self([rotate_u128_right(self.0[0], 16)])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right20(self) -> Self {
|
||||
Self([rotate_u128_right(self.0[0], 20)])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right24(self) -> Self {
|
||||
Self([rotate_u128_right(self.0[0], 24)])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right25(self) -> Self {
|
||||
Self([rotate_u128_right(self.0[0], 25)])
|
||||
}
|
||||
}
|
||||
impl RotateEachWord64 for u128x1_generic {
|
||||
#[inline(always)]
|
||||
fn rotate_each_word_right32(self) -> Self {
|
||||
Self([rotate_u128_right(self.0[0], 32)])
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct GenericMachine;
|
||||
impl Machine for GenericMachine {
|
||||
type u32x4 = u32x4_generic;
|
||||
type u64x2 = u64x2_generic;
|
||||
type u128x1 = u128x1_generic;
|
||||
type u32x4x2 = u32x4x2_generic;
|
||||
type u64x2x2 = u64x2x2_generic;
|
||||
type u64x4 = u64x4_generic;
|
||||
type u128x2 = u128x2_generic;
|
||||
type u32x4x4 = u32x4x4_generic;
|
||||
type u64x2x4 = u64x2x4_generic;
|
||||
type u128x4 = u128x4_generic;
|
||||
#[inline(always)]
|
||||
unsafe fn instance() -> Self {
|
||||
Self
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq)]
|
||||
pub struct u32x4_generic([u32; 4]);
|
||||
#[derive(Copy, Clone, Debug, PartialEq)]
|
||||
pub struct u64x2_generic([u64; 2]);
|
||||
#[derive(Copy, Clone, Debug, PartialEq)]
|
||||
pub struct u128x1_generic([u128; 1]);
|
||||
|
||||
impl From<u32x4_generic> for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn from(d: u32x4_generic) -> Self {
|
||||
Self { d: d.0 }
|
||||
}
|
||||
}
|
||||
impl From<u64x2_generic> for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn from(q: u64x2_generic) -> Self {
|
||||
Self { q: q.0 }
|
||||
}
|
||||
}
|
||||
impl From<u128x1_generic> for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn from(o: u128x1_generic) -> Self {
|
||||
Self { q: q_of_o(o.0[0]) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Store<vec128_storage> for u32x4_generic {
|
||||
#[inline(always)]
|
||||
unsafe fn unpack(s: vec128_storage) -> Self {
|
||||
Self(s.d)
|
||||
}
|
||||
}
|
||||
impl Store<vec128_storage> for u64x2_generic {
|
||||
#[inline(always)]
|
||||
unsafe fn unpack(s: vec128_storage) -> Self {
|
||||
Self(s.q)
|
||||
}
|
||||
}
|
||||
impl Store<vec128_storage> for u128x1_generic {
|
||||
#[inline(always)]
|
||||
unsafe fn unpack(s: vec128_storage) -> Self {
|
||||
Self([o_of_q(s.q); 1])
|
||||
}
|
||||
}
|
||||
|
||||
impl ArithOps for u32x4_generic {}
|
||||
impl ArithOps for u64x2_generic {}
|
||||
impl ArithOps for u128x1_generic {}
|
||||
|
||||
impl Add for u32x4_generic {
|
||||
type Output = Self;
|
||||
#[inline(always)]
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
dmap2(self, rhs, |x, y| x.wrapping_add(y))
|
||||
}
|
||||
}
|
||||
impl Add for u64x2_generic {
|
||||
type Output = Self;
|
||||
#[inline(always)]
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
qmap2(self, rhs, |x, y| x.wrapping_add(y))
|
||||
}
|
||||
}
|
||||
impl Add for u128x1_generic {
|
||||
type Output = Self;
|
||||
#[inline(always)]
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
omap2(self, rhs, |x, y| x.wrapping_add(y))
|
||||
}
|
||||
}
|
||||
impl AddAssign for u32x4_generic {
|
||||
#[inline(always)]
|
||||
fn add_assign(&mut self, rhs: Self) {
|
||||
*self = *self + rhs
|
||||
}
|
||||
}
|
||||
impl AddAssign for u64x2_generic {
|
||||
#[inline(always)]
|
||||
fn add_assign(&mut self, rhs: Self) {
|
||||
*self = *self + rhs
|
||||
}
|
||||
}
|
||||
impl AddAssign for u128x1_generic {
|
||||
#[inline(always)]
|
||||
fn add_assign(&mut self, rhs: Self) {
|
||||
*self = *self + rhs
|
||||
}
|
||||
}
|
||||
impl BSwap for u32x4_generic {
|
||||
#[inline(always)]
|
||||
fn bswap(self) -> Self {
|
||||
dmap(self, |x| x.swap_bytes())
|
||||
}
|
||||
}
|
||||
impl BSwap for u64x2_generic {
|
||||
#[inline(always)]
|
||||
fn bswap(self) -> Self {
|
||||
qmap(self, |x| x.swap_bytes())
|
||||
}
|
||||
}
|
||||
impl BSwap for u128x1_generic {
|
||||
#[inline(always)]
|
||||
fn bswap(self) -> Self {
|
||||
omap(self, |x| x.swap_bytes())
|
||||
}
|
||||
}
|
||||
impl StoreBytes for u32x4_generic {
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_read_le(input: &[u8]) -> Self {
|
||||
assert_eq!(input.len(), 16);
|
||||
let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
|
||||
dmap(x, |x| x.to_le())
|
||||
}
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_read_be(input: &[u8]) -> Self {
|
||||
assert_eq!(input.len(), 16);
|
||||
let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
|
||||
dmap(x, |x| x.to_be())
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_le(self, out: &mut [u8]) {
|
||||
assert_eq!(out.len(), 16);
|
||||
let x = dmap(self, |x| x.to_le());
|
||||
unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_be(self, out: &mut [u8]) {
|
||||
assert_eq!(out.len(), 16);
|
||||
let x = dmap(self, |x| x.to_be());
|
||||
unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
|
||||
}
|
||||
}
|
||||
impl StoreBytes for u64x2_generic {
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_read_le(input: &[u8]) -> Self {
|
||||
assert_eq!(input.len(), 16);
|
||||
let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
|
||||
qmap(x, |x| x.to_le())
|
||||
}
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_read_be(input: &[u8]) -> Self {
|
||||
assert_eq!(input.len(), 16);
|
||||
let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
|
||||
qmap(x, |x| x.to_be())
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_le(self, out: &mut [u8]) {
|
||||
assert_eq!(out.len(), 16);
|
||||
let x = qmap(self, |x| x.to_le());
|
||||
unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_be(self, out: &mut [u8]) {
|
||||
assert_eq!(out.len(), 16);
|
||||
let x = qmap(self, |x| x.to_be());
|
||||
unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct G0;
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct G1;
|
||||
pub type u32x4x2_generic = x2<u32x4_generic, G0>;
|
||||
pub type u64x2x2_generic = x2<u64x2_generic, G0>;
|
||||
pub type u64x4_generic = x2<u64x2_generic, G1>;
|
||||
pub type u128x2_generic = x2<u128x1_generic, G0>;
|
||||
pub type u32x4x4_generic = x4<u32x4_generic>;
|
||||
pub type u64x2x4_generic = x4<u64x2_generic>;
|
||||
pub type u128x4_generic = x4<u128x1_generic>;
|
||||
|
||||
impl Vector<[u32; 16]> for u32x4x4_generic {
|
||||
fn to_scalars(self) -> [u32; 16] {
|
||||
let [a, b, c, d] = self.0;
|
||||
let a = a.0;
|
||||
let b = b.0;
|
||||
let c = c.0;
|
||||
let d = d.0;
|
||||
[
|
||||
a[0], a[1], a[2], a[3], //
|
||||
b[0], b[1], b[2], b[3], //
|
||||
c[0], c[1], c[2], c[3], //
|
||||
d[0], d[1], d[2], d[3], //
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
impl MultiLane<[u32; 4]> for u32x4_generic {
|
||||
#[inline(always)]
|
||||
fn to_lanes(self) -> [u32; 4] {
|
||||
self.0
|
||||
}
|
||||
#[inline(always)]
|
||||
fn from_lanes(xs: [u32; 4]) -> Self {
|
||||
Self(xs)
|
||||
}
|
||||
}
|
||||
impl MultiLane<[u64; 2]> for u64x2_generic {
|
||||
#[inline(always)]
|
||||
fn to_lanes(self) -> [u64; 2] {
|
||||
self.0
|
||||
}
|
||||
#[inline(always)]
|
||||
fn from_lanes(xs: [u64; 2]) -> Self {
|
||||
Self(xs)
|
||||
}
|
||||
}
|
||||
impl MultiLane<[u64; 4]> for u64x4_generic {
|
||||
#[inline(always)]
|
||||
fn to_lanes(self) -> [u64; 4] {
|
||||
let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
|
||||
[a[0], a[1], b[0], b[1]]
|
||||
}
|
||||
#[inline(always)]
|
||||
fn from_lanes(xs: [u64; 4]) -> Self {
|
||||
let (a, b) = (
|
||||
u64x2_generic::from_lanes([xs[0], xs[1]]),
|
||||
u64x2_generic::from_lanes([xs[2], xs[3]]),
|
||||
);
|
||||
x2::new([a, b])
|
||||
}
|
||||
}
|
||||
impl MultiLane<[u128; 1]> for u128x1_generic {
|
||||
#[inline(always)]
|
||||
fn to_lanes(self) -> [u128; 1] {
|
||||
self.0
|
||||
}
|
||||
#[inline(always)]
|
||||
fn from_lanes(xs: [u128; 1]) -> Self {
|
||||
Self(xs)
|
||||
}
|
||||
}
|
||||
impl Vec4<u32> for u32x4_generic {
|
||||
#[inline(always)]
|
||||
fn extract(self, i: u32) -> u32 {
|
||||
self.0[i as usize]
|
||||
}
|
||||
#[inline(always)]
|
||||
fn insert(mut self, v: u32, i: u32) -> Self {
|
||||
self.0[i as usize] = v;
|
||||
self
|
||||
}
|
||||
}
|
||||
impl Vec4<u64> for u64x4_generic {
|
||||
#[inline(always)]
|
||||
fn extract(self, i: u32) -> u64 {
|
||||
let d: [u64; 4] = self.to_lanes();
|
||||
d[i as usize]
|
||||
}
|
||||
#[inline(always)]
|
||||
fn insert(self, v: u64, i: u32) -> Self {
|
||||
self.0[(i / 2) as usize].insert(v, i % 2);
|
||||
self
|
||||
}
|
||||
}
|
||||
impl Vec2<u64> for u64x2_generic {
|
||||
#[inline(always)]
|
||||
fn extract(self, i: u32) -> u64 {
|
||||
self.0[i as usize]
|
||||
}
|
||||
#[inline(always)]
|
||||
fn insert(mut self, v: u64, i: u32) -> Self {
|
||||
self.0[i as usize] = v;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl Words4 for u32x4_generic {
|
||||
#[inline(always)]
|
||||
fn shuffle2301(self) -> Self {
|
||||
self.swap64()
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle1230(self) -> Self {
|
||||
let x = self.0;
|
||||
Self([x[3], x[0], x[1], x[2]])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle3012(self) -> Self {
|
||||
let x = self.0;
|
||||
Self([x[1], x[2], x[3], x[0]])
|
||||
}
|
||||
}
|
||||
impl LaneWords4 for u32x4_generic {
|
||||
#[inline(always)]
|
||||
fn shuffle_lane_words2301(self) -> Self {
|
||||
self.shuffle2301()
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle_lane_words1230(self) -> Self {
|
||||
self.shuffle1230()
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle_lane_words3012(self) -> Self {
|
||||
self.shuffle3012()
|
||||
}
|
||||
}
|
||||
|
||||
impl Words4 for u64x4_generic {
|
||||
#[inline(always)]
|
||||
fn shuffle2301(self) -> Self {
|
||||
x2::new([self.0[1], self.0[0]])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle1230(self) -> Self {
|
||||
unimplemented!()
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle3012(self) -> Self {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
impl u32x4<GenericMachine> for u32x4_generic {}
|
||||
impl u64x2<GenericMachine> for u64x2_generic {}
|
||||
impl u128x1<GenericMachine> for u128x1_generic {}
|
||||
impl u32x4x2<GenericMachine> for u32x4x2_generic {}
|
||||
impl u64x2x2<GenericMachine> for u64x2x2_generic {}
|
||||
impl u64x4<GenericMachine> for u64x4_generic {}
|
||||
impl u128x2<GenericMachine> for u128x2_generic {}
|
||||
impl u32x4x4<GenericMachine> for u32x4x4_generic {}
|
||||
impl u64x2x4<GenericMachine> for u64x2x4_generic {}
|
||||
impl u128x4<GenericMachine> for u128x4_generic {}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! dispatch {
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
|
||||
#[inline(always)]
|
||||
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
|
||||
let $mach = unsafe { $crate::generic::GenericMachine::instance() };
|
||||
#[inline(always)]
|
||||
fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
fn_impl($mach, $($arg),*)
|
||||
}
|
||||
};
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
|
||||
dispatch!($mach, $MTy, {
|
||||
$([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
|
||||
});
|
||||
}
|
||||
}
|
||||
#[macro_export]
|
||||
macro_rules! dispatch_light128 {
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
|
||||
#[inline(always)]
|
||||
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
|
||||
let $mach = unsafe { $crate::generic::GenericMachine::instance() };
|
||||
#[inline(always)]
|
||||
fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
fn_impl($mach, $($arg),*)
|
||||
}
|
||||
};
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
|
||||
dispatch!($mach, $MTy, {
|
||||
$([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
|
||||
});
|
||||
}
|
||||
}
|
||||
#[macro_export]
|
||||
macro_rules! dispatch_light256 {
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
|
||||
#[inline(always)]
|
||||
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
|
||||
let $mach = unsafe { $crate::generic::GenericMachine::instance() };
|
||||
#[inline(always)]
|
||||
fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
fn_impl($mach, $($arg),*)
|
||||
}
|
||||
};
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
|
||||
dispatch!($mach, $MTy, {
|
||||
$([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
|
||||
});
|
||||
}
|
||||
}
|
||||
#[macro_export]
|
||||
macro_rules! dispatch_light512 {
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
|
||||
#[inline(always)]
|
||||
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
|
||||
let $mach = unsafe { $crate::generic::GenericMachine::instance() };
|
||||
#[inline(always)]
|
||||
fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
fn_impl($mach, $($arg),*)
|
||||
}
|
||||
};
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
|
||||
dispatch!($mach, $MTy, {
|
||||
$([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_bswap32() {
|
||||
let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
|
||||
let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];
|
||||
|
||||
let m = unsafe { GenericMachine::instance() };
|
||||
|
||||
let x: <GenericMachine as Machine>::u32x4 = m.vec(xs);
|
||||
let x = x.bswap();
|
||||
|
||||
let y = m.vec(ys);
|
||||
assert_eq!(x, y);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
#![no_std]
|
||||
|
||||
// Design:
|
||||
// - safety: safe creation of any machine type is done only by instance methods of a
|
||||
// Machine (which is a ZST + Copy type), which can only by created unsafely or safely
|
||||
// through feature detection (e.g. fn AVX2::try_get() -> Option<Machine>).
|
||||
|
||||
mod soft;
|
||||
mod types;
|
||||
pub use self::types::*;
|
||||
|
||||
#[cfg(all(target_arch = "x86_64", not(feature = "no_simd"), not(miri)))]
|
||||
pub mod x86_64;
|
||||
#[cfg(all(target_arch = "x86_64", not(feature = "no_simd"), not(miri)))]
|
||||
use self::x86_64 as arch;
|
||||
|
||||
#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64")))]
|
||||
pub mod generic;
|
||||
#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64")))]
|
||||
use self::generic as arch;
|
||||
|
||||
pub use self::arch::{vec128_storage, vec256_storage, vec512_storage};
|
|
@ -0,0 +1,472 @@
|
|||
//! Implement 256- and 512- bit in terms of 128-bit, for machines without native wide SIMD.
|
||||
|
||||
use crate::types::*;
|
||||
use crate::{vec128_storage, vec256_storage, vec512_storage};
|
||||
use core::marker::PhantomData;
|
||||
use core::ops::*;
|
||||
|
||||
#[derive(Copy, Clone, Default)]
|
||||
#[allow(non_camel_case_types)]
|
||||
pub struct x2<W, G>(pub [W; 2], PhantomData<G>);
|
||||
impl<W, G> x2<W, G> {
|
||||
#[inline(always)]
|
||||
pub fn new(xs: [W; 2]) -> Self {
|
||||
x2(xs, PhantomData)
|
||||
}
|
||||
}
|
||||
macro_rules! fwd_binop_x2 {
|
||||
($trait:ident, $fn:ident) => {
|
||||
impl<W: $trait + Copy, G> $trait for x2<W, G> {
|
||||
type Output = x2<W::Output, G>;
|
||||
#[inline(always)]
|
||||
fn $fn(self, rhs: Self) -> Self::Output {
|
||||
x2::new([self.0[0].$fn(rhs.0[0]), self.0[1].$fn(rhs.0[1])])
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
macro_rules! fwd_binop_assign_x2 {
|
||||
($trait:ident, $fn_assign:ident) => {
|
||||
impl<W: $trait + Copy, G> $trait for x2<W, G> {
|
||||
#[inline(always)]
|
||||
fn $fn_assign(&mut self, rhs: Self) {
|
||||
(self.0[0]).$fn_assign(rhs.0[0]);
|
||||
(self.0[1]).$fn_assign(rhs.0[1]);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
macro_rules! fwd_unop_x2 {
|
||||
($fn:ident) => {
|
||||
#[inline(always)]
|
||||
fn $fn(self) -> Self {
|
||||
x2::new([self.0[0].$fn(), self.0[1].$fn()])
|
||||
}
|
||||
};
|
||||
}
|
||||
impl<W, G> RotateEachWord32 for x2<W, G>
|
||||
where
|
||||
W: Copy + RotateEachWord32,
|
||||
{
|
||||
fwd_unop_x2!(rotate_each_word_right7);
|
||||
fwd_unop_x2!(rotate_each_word_right8);
|
||||
fwd_unop_x2!(rotate_each_word_right11);
|
||||
fwd_unop_x2!(rotate_each_word_right12);
|
||||
fwd_unop_x2!(rotate_each_word_right16);
|
||||
fwd_unop_x2!(rotate_each_word_right20);
|
||||
fwd_unop_x2!(rotate_each_word_right24);
|
||||
fwd_unop_x2!(rotate_each_word_right25);
|
||||
}
|
||||
impl<W, G> RotateEachWord64 for x2<W, G>
|
||||
where
|
||||
W: Copy + RotateEachWord64,
|
||||
{
|
||||
fwd_unop_x2!(rotate_each_word_right32);
|
||||
}
|
||||
impl<W, G> RotateEachWord128 for x2<W, G> where W: RotateEachWord128 {}
|
||||
impl<W, G> BitOps0 for x2<W, G>
|
||||
where
|
||||
W: BitOps0,
|
||||
G: Copy,
|
||||
{
|
||||
}
|
||||
impl<W, G> BitOps32 for x2<W, G>
|
||||
where
|
||||
W: BitOps32 + BitOps0,
|
||||
G: Copy,
|
||||
{
|
||||
}
|
||||
impl<W, G> BitOps64 for x2<W, G>
|
||||
where
|
||||
W: BitOps64 + BitOps0,
|
||||
G: Copy,
|
||||
{
|
||||
}
|
||||
impl<W, G> BitOps128 for x2<W, G>
|
||||
where
|
||||
W: BitOps128 + BitOps0,
|
||||
G: Copy,
|
||||
{
|
||||
}
|
||||
fwd_binop_x2!(BitAnd, bitand);
|
||||
fwd_binop_x2!(BitOr, bitor);
|
||||
fwd_binop_x2!(BitXor, bitxor);
|
||||
fwd_binop_x2!(AndNot, andnot);
|
||||
fwd_binop_assign_x2!(BitAndAssign, bitand_assign);
|
||||
fwd_binop_assign_x2!(BitOrAssign, bitor_assign);
|
||||
fwd_binop_assign_x2!(BitXorAssign, bitxor_assign);
|
||||
impl<W, G> ArithOps for x2<W, G>
|
||||
where
|
||||
W: ArithOps,
|
||||
G: Copy,
|
||||
{
|
||||
}
|
||||
fwd_binop_x2!(Add, add);
|
||||
fwd_binop_assign_x2!(AddAssign, add_assign);
|
||||
impl<W: Not + Copy, G> Not for x2<W, G> {
|
||||
type Output = x2<W::Output, G>;
|
||||
#[inline(always)]
|
||||
fn not(self) -> Self::Output {
|
||||
x2::new([self.0[0].not(), self.0[1].not()])
|
||||
}
|
||||
}
|
||||
impl<W, G> UnsafeFrom<[W; 2]> for x2<W, G> {
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_from(xs: [W; 2]) -> Self {
|
||||
x2::new(xs)
|
||||
}
|
||||
}
|
||||
impl<W: Copy, G> Vec2<W> for x2<W, G> {
|
||||
#[inline(always)]
|
||||
fn extract(self, i: u32) -> W {
|
||||
self.0[i as usize]
|
||||
}
|
||||
#[inline(always)]
|
||||
fn insert(mut self, w: W, i: u32) -> Self {
|
||||
self.0[i as usize] = w;
|
||||
self
|
||||
}
|
||||
}
|
||||
impl<W: Copy + Store<vec128_storage>, G> Store<vec256_storage> for x2<W, G> {
|
||||
#[inline(always)]
|
||||
unsafe fn unpack(p: vec256_storage) -> Self {
|
||||
let p = p.split128();
|
||||
x2::new([W::unpack(p[0]), W::unpack(p[1])])
|
||||
}
|
||||
}
|
||||
impl<W, G> From<x2<W, G>> for vec256_storage
|
||||
where
|
||||
W: Copy,
|
||||
vec128_storage: From<W>,
|
||||
{
|
||||
#[inline(always)]
|
||||
fn from(x: x2<W, G>) -> Self {
|
||||
vec256_storage::new128([x.0[0].into(), x.0[1].into()])
|
||||
}
|
||||
}
|
||||
impl<W, G> Swap64 for x2<W, G>
|
||||
where
|
||||
W: Swap64 + Copy,
|
||||
{
|
||||
fwd_unop_x2!(swap1);
|
||||
fwd_unop_x2!(swap2);
|
||||
fwd_unop_x2!(swap4);
|
||||
fwd_unop_x2!(swap8);
|
||||
fwd_unop_x2!(swap16);
|
||||
fwd_unop_x2!(swap32);
|
||||
fwd_unop_x2!(swap64);
|
||||
}
|
||||
impl<W: Copy, G> MultiLane<[W; 2]> for x2<W, G> {
|
||||
#[inline(always)]
|
||||
fn to_lanes(self) -> [W; 2] {
|
||||
self.0
|
||||
}
|
||||
#[inline(always)]
|
||||
fn from_lanes(lanes: [W; 2]) -> Self {
|
||||
x2::new(lanes)
|
||||
}
|
||||
}
|
||||
impl<W: BSwap + Copy, G> BSwap for x2<W, G> {
|
||||
#[inline(always)]
|
||||
fn bswap(self) -> Self {
|
||||
x2::new([self.0[0].bswap(), self.0[1].bswap()])
|
||||
}
|
||||
}
|
||||
impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> {
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_read_le(input: &[u8]) -> Self {
|
||||
let input = input.split_at(input.len() / 2);
|
||||
x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)])
|
||||
}
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_read_be(input: &[u8]) -> Self {
|
||||
let input = input.split_at(input.len() / 2);
|
||||
x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_le(self, out: &mut [u8]) {
|
||||
let out = out.split_at_mut(out.len() / 2);
|
||||
self.0[0].write_le(out.0);
|
||||
self.0[1].write_le(out.1);
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_be(self, out: &mut [u8]) {
|
||||
let out = out.split_at_mut(out.len() / 2);
|
||||
self.0[0].write_be(out.0);
|
||||
self.0[1].write_be(out.1);
|
||||
}
|
||||
}
|
||||
impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> {
|
||||
#[inline(always)]
|
||||
fn shuffle_lane_words2301(self) -> Self {
|
||||
Self::new([
|
||||
self.0[0].shuffle_lane_words2301(),
|
||||
self.0[1].shuffle_lane_words2301(),
|
||||
])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle_lane_words1230(self) -> Self {
|
||||
Self::new([
|
||||
self.0[0].shuffle_lane_words1230(),
|
||||
self.0[1].shuffle_lane_words1230(),
|
||||
])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle_lane_words3012(self) -> Self {
|
||||
Self::new([
|
||||
self.0[0].shuffle_lane_words3012(),
|
||||
self.0[1].shuffle_lane_words3012(),
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Default)]
|
||||
#[allow(non_camel_case_types)]
|
||||
pub struct x4<W>(pub [W; 4]);
|
||||
impl<W> x4<W> {
|
||||
#[inline(always)]
|
||||
pub fn new(xs: [W; 4]) -> Self {
|
||||
x4(xs)
|
||||
}
|
||||
}
|
||||
macro_rules! fwd_binop_x4 {
|
||||
($trait:ident, $fn:ident) => {
|
||||
impl<W: $trait + Copy> $trait for x4<W> {
|
||||
type Output = x4<W::Output>;
|
||||
#[inline(always)]
|
||||
fn $fn(self, rhs: Self) -> Self::Output {
|
||||
x4([
|
||||
self.0[0].$fn(rhs.0[0]),
|
||||
self.0[1].$fn(rhs.0[1]),
|
||||
self.0[2].$fn(rhs.0[2]),
|
||||
self.0[3].$fn(rhs.0[3]),
|
||||
])
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
macro_rules! fwd_binop_assign_x4 {
|
||||
($trait:ident, $fn_assign:ident) => {
|
||||
impl<W: $trait + Copy> $trait for x4<W> {
|
||||
#[inline(always)]
|
||||
fn $fn_assign(&mut self, rhs: Self) {
|
||||
self.0[0].$fn_assign(rhs.0[0]);
|
||||
self.0[1].$fn_assign(rhs.0[1]);
|
||||
self.0[2].$fn_assign(rhs.0[2]);
|
||||
self.0[3].$fn_assign(rhs.0[3]);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
macro_rules! fwd_unop_x4 {
|
||||
($fn:ident) => {
|
||||
#[inline(always)]
|
||||
fn $fn(self) -> Self {
|
||||
x4([
|
||||
self.0[0].$fn(),
|
||||
self.0[1].$fn(),
|
||||
self.0[2].$fn(),
|
||||
self.0[3].$fn(),
|
||||
])
|
||||
}
|
||||
};
|
||||
}
|
||||
impl<W> RotateEachWord32 for x4<W>
|
||||
where
|
||||
W: Copy + RotateEachWord32,
|
||||
{
|
||||
fwd_unop_x4!(rotate_each_word_right7);
|
||||
fwd_unop_x4!(rotate_each_word_right8);
|
||||
fwd_unop_x4!(rotate_each_word_right11);
|
||||
fwd_unop_x4!(rotate_each_word_right12);
|
||||
fwd_unop_x4!(rotate_each_word_right16);
|
||||
fwd_unop_x4!(rotate_each_word_right20);
|
||||
fwd_unop_x4!(rotate_each_word_right24);
|
||||
fwd_unop_x4!(rotate_each_word_right25);
|
||||
}
|
||||
impl<W> RotateEachWord64 for x4<W>
|
||||
where
|
||||
W: Copy + RotateEachWord64,
|
||||
{
|
||||
fwd_unop_x4!(rotate_each_word_right32);
|
||||
}
|
||||
impl<W> RotateEachWord128 for x4<W> where W: RotateEachWord128 {}
|
||||
impl<W> BitOps0 for x4<W> where W: BitOps0 {}
|
||||
impl<W> BitOps32 for x4<W> where W: BitOps32 + BitOps0 {}
|
||||
impl<W> BitOps64 for x4<W> where W: BitOps64 + BitOps0 {}
|
||||
impl<W> BitOps128 for x4<W> where W: BitOps128 + BitOps0 {}
|
||||
fwd_binop_x4!(BitAnd, bitand);
|
||||
fwd_binop_x4!(BitOr, bitor);
|
||||
fwd_binop_x4!(BitXor, bitxor);
|
||||
fwd_binop_x4!(AndNot, andnot);
|
||||
fwd_binop_assign_x4!(BitAndAssign, bitand_assign);
|
||||
fwd_binop_assign_x4!(BitOrAssign, bitor_assign);
|
||||
fwd_binop_assign_x4!(BitXorAssign, bitxor_assign);
|
||||
impl<W> ArithOps for x4<W> where W: ArithOps {}
|
||||
fwd_binop_x4!(Add, add);
|
||||
fwd_binop_assign_x4!(AddAssign, add_assign);
|
||||
impl<W: Not + Copy> Not for x4<W> {
|
||||
type Output = x4<W::Output>;
|
||||
#[inline(always)]
|
||||
fn not(self) -> Self::Output {
|
||||
x4([
|
||||
self.0[0].not(),
|
||||
self.0[1].not(),
|
||||
self.0[2].not(),
|
||||
self.0[3].not(),
|
||||
])
|
||||
}
|
||||
}
|
||||
impl<W> UnsafeFrom<[W; 4]> for x4<W> {
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_from(xs: [W; 4]) -> Self {
|
||||
x4(xs)
|
||||
}
|
||||
}
|
||||
impl<W: Copy> Vec4<W> for x4<W> {
|
||||
#[inline(always)]
|
||||
fn extract(self, i: u32) -> W {
|
||||
self.0[i as usize]
|
||||
}
|
||||
#[inline(always)]
|
||||
fn insert(mut self, w: W, i: u32) -> Self {
|
||||
self.0[i as usize] = w;
|
||||
self
|
||||
}
|
||||
}
|
||||
impl<W: Copy> Vec4Ext<W> for x4<W> {
|
||||
#[inline(always)]
|
||||
fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
(
|
||||
x4([a.0[0], b.0[0], c.0[0], d.0[0]]),
|
||||
x4([a.0[1], b.0[1], c.0[1], d.0[1]]),
|
||||
x4([a.0[2], b.0[2], c.0[2], d.0[2]]),
|
||||
x4([a.0[3], b.0[3], c.0[3], d.0[3]]),
|
||||
)
|
||||
}
|
||||
}
|
||||
impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> {
|
||||
#[inline(always)]
|
||||
unsafe fn unpack(p: vec512_storage) -> Self {
|
||||
let p = p.split128();
|
||||
x4([
|
||||
W::unpack(p[0]),
|
||||
W::unpack(p[1]),
|
||||
W::unpack(p[2]),
|
||||
W::unpack(p[3]),
|
||||
])
|
||||
}
|
||||
}
|
||||
impl<W> From<x4<W>> for vec512_storage
|
||||
where
|
||||
W: Copy,
|
||||
vec128_storage: From<W>,
|
||||
{
|
||||
#[inline(always)]
|
||||
fn from(x: x4<W>) -> Self {
|
||||
vec512_storage::new128([x.0[0].into(), x.0[1].into(), x.0[2].into(), x.0[3].into()])
|
||||
}
|
||||
}
|
||||
impl<W> Swap64 for x4<W>
|
||||
where
|
||||
W: Swap64 + Copy,
|
||||
{
|
||||
fwd_unop_x4!(swap1);
|
||||
fwd_unop_x4!(swap2);
|
||||
fwd_unop_x4!(swap4);
|
||||
fwd_unop_x4!(swap8);
|
||||
fwd_unop_x4!(swap16);
|
||||
fwd_unop_x4!(swap32);
|
||||
fwd_unop_x4!(swap64);
|
||||
}
|
||||
impl<W: Copy> MultiLane<[W; 4]> for x4<W> {
|
||||
#[inline(always)]
|
||||
fn to_lanes(self) -> [W; 4] {
|
||||
self.0
|
||||
}
|
||||
#[inline(always)]
|
||||
fn from_lanes(lanes: [W; 4]) -> Self {
|
||||
x4(lanes)
|
||||
}
|
||||
}
|
||||
impl<W: BSwap + Copy> BSwap for x4<W> {
|
||||
#[inline(always)]
|
||||
fn bswap(self) -> Self {
|
||||
x4([
|
||||
self.0[0].bswap(),
|
||||
self.0[1].bswap(),
|
||||
self.0[2].bswap(),
|
||||
self.0[3].bswap(),
|
||||
])
|
||||
}
|
||||
}
|
||||
impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> {
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_read_le(input: &[u8]) -> Self {
|
||||
let n = input.len() / 4;
|
||||
x4([
|
||||
W::unsafe_read_le(&input[..n]),
|
||||
W::unsafe_read_le(&input[n..n * 2]),
|
||||
W::unsafe_read_le(&input[n * 2..n * 3]),
|
||||
W::unsafe_read_le(&input[n * 3..]),
|
||||
])
|
||||
}
|
||||
#[inline(always)]
|
||||
unsafe fn unsafe_read_be(input: &[u8]) -> Self {
|
||||
let n = input.len() / 4;
|
||||
x4([
|
||||
W::unsafe_read_be(&input[..n]),
|
||||
W::unsafe_read_be(&input[n..n * 2]),
|
||||
W::unsafe_read_be(&input[n * 2..n * 3]),
|
||||
W::unsafe_read_be(&input[n * 3..]),
|
||||
])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_le(self, out: &mut [u8]) {
|
||||
let n = out.len() / 4;
|
||||
self.0[0].write_le(&mut out[..n]);
|
||||
self.0[1].write_le(&mut out[n..n * 2]);
|
||||
self.0[2].write_le(&mut out[n * 2..n * 3]);
|
||||
self.0[3].write_le(&mut out[n * 3..]);
|
||||
}
|
||||
#[inline(always)]
|
||||
fn write_be(self, out: &mut [u8]) {
|
||||
let n = out.len() / 4;
|
||||
self.0[0].write_be(&mut out[..n]);
|
||||
self.0[1].write_be(&mut out[n..n * 2]);
|
||||
self.0[2].write_be(&mut out[n * 2..n * 3]);
|
||||
self.0[3].write_be(&mut out[n * 3..]);
|
||||
}
|
||||
}
|
||||
impl<W: Copy + LaneWords4> LaneWords4 for x4<W> {
|
||||
#[inline(always)]
|
||||
fn shuffle_lane_words2301(self) -> Self {
|
||||
x4([
|
||||
self.0[0].shuffle_lane_words2301(),
|
||||
self.0[1].shuffle_lane_words2301(),
|
||||
self.0[2].shuffle_lane_words2301(),
|
||||
self.0[3].shuffle_lane_words2301(),
|
||||
])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle_lane_words1230(self) -> Self {
|
||||
x4([
|
||||
self.0[0].shuffle_lane_words1230(),
|
||||
self.0[1].shuffle_lane_words1230(),
|
||||
self.0[2].shuffle_lane_words1230(),
|
||||
self.0[3].shuffle_lane_words1230(),
|
||||
])
|
||||
}
|
||||
#[inline(always)]
|
||||
fn shuffle_lane_words3012(self) -> Self {
|
||||
x4([
|
||||
self.0[0].shuffle_lane_words3012(),
|
||||
self.0[1].shuffle_lane_words3012(),
|
||||
self.0[2].shuffle_lane_words3012(),
|
||||
self.0[3].shuffle_lane_words3012(),
|
||||
])
|
||||
}
|
||||
}
|
|
@ -0,0 +1,298 @@
|
|||
#![allow(non_camel_case_types)]
|
||||
use core::ops::{Add, AddAssign, BitAnd, BitOr, BitXor, BitXorAssign, Not};
|
||||
|
||||
pub trait AndNot {
|
||||
type Output;
|
||||
fn andnot(self, rhs: Self) -> Self::Output;
|
||||
}
|
||||
pub trait BSwap {
|
||||
fn bswap(self) -> Self;
|
||||
}
|
||||
/// Ops that depend on word size
|
||||
pub trait ArithOps: Add<Output = Self> + AddAssign + Sized + Copy + Clone + BSwap {}
|
||||
/// Ops that are independent of word size and endian
|
||||
pub trait BitOps0:
|
||||
BitAnd<Output = Self>
|
||||
+ BitOr<Output = Self>
|
||||
+ BitXor<Output = Self>
|
||||
+ BitXorAssign
|
||||
+ Not<Output = Self>
|
||||
+ AndNot<Output = Self>
|
||||
+ Sized
|
||||
+ Copy
|
||||
+ Clone
|
||||
{
|
||||
}
|
||||
|
||||
pub trait BitOps32: BitOps0 + RotateEachWord32 {}
|
||||
pub trait BitOps64: BitOps32 + RotateEachWord64 {}
|
||||
pub trait BitOps128: BitOps64 + RotateEachWord128 {}
|
||||
|
||||
pub trait RotateEachWord32 {
|
||||
fn rotate_each_word_right7(self) -> Self;
|
||||
fn rotate_each_word_right8(self) -> Self;
|
||||
fn rotate_each_word_right11(self) -> Self;
|
||||
fn rotate_each_word_right12(self) -> Self;
|
||||
fn rotate_each_word_right16(self) -> Self;
|
||||
fn rotate_each_word_right20(self) -> Self;
|
||||
fn rotate_each_word_right24(self) -> Self;
|
||||
fn rotate_each_word_right25(self) -> Self;
|
||||
}
|
||||
|
||||
pub trait RotateEachWord64 {
|
||||
fn rotate_each_word_right32(self) -> Self;
|
||||
}
|
||||
|
||||
pub trait RotateEachWord128 {}
|
||||
|
||||
// Vector type naming scheme:
|
||||
// uN[xP]xL
|
||||
// Unsigned; N-bit words * P bits per lane * L lanes
|
||||
//
|
||||
// A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of
|
||||
// wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and
|
||||
// slow inter-lane operations.
|
||||
|
||||
use crate::arch::{vec128_storage, vec256_storage, vec512_storage};
|
||||
|
||||
#[allow(clippy::missing_safety_doc)]
|
||||
pub trait UnsafeFrom<T> {
|
||||
unsafe fn unsafe_from(t: T) -> Self;
|
||||
}
|
||||
|
||||
/// A vector composed of two elements, which may be words or themselves vectors.
|
||||
pub trait Vec2<W> {
|
||||
fn extract(self, i: u32) -> W;
|
||||
fn insert(self, w: W, i: u32) -> Self;
|
||||
}
|
||||
|
||||
/// A vector composed of four elements, which may be words or themselves vectors.
|
||||
pub trait Vec4<W> {
|
||||
fn extract(self, i: u32) -> W;
|
||||
fn insert(self, w: W, i: u32) -> Self;
|
||||
}
|
||||
/// Vec4 functions which may not be implemented yet for all Vec4 types.
|
||||
/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage,
|
||||
/// import Vec4Ext only together with Vec4, and don't qualify its methods.
|
||||
pub trait Vec4Ext<W> {
|
||||
fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
|
||||
where
|
||||
Self: Sized;
|
||||
}
|
||||
pub trait Vector<T> {
|
||||
fn to_scalars(self) -> T;
|
||||
}
|
||||
|
||||
// TODO: multiples of 4 should inherit this
|
||||
/// A vector composed of four words; depending on their size, operations may cross lanes.
|
||||
pub trait Words4 {
|
||||
fn shuffle1230(self) -> Self;
|
||||
fn shuffle2301(self) -> Self;
|
||||
fn shuffle3012(self) -> Self;
|
||||
}
|
||||
|
||||
/// A vector composed one or more lanes each composed of four words.
|
||||
pub trait LaneWords4 {
|
||||
fn shuffle_lane_words1230(self) -> Self;
|
||||
fn shuffle_lane_words2301(self) -> Self;
|
||||
fn shuffle_lane_words3012(self) -> Self;
|
||||
}
|
||||
|
||||
// TODO: make this a part of BitOps
|
||||
/// Exchange neigboring ranges of bits of the specified size
|
||||
pub trait Swap64 {
|
||||
fn swap1(self) -> Self;
|
||||
fn swap2(self) -> Self;
|
||||
fn swap4(self) -> Self;
|
||||
fn swap8(self) -> Self;
|
||||
fn swap16(self) -> Self;
|
||||
fn swap32(self) -> Self;
|
||||
fn swap64(self) -> Self;
|
||||
}
|
||||
|
||||
pub trait u32x4<M: Machine>:
|
||||
BitOps32
|
||||
+ Store<vec128_storage>
|
||||
+ ArithOps
|
||||
+ Vec4<u32>
|
||||
+ Words4
|
||||
+ LaneWords4
|
||||
+ StoreBytes
|
||||
+ MultiLane<[u32; 4]>
|
||||
+ Into<vec128_storage>
|
||||
{
|
||||
}
|
||||
pub trait u64x2<M: Machine>:
|
||||
BitOps64 + Store<vec128_storage> + ArithOps + Vec2<u64> + MultiLane<[u64; 2]> + Into<vec128_storage>
|
||||
{
|
||||
}
|
||||
pub trait u128x1<M: Machine>:
|
||||
BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage>
|
||||
{
|
||||
}
|
||||
|
||||
pub trait u32x4x2<M: Machine>:
|
||||
BitOps32
|
||||
+ Store<vec256_storage>
|
||||
+ Vec2<M::u32x4>
|
||||
+ MultiLane<[M::u32x4; 2]>
|
||||
+ ArithOps
|
||||
+ Into<vec256_storage>
|
||||
+ StoreBytes
|
||||
{
|
||||
}
|
||||
pub trait u64x2x2<M: Machine>:
|
||||
BitOps64
|
||||
+ Store<vec256_storage>
|
||||
+ Vec2<M::u64x2>
|
||||
+ MultiLane<[M::u64x2; 2]>
|
||||
+ ArithOps
|
||||
+ StoreBytes
|
||||
+ Into<vec256_storage>
|
||||
{
|
||||
}
|
||||
pub trait u64x4<M: Machine>:
|
||||
BitOps64
|
||||
+ Store<vec256_storage>
|
||||
+ Vec4<u64>
|
||||
+ MultiLane<[u64; 4]>
|
||||
+ ArithOps
|
||||
+ Words4
|
||||
+ StoreBytes
|
||||
+ Into<vec256_storage>
|
||||
{
|
||||
}
|
||||
pub trait u128x2<M: Machine>:
|
||||
BitOps128
|
||||
+ Store<vec256_storage>
|
||||
+ Vec2<M::u128x1>
|
||||
+ MultiLane<[M::u128x1; 2]>
|
||||
+ Swap64
|
||||
+ Into<vec256_storage>
|
||||
{
|
||||
}
|
||||
|
||||
pub trait u32x4x4<M: Machine>:
|
||||
BitOps32
|
||||
+ Store<vec512_storage>
|
||||
+ Vec4<M::u32x4>
|
||||
+ Vec4Ext<M::u32x4>
|
||||
+ Vector<[u32; 16]>
|
||||
+ MultiLane<[M::u32x4; 4]>
|
||||
+ ArithOps
|
||||
+ LaneWords4
|
||||
+ Into<vec512_storage>
|
||||
+ StoreBytes
|
||||
{
|
||||
}
|
||||
pub trait u64x2x4<M: Machine>:
|
||||
BitOps64
|
||||
+ Store<vec512_storage>
|
||||
+ Vec4<M::u64x2>
|
||||
+ MultiLane<[M::u64x2; 4]>
|
||||
+ ArithOps
|
||||
+ Into<vec512_storage>
|
||||
{
|
||||
}
|
||||
// TODO: Words4
|
||||
pub trait u128x4<M: Machine>:
|
||||
BitOps128
|
||||
+ Store<vec512_storage>
|
||||
+ Vec4<M::u128x1>
|
||||
+ MultiLane<[M::u128x1; 4]>
|
||||
+ Swap64
|
||||
+ Into<vec512_storage>
|
||||
{
|
||||
}
|
||||
|
||||
/// A vector composed of multiple 128-bit lanes.
|
||||
pub trait MultiLane<Lanes> {
|
||||
/// Split a multi-lane vector into single-lane vectors.
|
||||
fn to_lanes(self) -> Lanes;
|
||||
/// Build a multi-lane vector from individual lanes.
|
||||
fn from_lanes(lanes: Lanes) -> Self;
|
||||
}
|
||||
|
||||
/// Combine single vectors into a multi-lane vector.
|
||||
pub trait VZip<V> {
|
||||
fn vzip(self) -> V;
|
||||
}
|
||||
|
||||
impl<V, T> VZip<V> for T
|
||||
where
|
||||
V: MultiLane<T>,
|
||||
{
|
||||
#[inline(always)]
|
||||
fn vzip(self) -> V {
|
||||
V::from_lanes(self)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Machine: Sized + Copy {
|
||||
type u32x4: u32x4<Self>;
|
||||
type u64x2: u64x2<Self>;
|
||||
type u128x1: u128x1<Self>;
|
||||
|
||||
type u32x4x2: u32x4x2<Self>;
|
||||
type u64x2x2: u64x2x2<Self>;
|
||||
type u64x4: u64x4<Self>;
|
||||
type u128x2: u128x2<Self>;
|
||||
|
||||
type u32x4x4: u32x4x4<Self>;
|
||||
type u64x2x4: u64x2x4<Self>;
|
||||
type u128x4: u128x4<Self>;
|
||||
|
||||
#[inline(always)]
|
||||
fn unpack<S, V: Store<S>>(self, s: S) -> V {
|
||||
unsafe { V::unpack(s) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn vec<V, A>(self, a: A) -> V
|
||||
where
|
||||
V: MultiLane<A>,
|
||||
{
|
||||
V::from_lanes(a)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn read_le<V>(self, input: &[u8]) -> V
|
||||
where
|
||||
V: StoreBytes,
|
||||
{
|
||||
unsafe { V::unsafe_read_le(input) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn read_be<V>(self, input: &[u8]) -> V
|
||||
where
|
||||
V: StoreBytes,
|
||||
{
|
||||
unsafe { V::unsafe_read_be(input) }
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
/// Caller must ensure the type of Self is appropriate for the hardware of the execution
|
||||
/// environment.
|
||||
unsafe fn instance() -> Self;
|
||||
}
|
||||
|
||||
pub trait Store<S> {
|
||||
/// # Safety
|
||||
/// Caller must ensure the type of Self is appropriate for the hardware of the execution
|
||||
/// environment.
|
||||
unsafe fn unpack(p: S) -> Self;
|
||||
}
|
||||
|
||||
pub trait StoreBytes {
|
||||
/// # Safety
|
||||
/// Caller must ensure the type of Self is appropriate for the hardware of the execution
|
||||
/// environment.
|
||||
unsafe fn unsafe_read_le(input: &[u8]) -> Self;
|
||||
/// # Safety
|
||||
/// Caller must ensure the type of Self is appropriate for the hardware of the execution
|
||||
/// environment.
|
||||
unsafe fn unsafe_read_be(input: &[u8]) -> Self;
|
||||
fn write_le(self, out: &mut [u8]);
|
||||
fn write_be(self, out: &mut [u8]);
|
||||
}
|
|
@ -0,0 +1,437 @@
|
|||
// crate minimums: sse2, x86_64
|
||||
|
||||
use crate::types::*;
|
||||
use core::arch::x86_64::{__m128i, __m256i};
|
||||
|
||||
mod sse2;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct YesS3;
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct NoS3;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct YesS4;
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct NoS4;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct YesA1;
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct NoA1;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct YesA2;
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct NoA2;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct YesNI;
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct NoNI;
|
||||
|
||||
use core::marker::PhantomData;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
|
||||
impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
|
||||
where
|
||||
sse2::u128x1_sse2<S3, S4, NI>: Swap64,
|
||||
sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
|
||||
sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
|
||||
sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
|
||||
sse2::u128x1_sse2<S3, S4, NI>: BSwap,
|
||||
sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
|
||||
sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
|
||||
sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
|
||||
sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
|
||||
sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
|
||||
{
|
||||
type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
|
||||
type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
|
||||
type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
|
||||
|
||||
type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
|
||||
type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
|
||||
type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
|
||||
type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
|
||||
|
||||
type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
|
||||
type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
|
||||
type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn instance() -> Self {
|
||||
SseMachine(PhantomData)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Avx2Machine<NI>(PhantomData<NI>);
|
||||
impl<NI: Copy> Machine for Avx2Machine<NI>
|
||||
where
|
||||
sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
|
||||
sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
|
||||
sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
|
||||
sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
|
||||
{
|
||||
type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
|
||||
type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
|
||||
type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
|
||||
|
||||
type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
|
||||
type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
|
||||
type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
|
||||
type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
|
||||
|
||||
type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
|
||||
type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
|
||||
type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
|
||||
|
||||
#[inline(always)]
|
||||
unsafe fn instance() -> Self {
|
||||
Avx2Machine(PhantomData)
|
||||
}
|
||||
}
|
||||
|
||||
pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
|
||||
pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
|
||||
pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
|
||||
/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
|
||||
/// to avoid expensive SSE/VEX conflicts.
|
||||
pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
|
||||
pub type AVX2 = Avx2Machine<NoNI>;
|
||||
|
||||
/// Generic wrapper for unparameterized storage of any of the possible impls.
|
||||
/// Converting into and out of this type should be essentially free, although it may be more
|
||||
/// aligned than a particular impl requires.
|
||||
#[allow(non_camel_case_types)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub union vec128_storage {
|
||||
u32x4: [u32; 4],
|
||||
u64x2: [u64; 2],
|
||||
u128x1: [u128; 1],
|
||||
sse2: __m128i,
|
||||
}
|
||||
impl Store<vec128_storage> for vec128_storage {
|
||||
#[inline(always)]
|
||||
unsafe fn unpack(p: vec128_storage) -> Self {
|
||||
p
|
||||
}
|
||||
}
|
||||
impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
|
||||
#[inline(always)]
|
||||
fn from(x: &'a vec128_storage) -> Self {
|
||||
unsafe { &x.u32x4 }
|
||||
}
|
||||
}
|
||||
impl From<[u32; 4]> for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn from(u32x4: [u32; 4]) -> Self {
|
||||
vec128_storage { u32x4 }
|
||||
}
|
||||
}
|
||||
impl Default for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn default() -> Self {
|
||||
vec128_storage { u128x1: [0] }
|
||||
}
|
||||
}
|
||||
impl Eq for vec128_storage {}
|
||||
impl PartialEq for vec128_storage {
|
||||
#[inline(always)]
|
||||
fn eq(&self, rhs: &Self) -> bool {
|
||||
unsafe { self.u128x1 == rhs.u128x1 }
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub union vec256_storage {
|
||||
u32x8: [u32; 8],
|
||||
u64x4: [u64; 4],
|
||||
u128x2: [u128; 2],
|
||||
sse2: [vec128_storage; 2],
|
||||
avx: __m256i,
|
||||
}
|
||||
impl From<[u64; 4]> for vec256_storage {
|
||||
#[inline(always)]
|
||||
fn from(u64x4: [u64; 4]) -> Self {
|
||||
vec256_storage { u64x4 }
|
||||
}
|
||||
}
|
||||
impl Default for vec256_storage {
|
||||
#[inline(always)]
|
||||
fn default() -> Self {
|
||||
vec256_storage { u128x2: [0, 0] }
|
||||
}
|
||||
}
|
||||
impl vec256_storage {
|
||||
#[inline(always)]
|
||||
pub fn new128(xs: [vec128_storage; 2]) -> Self {
|
||||
Self { sse2: xs }
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn split128(self) -> [vec128_storage; 2] {
|
||||
unsafe { self.sse2 }
|
||||
}
|
||||
}
|
||||
impl Eq for vec256_storage {}
|
||||
impl PartialEq for vec256_storage {
|
||||
#[inline(always)]
|
||||
fn eq(&self, rhs: &Self) -> bool {
|
||||
unsafe { self.sse2 == rhs.sse2 }
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub union vec512_storage {
|
||||
u32x16: [u32; 16],
|
||||
u64x8: [u64; 8],
|
||||
u128x4: [u128; 4],
|
||||
sse2: [vec128_storage; 4],
|
||||
avx: [vec256_storage; 2],
|
||||
}
|
||||
impl Default for vec512_storage {
|
||||
#[inline(always)]
|
||||
fn default() -> Self {
|
||||
vec512_storage {
|
||||
u128x4: [0, 0, 0, 0],
|
||||
}
|
||||
}
|
||||
}
|
||||
impl vec512_storage {
|
||||
#[inline(always)]
|
||||
pub fn new128(xs: [vec128_storage; 4]) -> Self {
|
||||
Self { sse2: xs }
|
||||
}
|
||||
#[inline(always)]
|
||||
pub fn split128(self) -> [vec128_storage; 4] {
|
||||
unsafe { self.sse2 }
|
||||
}
|
||||
}
|
||||
impl Eq for vec512_storage {}
|
||||
impl PartialEq for vec512_storage {
|
||||
#[inline(always)]
|
||||
fn eq(&self, rhs: &Self) -> bool {
|
||||
unsafe { self.avx == rhs.avx }
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! impl_into {
|
||||
($storage:ident, $array:ty, $name:ident) => {
|
||||
impl From<$storage> for $array {
|
||||
#[inline(always)]
|
||||
fn from(vec: $storage) -> Self {
|
||||
unsafe { vec.$name }
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
impl_into!(vec128_storage, [u32; 4], u32x4);
|
||||
impl_into!(vec128_storage, [u64; 2], u64x2);
|
||||
impl_into!(vec128_storage, [u128; 1], u128x1);
|
||||
impl_into!(vec256_storage, [u32; 8], u32x8);
|
||||
impl_into!(vec256_storage, [u64; 4], u64x4);
|
||||
impl_into!(vec256_storage, [u128; 2], u128x2);
|
||||
impl_into!(vec512_storage, [u32; 16], u32x16);
|
||||
impl_into!(vec512_storage, [u64; 8], u64x8);
|
||||
impl_into!(vec512_storage, [u128; 4], u128x4);
|
||||
|
||||
/// Generate the full set of optimized implementations to take advantage of the most important
|
||||
/// hardware feature sets.
|
||||
///
|
||||
/// This dispatcher is suitable for maximizing throughput.
|
||||
#[macro_export]
|
||||
macro_rules! dispatch {
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
|
||||
#[cfg(feature = "std")]
|
||||
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
|
||||
#[inline(always)]
|
||||
fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
use std::arch::x86_64::*;
|
||||
#[target_feature(enable = "avx2")]
|
||||
unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
|
||||
let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
|
||||
_mm256_zeroupper();
|
||||
ret
|
||||
}
|
||||
#[target_feature(enable = "avx")]
|
||||
#[target_feature(enable = "sse4.1")]
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn impl_avx($($arg: $argty),*) -> $ret {
|
||||
let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
|
||||
_mm256_zeroupper();
|
||||
ret
|
||||
}
|
||||
#[target_feature(enable = "sse4.1")]
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
|
||||
fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
|
||||
}
|
||||
#[target_feature(enable = "ssse3")]
|
||||
unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
|
||||
fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
|
||||
}
|
||||
#[target_feature(enable = "sse2")]
|
||||
unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
|
||||
fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
|
||||
}
|
||||
unsafe {
|
||||
if is_x86_feature_detected!("avx2") {
|
||||
impl_avx2($($arg),*)
|
||||
} else if is_x86_feature_detected!("avx") {
|
||||
impl_avx($($arg),*)
|
||||
} else if is_x86_feature_detected!("sse4.1") {
|
||||
impl_sse41($($arg),*)
|
||||
} else if is_x86_feature_detected!("ssse3") {
|
||||
impl_ssse3($($arg),*)
|
||||
} else if is_x86_feature_detected!("sse2") {
|
||||
impl_sse2($($arg),*)
|
||||
} else {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "std"))]
|
||||
#[inline(always)]
|
||||
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
|
||||
unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
unsafe {
|
||||
if cfg!(target_feature = "avx2") {
|
||||
fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
|
||||
} else if cfg!(target_feature = "avx") {
|
||||
fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
|
||||
} else if cfg!(target_feature = "sse4.1") {
|
||||
fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
|
||||
} else if cfg!(target_feature = "ssse3") {
|
||||
fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
|
||||
} else {
|
||||
fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
|
||||
dispatch!($mach, $MTy, {
|
||||
$([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
|
||||
/// vectors on this platfrom. For x86-64, that would mean SSE2 and AVX.
|
||||
///
|
||||
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
|
||||
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
|
||||
/// size is more important.
|
||||
#[macro_export]
|
||||
macro_rules! dispatch_light128 {
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
|
||||
#[cfg(feature = "std")]
|
||||
$($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
|
||||
#[inline(always)]
|
||||
fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
use std::arch::x86_64::*;
|
||||
#[target_feature(enable = "avx")]
|
||||
unsafe fn impl_avx($($arg: $argty),*) -> $ret {
|
||||
fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
|
||||
}
|
||||
#[target_feature(enable = "sse2")]
|
||||
unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
|
||||
fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
|
||||
}
|
||||
unsafe {
|
||||
if is_x86_feature_detected!("avx") {
|
||||
impl_avx($($arg),*)
|
||||
} else if is_x86_feature_detected!("sse2") {
|
||||
impl_sse2($($arg),*)
|
||||
} else {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "std"))]
|
||||
#[inline(always)]
|
||||
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
|
||||
unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
unsafe {
|
||||
if cfg!(target_feature = "avx2") {
|
||||
fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
|
||||
} else if cfg!(target_feature = "avx") {
|
||||
fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
|
||||
} else if cfg!(target_feature = "sse4.1") {
|
||||
fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
|
||||
} else if cfg!(target_feature = "ssse3") {
|
||||
fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
|
||||
} else {
|
||||
fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
|
||||
dispatch_light128!($mach, $MTy, {
|
||||
$([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
|
||||
/// vectors on this platfrom. For x86-64, that would mean SSE2, AVX, and AVX2.
|
||||
///
|
||||
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
|
||||
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
|
||||
/// size is more important.
|
||||
#[macro_export]
|
||||
macro_rules! dispatch_light256 {
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
|
||||
#[cfg(feature = "std")]
|
||||
$([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret {
|
||||
#[inline(always)]
|
||||
fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
use std::arch::x86_64::*;
|
||||
#[target_feature(enable = "avx")]
|
||||
unsafe fn impl_avx($($arg: $argty),*) -> $ret {
|
||||
fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
|
||||
}
|
||||
#[target_feature(enable = "sse2")]
|
||||
unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
|
||||
fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
|
||||
}
|
||||
unsafe {
|
||||
if is_x86_feature_detected!("avx") {
|
||||
impl_avx($($arg),*)
|
||||
} else if is_x86_feature_detected!("sse2") {
|
||||
impl_sse2($($arg),*)
|
||||
} else {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "std"))]
|
||||
#[inline(always)]
|
||||
$($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
|
||||
unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
|
||||
unsafe {
|
||||
if cfg!(target_feature = "avx2") {
|
||||
fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
|
||||
} else if cfg!(target_feature = "avx") {
|
||||
fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
|
||||
} else if cfg!(target_feature = "sse4.1") {
|
||||
fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
|
||||
} else if cfg!(target_feature = "ssse3") {
|
||||
fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
|
||||
} else {
|
||||
fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
|
||||
dispatch_light256!($mach, $MTy, {
|
||||
$([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
|
||||
});
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue