From 369d11587339ce74f8ebc76f2607fe55545eaf7d Mon Sep 17 00:00:00 2001 From: garhve Date: Tue, 20 Dec 2022 11:04:25 +0800 Subject: Build small project following the book --- .../target/doc/src/ppv_lite86/lib.rs.html | 46 + .../target/doc/src/ppv_lite86/soft.rs.html | 946 ++++++ .../target/doc/src/ppv_lite86/types.rs.html | 598 ++++ .../target/doc/src/ppv_lite86/x86_64/mod.rs.html | 876 +++++ .../target/doc/src/ppv_lite86/x86_64/sse2.rs.html | 3408 ++++++++++++++++++++ 5 files changed, 5874 insertions(+) create mode 100644 rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/lib.rs.html create mode 100644 rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/soft.rs.html create mode 100644 rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/types.rs.html create mode 100644 rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/x86_64/mod.rs.html create mode 100644 rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/x86_64/sse2.rs.html (limited to 'rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86') diff --git a/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/lib.rs.html b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/lib.rs.html new file mode 100644 index 0000000..8188dea --- /dev/null +++ b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/lib.rs.html @@ -0,0 +1,46 @@ +lib.rs - source
#![no_std]
+
+// Design:
+// - safety: safe creation of any machine type is done only by instance methods of a
+//   Machine (which is a ZST + Copy type), which can only be created unsafely or safely
+//   through feature detection (e.g. fn AVX2::try_get() -> Option<Machine>).
+
+mod soft;
+mod types;
+pub use self::types::*;
+
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(feature = "no_simd"), not(miri)))]
+pub mod x86_64;
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(feature = "no_simd"), not(miri)))]
+use self::x86_64 as arch;
+
+#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64"), all(target_arch = "x86_64", not(target_feature = "sse2"))))]
+pub mod generic;
+#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64"), all(target_arch = "x86_64", not(target_feature = "sse2"))))]
+use self::generic as arch;
+
+pub use self::arch::{vec128_storage, vec256_storage, vec512_storage};
+
+
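The design note above sketches the safety model: SIMD value types are only created through a Machine instance, and a Machine is only obtained unsafely or behind feature detection. A minimal consumer sketch, assuming only what lib.rs and types.rs export (the `try_get` constructor named in the comment is hypothetical; the crate's real entry points are `unsafe fn instance()` and the dispatch! macros defined in x86_64/mod.rs):

    use ppv_lite86::Machine;

    // Generic over any Machine; the concrete SIMD backend is the caller's choice.
    fn xor_16_bytes<M: Machine>(m: M, a: &[u8], b: &[u8], out: &mut [u8]) {
        let x: M::u32x4 = m.read_le(a); // load 16 little-endian bytes as four u32 words
        let y: M::u32x4 = m.read_le(b);
        (x ^ y).write_le(out);
    }

    fn main() {
        let (a, b, mut out) = ([1u8; 16], [3u8; 16], [0u8; 16]);
        // SAFETY: SSE2 is part of the x86_64 baseline this crate requires.
        let m = unsafe { ppv_lite86::x86_64::SSE2::instance() };
        xor_16_bytes(m, &a, &b, &mut out);
        assert!(out.iter().all(|&v| v == 2)); // 1 ^ 3 == 2 in every byte
    }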
\ No newline at end of file diff --git a/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/soft.rs.html b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/soft.rs.html new file mode 100644 index 0000000..15a524c --- /dev/null +++ b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/soft.rs.html @@ -0,0 +1,946 @@ +soft.rs - source
//! Implement 256- and 512-bit in terms of 128-bit, for machines without native wide SIMD.
+
+use crate::types::*;
+use crate::{vec128_storage, vec256_storage, vec512_storage};
+use core::marker::PhantomData;
+use core::ops::*;
+
+#[derive(Copy, Clone, Default)]
+#[allow(non_camel_case_types)]
+pub struct x2<W, G>(pub [W; 2], PhantomData<G>);
+impl<W, G> x2<W, G> {
+    #[inline(always)]
+    pub fn new(xs: [W; 2]) -> Self {
+        x2(xs, PhantomData)
+    }
+}
+macro_rules! fwd_binop_x2 {
+    ($trait:ident, $fn:ident) => {
+        impl<W: $trait + Copy, G> $trait for x2<W, G> {
+            type Output = x2<W::Output, G>;
+            #[inline(always)]
+            fn $fn(self, rhs: Self) -> Self::Output {
+                x2::new([self.0[0].$fn(rhs.0[0]), self.0[1].$fn(rhs.0[1])])
+            }
+        }
+    };
+}
+macro_rules! fwd_binop_assign_x2 {
+    ($trait:ident, $fn_assign:ident) => {
+        impl<W: $trait + Copy, G> $trait for x2<W, G> {
+            #[inline(always)]
+            fn $fn_assign(&mut self, rhs: Self) {
+                (self.0[0]).$fn_assign(rhs.0[0]);
+                (self.0[1]).$fn_assign(rhs.0[1]);
+            }
+        }
+    };
+}
+macro_rules! fwd_unop_x2 {
+    ($fn:ident) => {
+        #[inline(always)]
+        fn $fn(self) -> Self {
+            x2::new([self.0[0].$fn(), self.0[1].$fn()])
+        }
+    };
+}
+impl<W, G> RotateEachWord32 for x2<W, G>
+where
+    W: Copy + RotateEachWord32,
+{
+    fwd_unop_x2!(rotate_each_word_right7);
+    fwd_unop_x2!(rotate_each_word_right8);
+    fwd_unop_x2!(rotate_each_word_right11);
+    fwd_unop_x2!(rotate_each_word_right12);
+    fwd_unop_x2!(rotate_each_word_right16);
+    fwd_unop_x2!(rotate_each_word_right20);
+    fwd_unop_x2!(rotate_each_word_right24);
+    fwd_unop_x2!(rotate_each_word_right25);
+}
+impl<W, G> RotateEachWord64 for x2<W, G>
+where
+    W: Copy + RotateEachWord64,
+{
+    fwd_unop_x2!(rotate_each_word_right32);
+}
+impl<W, G> RotateEachWord128 for x2<W, G> where W: RotateEachWord128 {}
+impl<W, G> BitOps0 for x2<W, G>
+where
+    W: BitOps0,
+    G: Copy,
+{
+}
+impl<W, G> BitOps32 for x2<W, G>
+where
+    W: BitOps32 + BitOps0,
+    G: Copy,
+{
+}
+impl<W, G> BitOps64 for x2<W, G>
+where
+    W: BitOps64 + BitOps0,
+    G: Copy,
+{
+}
+impl<W, G> BitOps128 for x2<W, G>
+where
+    W: BitOps128 + BitOps0,
+    G: Copy,
+{
+}
+fwd_binop_x2!(BitAnd, bitand);
+fwd_binop_x2!(BitOr, bitor);
+fwd_binop_x2!(BitXor, bitxor);
+fwd_binop_x2!(AndNot, andnot);
+fwd_binop_assign_x2!(BitAndAssign, bitand_assign);
+fwd_binop_assign_x2!(BitOrAssign, bitor_assign);
+fwd_binop_assign_x2!(BitXorAssign, bitxor_assign);
+impl<W, G> ArithOps for x2<W, G>
+where
+    W: ArithOps,
+    G: Copy,
+{
+}
+fwd_binop_x2!(Add, add);
+fwd_binop_assign_x2!(AddAssign, add_assign);
+impl<W: Not + Copy, G> Not for x2<W, G> {
+    type Output = x2<W::Output, G>;
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        x2::new([self.0[0].not(), self.0[1].not()])
+    }
+}
+impl<W, G> UnsafeFrom<[W; 2]> for x2<W, G> {
+    #[inline(always)]
+    unsafe fn unsafe_from(xs: [W; 2]) -> Self {
+        x2::new(xs)
+    }
+}
+impl<W: Copy, G> Vec2<W> for x2<W, G> {
+    #[inline(always)]
+    fn extract(self, i: u32) -> W {
+        self.0[i as usize]
+    }
+    #[inline(always)]
+    fn insert(mut self, w: W, i: u32) -> Self {
+        self.0[i as usize] = w;
+        self
+    }
+}
+impl<W: Copy + Store<vec128_storage>, G> Store<vec256_storage> for x2<W, G> {
+    #[inline(always)]
+    unsafe fn unpack(p: vec256_storage) -> Self {
+        let p = p.split128();
+        x2::new([W::unpack(p[0]), W::unpack(p[1])])
+    }
+}
+impl<W, G> From<x2<W, G>> for vec256_storage
+where
+    W: Copy,
+    vec128_storage: From<W>,
+{
+    #[inline(always)]
+    fn from(x: x2<W, G>) -> Self {
+        vec256_storage::new128([x.0[0].into(), x.0[1].into()])
+    }
+}
+impl<W, G> Swap64 for x2<W, G>
+where
+    W: Swap64 + Copy,
+{
+    fwd_unop_x2!(swap1);
+    fwd_unop_x2!(swap2);
+    fwd_unop_x2!(swap4);
+    fwd_unop_x2!(swap8);
+    fwd_unop_x2!(swap16);
+    fwd_unop_x2!(swap32);
+    fwd_unop_x2!(swap64);
+}
+impl<W: Copy, G> MultiLane<[W; 2]> for x2<W, G> {
+    #[inline(always)]
+    fn to_lanes(self) -> [W; 2] {
+        self.0
+    }
+    #[inline(always)]
+    fn from_lanes(lanes: [W; 2]) -> Self {
+        x2::new(lanes)
+    }
+}
+impl<W: BSwap + Copy, G> BSwap for x2<W, G> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        x2::new([self.0[0].bswap(), self.0[1].bswap()])
+    }
+}
+impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> {
+    #[inline(always)]
+    unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+        let input = input.split_at(input.len() / 2);
+        x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)])
+    }
+    #[inline(always)]
+    unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+        let input = input.split_at(input.len() / 2);
+        x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)])
+    }
+    #[inline(always)]
+    fn write_le(self, out: &mut [u8]) {
+        let out = out.split_at_mut(out.len() / 2);
+        self.0[0].write_le(out.0);
+        self.0[1].write_le(out.1);
+    }
+    #[inline(always)]
+    fn write_be(self, out: &mut [u8]) {
+        let out = out.split_at_mut(out.len() / 2);
+        self.0[0].write_be(out.0);
+        self.0[1].write_be(out.1);
+    }
+}
+impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> {
+    #[inline(always)]
+    fn shuffle_lane_words2301(self) -> Self {
+        Self::new([
+            self.0[0].shuffle_lane_words2301(),
+            self.0[1].shuffle_lane_words2301(),
+        ])
+    }
+    #[inline(always)]
+    fn shuffle_lane_words1230(self) -> Self {
+        Self::new([
+            self.0[0].shuffle_lane_words1230(),
+            self.0[1].shuffle_lane_words1230(),
+        ])
+    }
+    #[inline(always)]
+    fn shuffle_lane_words3012(self) -> Self {
+        Self::new([
+            self.0[0].shuffle_lane_words3012(),
+            self.0[1].shuffle_lane_words3012(),
+        ])
+    }
+}
+
+#[derive(Copy, Clone, Default)]
+#[allow(non_camel_case_types)]
+pub struct x4<W>(pub [W; 4]);
+impl<W> x4<W> {
+    #[inline(always)]
+    pub fn new(xs: [W; 4]) -> Self {
+        x4(xs)
+    }
+}
+macro_rules! fwd_binop_x4 {
+    ($trait:ident, $fn:ident) => {
+        impl<W: $trait + Copy> $trait for x4<W> {
+            type Output = x4<W::Output>;
+            #[inline(always)]
+            fn $fn(self, rhs: Self) -> Self::Output {
+                x4([
+                    self.0[0].$fn(rhs.0[0]),
+                    self.0[1].$fn(rhs.0[1]),
+                    self.0[2].$fn(rhs.0[2]),
+                    self.0[3].$fn(rhs.0[3]),
+                ])
+            }
+        }
+    };
+}
+macro_rules! fwd_binop_assign_x4 {
+    ($trait:ident, $fn_assign:ident) => {
+        impl<W: $trait + Copy> $trait for x4<W> {
+            #[inline(always)]
+            fn $fn_assign(&mut self, rhs: Self) {
+                self.0[0].$fn_assign(rhs.0[0]);
+                self.0[1].$fn_assign(rhs.0[1]);
+                self.0[2].$fn_assign(rhs.0[2]);
+                self.0[3].$fn_assign(rhs.0[3]);
+            }
+        }
+    };
+}
+macro_rules! fwd_unop_x4 {
+    ($fn:ident) => {
+        #[inline(always)]
+        fn $fn(self) -> Self {
+            x4([
+                self.0[0].$fn(),
+                self.0[1].$fn(),
+                self.0[2].$fn(),
+                self.0[3].$fn(),
+            ])
+        }
+    };
+}
+impl<W> RotateEachWord32 for x4<W>
+where
+    W: Copy + RotateEachWord32,
+{
+    fwd_unop_x4!(rotate_each_word_right7);
+    fwd_unop_x4!(rotate_each_word_right8);
+    fwd_unop_x4!(rotate_each_word_right11);
+    fwd_unop_x4!(rotate_each_word_right12);
+    fwd_unop_x4!(rotate_each_word_right16);
+    fwd_unop_x4!(rotate_each_word_right20);
+    fwd_unop_x4!(rotate_each_word_right24);
+    fwd_unop_x4!(rotate_each_word_right25);
+}
+impl<W> RotateEachWord64 for x4<W>
+where
+    W: Copy + RotateEachWord64,
+{
+    fwd_unop_x4!(rotate_each_word_right32);
+}
+impl<W> RotateEachWord128 for x4<W> where W: RotateEachWord128 {}
+impl<W> BitOps0 for x4<W> where W: BitOps0 {}
+impl<W> BitOps32 for x4<W> where W: BitOps32 + BitOps0 {}
+impl<W> BitOps64 for x4<W> where W: BitOps64 + BitOps0 {}
+impl<W> BitOps128 for x4<W> where W: BitOps128 + BitOps0 {}
+fwd_binop_x4!(BitAnd, bitand);
+fwd_binop_x4!(BitOr, bitor);
+fwd_binop_x4!(BitXor, bitxor);
+fwd_binop_x4!(AndNot, andnot);
+fwd_binop_assign_x4!(BitAndAssign, bitand_assign);
+fwd_binop_assign_x4!(BitOrAssign, bitor_assign);
+fwd_binop_assign_x4!(BitXorAssign, bitxor_assign);
+impl<W> ArithOps for x4<W> where W: ArithOps {}
+fwd_binop_x4!(Add, add);
+fwd_binop_assign_x4!(AddAssign, add_assign);
+impl<W: Not + Copy> Not for x4<W> {
+    type Output = x4<W::Output>;
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        x4([
+            self.0[0].not(),
+            self.0[1].not(),
+            self.0[2].not(),
+            self.0[3].not(),
+        ])
+    }
+}
+impl<W> UnsafeFrom<[W; 4]> for x4<W> {
+    #[inline(always)]
+    unsafe fn unsafe_from(xs: [W; 4]) -> Self {
+        x4(xs)
+    }
+}
+impl<W: Copy> Vec4<W> for x4<W> {
+    #[inline(always)]
+    fn extract(self, i: u32) -> W {
+        self.0[i as usize]
+    }
+    #[inline(always)]
+    fn insert(mut self, w: W, i: u32) -> Self {
+        self.0[i as usize] = w;
+        self
+    }
+}
+impl<W: Copy> Vec4Ext<W> for x4<W> {
+    #[inline(always)]
+    fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
+    where
+        Self: Sized,
+    {
+        (
+            x4([a.0[0], b.0[0], c.0[0], d.0[0]]),
+            x4([a.0[1], b.0[1], c.0[1], d.0[1]]),
+            x4([a.0[2], b.0[2], c.0[2], d.0[2]]),
+            x4([a.0[3], b.0[3], c.0[3], d.0[3]]),
+        )
+    }
+}
+impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> {
+    #[inline(always)]
+    unsafe fn unpack(p: vec512_storage) -> Self {
+        let p = p.split128();
+        x4([
+            W::unpack(p[0]),
+            W::unpack(p[1]),
+            W::unpack(p[2]),
+            W::unpack(p[3]),
+        ])
+    }
+}
+impl<W> From<x4<W>> for vec512_storage
+where
+    W: Copy,
+    vec128_storage: From<W>,
+{
+    #[inline(always)]
+    fn from(x: x4<W>) -> Self {
+        vec512_storage::new128([x.0[0].into(), x.0[1].into(), x.0[2].into(), x.0[3].into()])
+    }
+}
+impl<W> Swap64 for x4<W>
+where
+    W: Swap64 + Copy,
+{
+    fwd_unop_x4!(swap1);
+    fwd_unop_x4!(swap2);
+    fwd_unop_x4!(swap4);
+    fwd_unop_x4!(swap8);
+    fwd_unop_x4!(swap16);
+    fwd_unop_x4!(swap32);
+    fwd_unop_x4!(swap64);
+}
+impl<W: Copy> MultiLane<[W; 4]> for x4<W> {
+    #[inline(always)]
+    fn to_lanes(self) -> [W; 4] {
+        self.0
+    }
+    #[inline(always)]
+    fn from_lanes(lanes: [W; 4]) -> Self {
+        x4(lanes)
+    }
+}
+impl<W: BSwap + Copy> BSwap for x4<W> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        x4([
+            self.0[0].bswap(),
+            self.0[1].bswap(),
+            self.0[2].bswap(),
+            self.0[3].bswap(),
+        ])
+    }
+}
+impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> {
+    #[inline(always)]
+    unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+        let n = input.len() / 4;
+        x4([
+            W::unsafe_read_le(&input[..n]),
+            W::unsafe_read_le(&input[n..n * 2]),
+            W::unsafe_read_le(&input[n * 2..n * 3]),
+            W::unsafe_read_le(&input[n * 3..]),
+        ])
+    }
+    #[inline(always)]
+    unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+        let n = input.len() / 4;
+        x4([
+            W::unsafe_read_be(&input[..n]),
+            W::unsafe_read_be(&input[n..n * 2]),
+            W::unsafe_read_be(&input[n * 2..n * 3]),
+            W::unsafe_read_be(&input[n * 3..]),
+        ])
+    }
+    #[inline(always)]
+    fn write_le(self, out: &mut [u8]) {
+        let n = out.len() / 4;
+        self.0[0].write_le(&mut out[..n]);
+        self.0[1].write_le(&mut out[n..n * 2]);
+        self.0[2].write_le(&mut out[n * 2..n * 3]);
+        self.0[3].write_le(&mut out[n * 3..]);
+    }
+    #[inline(always)]
+    fn write_be(self, out: &mut [u8]) {
+        let n = out.len() / 4;
+        self.0[0].write_be(&mut out[..n]);
+        self.0[1].write_be(&mut out[n..n * 2]);
+        self.0[2].write_be(&mut out[n * 2..n * 3]);
+        self.0[3].write_be(&mut out[n * 3..]);
+    }
+}
+impl<W: Copy + LaneWords4> LaneWords4 for x4<W> {
+    #[inline(always)]
+    fn shuffle_lane_words2301(self) -> Self {
+        x4([
+            self.0[0].shuffle_lane_words2301(),
+            self.0[1].shuffle_lane_words2301(),
+            self.0[2].shuffle_lane_words2301(),
+            self.0[3].shuffle_lane_words2301(),
+        ])
+    }
+    #[inline(always)]
+    fn shuffle_lane_words1230(self) -> Self {
+        x4([
+            self.0[0].shuffle_lane_words1230(),
+            self.0[1].shuffle_lane_words1230(),
+            self.0[2].shuffle_lane_words1230(),
+            self.0[3].shuffle_lane_words1230(),
+        ])
+    }
+    #[inline(always)]
+    fn shuffle_lane_words3012(self) -> Self {
+        x4([
+            self.0[0].shuffle_lane_words3012(),
+            self.0[1].shuffle_lane_words3012(),
+            self.0[2].shuffle_lane_words3012(),
+            self.0[3].shuffle_lane_words3012(),
+        ])
+    }
+}
+
+
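The two wrappers above are the whole of the software fallback: an x2 is just two 128-bit values and an x4 is four, and every trait implementation forwards the operation to each half independently. A small sketch of that forwarding, using u128 as the word type purely for illustration (x2 is crate-internal, so this is not a public entry point):

    // Two 128-bit halves stand in for one 256-bit value; no cross-lane work is needed.
    let a = x2::<u128, ()>::new([0x1111, 0x2222]);
    let b = x2::<u128, ()>::new([0x00ff, 0xff00]);
    let c = a ^ b; // forwards to [0x1111 ^ 0x00ff, 0x2222 ^ 0xff00]
    assert_eq!(c.0, [0x11ee, 0xdd22]);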
\ No newline at end of file diff --git a/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/types.rs.html b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/types.rs.html new file mode 100644 index 0000000..40e2382 --- /dev/null +++ b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/types.rs.html @@ -0,0 +1,598 @@ +types.rs - source
#![allow(non_camel_case_types)]
+use core::ops::{Add, AddAssign, BitAnd, BitOr, BitXor, BitXorAssign, Not};
+
+pub trait AndNot {
+    type Output;
+    fn andnot(self, rhs: Self) -> Self::Output;
+}
+pub trait BSwap {
+    fn bswap(self) -> Self;
+}
+/// Ops that depend on word size
+pub trait ArithOps: Add<Output = Self> + AddAssign + Sized + Copy + Clone + BSwap {}
+/// Ops that are independent of word size and endian
+pub trait BitOps0:
+    BitAnd<Output = Self>
+    + BitOr<Output = Self>
+    + BitXor<Output = Self>
+    + BitXorAssign
+    + Not<Output = Self>
+    + AndNot<Output = Self>
+    + Sized
+    + Copy
+    + Clone
+{
+}
+
+pub trait BitOps32: BitOps0 + RotateEachWord32 {}
+pub trait BitOps64: BitOps32 + RotateEachWord64 {}
+pub trait BitOps128: BitOps64 + RotateEachWord128 {}
+
+pub trait RotateEachWord32 {
+    fn rotate_each_word_right7(self) -> Self;
+    fn rotate_each_word_right8(self) -> Self;
+    fn rotate_each_word_right11(self) -> Self;
+    fn rotate_each_word_right12(self) -> Self;
+    fn rotate_each_word_right16(self) -> Self;
+    fn rotate_each_word_right20(self) -> Self;
+    fn rotate_each_word_right24(self) -> Self;
+    fn rotate_each_word_right25(self) -> Self;
+}
+
+pub trait RotateEachWord64 {
+    fn rotate_each_word_right32(self) -> Self;
+}
+
+pub trait RotateEachWord128 {}
+
+// Vector type naming scheme:
+// uN[xP]xL
+// Unsigned; N-bit words * P words per lane * L lanes
+//
+// A lane is always 128 bits, chosen because common SIMD architectures treat 128-bit units of
+// wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and
+// slow inter-lane operations.
+
+use crate::arch::{vec128_storage, vec256_storage, vec512_storage};
+
+#[allow(clippy::missing_safety_doc)]
+pub trait UnsafeFrom<T> {
+    unsafe fn unsafe_from(t: T) -> Self;
+}
+
+/// A vector composed of two elements, which may be words or themselves vectors.
+pub trait Vec2<W> {
+    fn extract(self, i: u32) -> W;
+    fn insert(self, w: W, i: u32) -> Self;
+}
+
+/// A vector composed of four elements, which may be words or themselves vectors.
+pub trait Vec4<W> {
+    fn extract(self, i: u32) -> W;
+    fn insert(self, w: W, i: u32) -> Self;
+}
+/// Vec4 functions which may not be implemented yet for all Vec4 types.
+/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage,
+/// import Vec4Ext only together with Vec4, and don't qualify its methods.
+pub trait Vec4Ext<W> {
+    fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
+    where
+        Self: Sized;
+}
+pub trait Vector<T> {
+    fn to_scalars(self) -> T;
+}
+
+// TODO: multiples of 4 should inherit this
+/// A vector composed of four words; depending on their size, operations may cross lanes.
+pub trait Words4 {
+    fn shuffle1230(self) -> Self;
+    fn shuffle2301(self) -> Self;
+    fn shuffle3012(self) -> Self;
+}
+
+/// A vector composed of one or more lanes, each composed of four words.
+pub trait LaneWords4 {
+    fn shuffle_lane_words1230(self) -> Self;
+    fn shuffle_lane_words2301(self) -> Self;
+    fn shuffle_lane_words3012(self) -> Self;
+}
+
+// TODO: make this a part of BitOps
+/// Exchange neighboring ranges of bits of the specified size
+pub trait Swap64 {
+    fn swap1(self) -> Self;
+    fn swap2(self) -> Self;
+    fn swap4(self) -> Self;
+    fn swap8(self) -> Self;
+    fn swap16(self) -> Self;
+    fn swap32(self) -> Self;
+    fn swap64(self) -> Self;
+}
+
+pub trait u32x4<M: Machine>:
+    BitOps32
+    + Store<vec128_storage>
+    + ArithOps
+    + Vec4<u32>
+    + Words4
+    + LaneWords4
+    + StoreBytes
+    + MultiLane<[u32; 4]>
+    + Into<vec128_storage>
+{
+}
+pub trait u64x2<M: Machine>:
+    BitOps64 + Store<vec128_storage> + ArithOps + Vec2<u64> + MultiLane<[u64; 2]> + Into<vec128_storage>
+{
+}
+pub trait u128x1<M: Machine>:
+    BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage>
+{
+}
+
+pub trait u32x4x2<M: Machine>:
+    BitOps32
+    + Store<vec256_storage>
+    + Vec2<M::u32x4>
+    + MultiLane<[M::u32x4; 2]>
+    + ArithOps
+    + Into<vec256_storage>
+    + StoreBytes
+{
+}
+pub trait u64x2x2<M: Machine>:
+    BitOps64
+    + Store<vec256_storage>
+    + Vec2<M::u64x2>
+    + MultiLane<[M::u64x2; 2]>
+    + ArithOps
+    + StoreBytes
+    + Into<vec256_storage>
+{
+}
+pub trait u64x4<M: Machine>:
+    BitOps64
+    + Store<vec256_storage>
+    + Vec4<u64>
+    + MultiLane<[u64; 4]>
+    + ArithOps
+    + Words4
+    + StoreBytes
+    + Into<vec256_storage>
+{
+}
+pub trait u128x2<M: Machine>:
+    BitOps128
+    + Store<vec256_storage>
+    + Vec2<M::u128x1>
+    + MultiLane<[M::u128x1; 2]>
+    + Swap64
+    + Into<vec256_storage>
+{
+}
+
+pub trait u32x4x4<M: Machine>:
+    BitOps32
+    + Store<vec512_storage>
+    + Vec4<M::u32x4>
+    + Vec4Ext<M::u32x4>
+    + Vector<[u32; 16]>
+    + MultiLane<[M::u32x4; 4]>
+    + ArithOps
+    + LaneWords4
+    + Into<vec512_storage>
+    + StoreBytes
+{
+}
+pub trait u64x2x4<M: Machine>:
+    BitOps64
+    + Store<vec512_storage>
+    + Vec4<M::u64x2>
+    + MultiLane<[M::u64x2; 4]>
+    + ArithOps
+    + Into<vec512_storage>
+{
+}
+// TODO: Words4
+pub trait u128x4<M: Machine>:
+    BitOps128
+    + Store<vec512_storage>
+    + Vec4<M::u128x1>
+    + MultiLane<[M::u128x1; 4]>
+    + Swap64
+    + Into<vec512_storage>
+{
+}
+
+/// A vector composed of multiple 128-bit lanes.
+pub trait MultiLane<Lanes> {
+    /// Split a multi-lane vector into single-lane vectors.
+    fn to_lanes(self) -> Lanes;
+    /// Build a multi-lane vector from individual lanes.
+    fn from_lanes(lanes: Lanes) -> Self;
+}
+
+/// Combine single vectors into a multi-lane vector.
+pub trait VZip<V> {
+    fn vzip(self) -> V;
+}
+
+impl<V, T> VZip<V> for T
+where
+    V: MultiLane<T>,
+{
+    #[inline(always)]
+    fn vzip(self) -> V {
+        V::from_lanes(self)
+    }
+}
+
+pub trait Machine: Sized + Copy {
+    type u32x4: u32x4<Self>;
+    type u64x2: u64x2<Self>;
+    type u128x1: u128x1<Self>;
+
+    type u32x4x2: u32x4x2<Self>;
+    type u64x2x2: u64x2x2<Self>;
+    type u64x4: u64x4<Self>;
+    type u128x2: u128x2<Self>;
+
+    type u32x4x4: u32x4x4<Self>;
+    type u64x2x4: u64x2x4<Self>;
+    type u128x4: u128x4<Self>;
+
+    #[inline(always)]
+    fn unpack<S, V: Store<S>>(self, s: S) -> V {
+        unsafe { V::unpack(s) }
+    }
+
+    #[inline(always)]
+    fn vec<V, A>(self, a: A) -> V
+    where
+        V: MultiLane<A>,
+    {
+        V::from_lanes(a)
+    }
+
+    #[inline(always)]
+    fn read_le<V>(self, input: &[u8]) -> V
+    where
+        V: StoreBytes,
+    {
+        unsafe { V::unsafe_read_le(input) }
+    }
+
+    #[inline(always)]
+    fn read_be<V>(self, input: &[u8]) -> V
+    where
+        V: StoreBytes,
+    {
+        unsafe { V::unsafe_read_be(input) }
+    }
+
+    /// # Safety
+    /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+    /// environment.
+    unsafe fn instance() -> Self;
+}
+
+pub trait Store<S> {
+    /// # Safety
+    /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+    /// environment.
+    unsafe fn unpack(p: S) -> Self;
+}
+
+pub trait StoreBytes {
+    /// # Safety
+    /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+    /// environment.
+    unsafe fn unsafe_read_le(input: &[u8]) -> Self;
+    /// # Safety
+    /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+    /// environment.
+    unsafe fn unsafe_read_be(input: &[u8]) -> Self;
+    fn write_le(self, out: &mut [u8]);
+    fn write_be(self, out: &mut [u8]);
+}
+
+
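A worked reading of the naming scheme and the Machine plumbing above, as a sketch against whichever Machine implementation the caller holds: u32x4 is one 128-bit lane of four 32-bit words, and u32x4x4 is four such lanes, i.e. 4 * 4 * 32 = 512 bits. Lane construction goes through Machine::vec, which is just MultiLane::from_lanes behind a method:

    use ppv_lite86::Machine;

    // Build a 512-bit u32x4x4 by replicating one 128-bit lane four times.
    fn splat_lane<M: Machine>(m: M) -> M::u32x4x4 {
        let lane: M::u32x4 = m.vec([1u32, 2, 3, 4]); // one lane: four 32-bit words
        m.vec([lane, lane, lane, lane])              // four lanes: 512 bits total
    }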
\ No newline at end of file diff --git a/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/x86_64/mod.rs.html b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/x86_64/mod.rs.html new file mode 100644 index 0000000..c649c50 --- /dev/null +++ b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/x86_64/mod.rs.html @@ -0,0 +1,876 @@ +mod.rs - source
// crate minimums: sse2, x86_64
+
+use crate::types::*;
+use core::arch::x86_64::{__m128i, __m256i};
+
+mod sse2;
+
+#[derive(Copy, Clone)]
+pub struct YesS3;
+#[derive(Copy, Clone)]
+pub struct NoS3;
+
+#[derive(Copy, Clone)]
+pub struct YesS4;
+#[derive(Copy, Clone)]
+pub struct NoS4;
+
+#[derive(Copy, Clone)]
+pub struct YesA1;
+#[derive(Copy, Clone)]
+pub struct NoA1;
+
+#[derive(Copy, Clone)]
+pub struct YesA2;
+#[derive(Copy, Clone)]
+pub struct NoA2;
+
+#[derive(Copy, Clone)]
+pub struct YesNI;
+#[derive(Copy, Clone)]
+pub struct NoNI;
+
+use core::marker::PhantomData;
+
+#[derive(Copy, Clone)]
+pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
+impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
+where
+    sse2::u128x1_sse2<S3, S4, NI>: Swap64,
+    sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
+    sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
+    sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
+    sse2::u128x1_sse2<S3, S4, NI>: BSwap,
+    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
+    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
+    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
+    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
+    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
+{
+    type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
+    type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
+    type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
+
+    type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
+    type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
+    type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
+    type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
+
+    type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
+    type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
+    type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
+
+    #[inline(always)]
+    unsafe fn instance() -> Self {
+        SseMachine(PhantomData)
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct Avx2Machine<NI>(PhantomData<NI>);
+impl<NI: Copy> Machine for Avx2Machine<NI>
+where
+    sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
+    sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
+    sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
+    sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
+{
+    type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
+    type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
+    type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
+
+    type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
+    type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
+    type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
+    type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
+
+    type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
+    type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
+    type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
+
+    #[inline(always)]
+    unsafe fn instance() -> Self {
+        Avx2Machine(PhantomData)
+    }
+}
+
+pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
+pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
+pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
+/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
+/// to avoid expensive SSE/VEX conflicts.
+pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
+pub type AVX2 = Avx2Machine<NoNI>;
+
+/// Generic wrapper for unparameterized storage of any of the possible impls.
+/// Converting into and out of this type should be essentially free, although it may be more
+/// aligned than a particular impl requires.
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec128_storage {
+    u32x4: [u32; 4],
+    u64x2: [u64; 2],
+    u128x1: [u128; 1],
+    sse2: __m128i,
+}
+impl Store<vec128_storage> for vec128_storage {
+    #[inline(always)]
+    unsafe fn unpack(p: vec128_storage) -> Self {
+        p
+    }
+}
+impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
+    #[inline(always)]
+    fn from(x: &'a vec128_storage) -> Self {
+        unsafe { &x.u32x4 }
+    }
+}
+impl From<[u32; 4]> for vec128_storage {
+    #[inline(always)]
+    fn from(u32x4: [u32; 4]) -> Self {
+        vec128_storage { u32x4 }
+    }
+}
+impl Default for vec128_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        vec128_storage { u128x1: [0] }
+    }
+}
+impl Eq for vec128_storage {}
+impl PartialEq for vec128_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.u128x1 == rhs.u128x1 }
+    }
+}
+
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec256_storage {
+    u32x8: [u32; 8],
+    u64x4: [u64; 4],
+    u128x2: [u128; 2],
+    sse2: [vec128_storage; 2],
+    avx: __m256i,
+}
+impl From<[u64; 4]> for vec256_storage {
+    #[inline(always)]
+    fn from(u64x4: [u64; 4]) -> Self {
+        vec256_storage { u64x4 }
+    }
+}
+impl Default for vec256_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        vec256_storage { u128x2: [0, 0] }
+    }
+}
+impl vec256_storage {
+    #[inline(always)]
+    pub fn new128(xs: [vec128_storage; 2]) -> Self {
+        Self { sse2: xs }
+    }
+    #[inline(always)]
+    pub fn split128(self) -> [vec128_storage; 2] {
+        unsafe { self.sse2 }
+    }
+}
+impl Eq for vec256_storage {}
+impl PartialEq for vec256_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.sse2 == rhs.sse2 }
+    }
+}
+
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec512_storage {
+    u32x16: [u32; 16],
+    u64x8: [u64; 8],
+    u128x4: [u128; 4],
+    sse2: [vec128_storage; 4],
+    avx: [vec256_storage; 2],
+}
+impl Default for vec512_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        vec512_storage {
+            u128x4: [0, 0, 0, 0],
+        }
+    }
+}
+impl vec512_storage {
+    #[inline(always)]
+    pub fn new128(xs: [vec128_storage; 4]) -> Self {
+        Self { sse2: xs }
+    }
+    #[inline(always)]
+    pub fn split128(self) -> [vec128_storage; 4] {
+        unsafe { self.sse2 }
+    }
+}
+impl Eq for vec512_storage {}
+impl PartialEq for vec512_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.avx == rhs.avx }
+    }
+}
+
+macro_rules! impl_into {
+    ($storage:ident, $array:ty, $name:ident) => {
+        impl From<$storage> for $array {
+            #[inline(always)]
+            fn from(vec: $storage) -> Self {
+                unsafe { vec.$name }
+            }
+        }
+    };
+}
+impl_into!(vec128_storage, [u32; 4], u32x4);
+impl_into!(vec128_storage, [u64; 2], u64x2);
+impl_into!(vec128_storage, [u128; 1], u128x1);
+impl_into!(vec256_storage, [u32; 8], u32x8);
+impl_into!(vec256_storage, [u64; 4], u64x4);
+impl_into!(vec256_storage, [u128; 2], u128x2);
+impl_into!(vec512_storage, [u32; 16], u32x16);
+impl_into!(vec512_storage, [u64; 8], u64x8);
+impl_into!(vec512_storage, [u128; 4], u128x4);
+
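The impl_into! conversions above give each storage union a by-value view as a plain array. A short sketch of the round trip (the unions hold only plain-old-data variants, so reinterpreting the same 128 bits through a different array view is the intended use):

    let s: vec128_storage = [1u32, 2, 3, 4].into(); // store four 32-bit words
    let back: [u32; 4] = s.into();                  // read the same words out
    assert_eq!(back, [1, 2, 3, 4]);
    let qwords: [u64; 2] = s.into();                // same bits, viewed as two u64 words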
+/// Generate the full set of optimized implementations to take advantage of the most important
+/// hardware feature sets.
+///
+/// This dispatcher is suitable for maximizing throughput.
+#[macro_export]
+macro_rules! dispatch {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[cfg(feature = "std")]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            use std::arch::x86_64::*;
+            #[target_feature(enable = "avx2")]
+            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
+                let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
+                _mm256_zeroupper();
+                ret
+            }
+            #[target_feature(enable = "avx")]
+            #[target_feature(enable = "sse4.1")]
+            #[target_feature(enable = "ssse3")]
+            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+                let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
+                _mm256_zeroupper();
+                ret
+            }
+            #[target_feature(enable = "sse4.1")]
+            #[target_feature(enable = "ssse3")]
+            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "ssse3")]
+            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "sse2")]
+            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+            }
+            unsafe {
+                if is_x86_feature_detected!("avx2") {
+                    impl_avx2($($arg),*)
+                } else if is_x86_feature_detected!("avx") {
+                    impl_avx($($arg),*)
+                } else if is_x86_feature_detected!("sse4.1") {
+                    impl_sse41($($arg),*)
+                } else if is_x86_feature_detected!("ssse3") {
+                    impl_ssse3($($arg),*)
+                } else if is_x86_feature_detected!("sse2") {
+                    impl_sse2($($arg),*)
+                } else {
+                    unimplemented!()
+                }
+            }
+        }
+        #[cfg(not(feature = "std"))]
+        #[inline(always)]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            unsafe {
+                if cfg!(target_feature = "avx2") {
+                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+                } else if cfg!(target_feature = "avx") {
+                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+                } else if cfg!(target_feature = "sse4.1") {
+                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+                } else if cfg!(target_feature = "ssse3") {
+                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+                } else {
+                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+                }
+            }
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+
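A sketch of how a dependent crate invokes the dispatcher defined above; the names are illustrative. The braced function body is compiled once per feature level, with `m` bound to the Machine selected at runtime (or fixed at compile time in no_std builds):

    use ppv_lite86::dispatch;

    dispatch!(m, M, {
        fn xor_block(a: &[u8], b: &[u8], out: &mut [u8]) {
            // Inside the body, `m: M` is in scope and M implements Machine
            // (AVX2, AVX, SSE4.1, SSSE3 or SSE2, whichever was detected).
            let x: M::u32x4 = m.read_le(a);
            let y: M::u32x4 = m.read_le(b);
            (x ^ y).write_le(out);
        }
    });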
+/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
+/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
+///
+/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
+/// features (e.g. because they are done infrequently), so minimizing their contribution to code
+/// size is more important.
+#[macro_export]
+macro_rules! dispatch_light128 {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[cfg(feature = "std")]
+        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            use std::arch::x86_64::*;
+            #[target_feature(enable = "avx")]
+            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "sse2")]
+            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+            }
+            unsafe {
+                if is_x86_feature_detected!("avx") {
+                    impl_avx($($arg),*)
+                } else if is_x86_feature_detected!("sse2") {
+                    impl_sse2($($arg),*)
+                } else {
+                    unimplemented!()
+                }
+            }
+        }
+        #[cfg(not(feature = "std"))]
+        #[inline(always)]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            unsafe {
+                if cfg!(target_feature = "avx2") {
+                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+                } else if cfg!(target_feature = "avx") {
+                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+                } else if cfg!(target_feature = "sse4.1") {
+                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+                } else if cfg!(target_feature = "ssse3") {
+                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+                } else {
+                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+                }
+            }
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch_light128!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+
+/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
+/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
+///
+/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
+/// features (e.g. because they are done infrequently), so minimizing their contribution to code
+/// size is more important.
+#[macro_export]
+macro_rules! dispatch_light256 {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[cfg(feature = "std")]
+        $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret {
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            use std::arch::x86_64::*;
+            #[target_feature(enable = "avx")]
+            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "sse2")]
+            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+            }
+            unsafe {
+                if is_x86_feature_detected!("avx") {
+                    impl_avx($($arg),*)
+                } else if is_x86_feature_detected!("sse2") {
+                    impl_sse2($($arg),*)
+                } else {
+                    unimplemented!()
+                }
+            }
+        }
+        #[cfg(not(feature = "std"))]
+        #[inline(always)]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            unsafe {
+                if cfg!(target_feature = "avx2") {
+                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+                } else if cfg!(target_feature = "avx") {
+                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+                } else if cfg!(target_feature = "sse4.1") {
+                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+                } else if cfg!(target_feature = "ssse3") {
+                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+                } else {
+                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+                }
+            }
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch_light256!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+
+
\ No newline at end of file diff --git a/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/x86_64/sse2.rs.html b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/x86_64/sse2.rs.html new file mode 100644 index 0000000..609cf09 --- /dev/null +++ b/rust/theBook/chapter-2-guessing-game/guessing_game/target/doc/src/ppv_lite86/x86_64/sse2.rs.html @@ -0,0 +1,3408 @@ +sse2.rs - source
use crate::soft::{x2, x4};
+use crate::types::*;
+use crate::vec128_storage;
+use crate::x86_64::Avx2Machine;
+use crate::x86_64::SseMachine as Machine86;
+use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
+use core::arch::x86_64::*;
+use core::marker::PhantomData;
+use core::ops::{
+    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
+};
+
+macro_rules! impl_binop {
+    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
+        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
+            type Output = Self;
+            #[inline(always)]
+            fn $fn(self, rhs: Self) -> Self::Output {
+                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
+            }
+        }
+    };
+}
+
+macro_rules! impl_binop_assign {
+    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
+        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
+        where
+            $vec<S3, S4, NI>: Copy,
+        {
+            #[inline(always)]
+            fn $fn_assign(&mut self, rhs: Self) {
+                *self = self.$fn(rhs);
+            }
+        }
+    };
+}
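+
+// Illustration (hand-written sketch, not macro output): a call such as
+// `impl_binop!(u32x4_sse2, BitXor, bitxor, _mm_xor_si128)` expands to roughly
+//
+//     impl<S3, S4, NI> BitXor for u32x4_sse2<S3, S4, NI> {
+//         type Output = Self;
+//         fn bitxor(self, rhs: Self) -> Self::Output {
+//             Self::new(unsafe { _mm_xor_si128(self.x, rhs.x) })
+//         }
+//     }
+//
+// while `impl_binop_assign!` forwards the corresponding `*Assign` trait to the
+// plain operator defined this way.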
+
+macro_rules! def_vec {
+    ($vec:ident, $word:ident) => {
+        #[allow(non_camel_case_types)]
+        #[derive(Copy, Clone)]
+        pub struct $vec<S3, S4, NI> {
+            x: __m128i,
+            s3: PhantomData<S3>,
+            s4: PhantomData<S4>,
+            ni: PhantomData<NI>,
+        }
+
+        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
+            #[inline(always)]
+            unsafe fn unpack(x: vec128_storage) -> Self {
+                Self::new(x.sse2)
+            }
+        }
+        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
+            #[inline(always)]
+            fn from(x: $vec<S3, S4, NI>) -> Self {
+                vec128_storage { sse2: x.x }
+            }
+        }
+        impl<S3, S4, NI> $vec<S3, S4, NI> {
+            #[inline(always)]
+            fn new(x: __m128i) -> Self {
+                $vec {
+                    x,
+                    s3: PhantomData,
+                    s4: PhantomData,
+                    ni: PhantomData,
+                }
+            }
+        }
+
+        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
+        where
+            Self: BSwap,
+        {
+            #[inline(always)]
+            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+                assert_eq!(input.len(), 16);
+                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
+            }
+            #[inline(always)]
+            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+                assert_eq!(input.len(), 16);
+                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
+            }
+            #[inline(always)]
+            fn write_le(self, out: &mut [u8]) {
+                assert_eq!(out.len(), 16);
+                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
+            }
+            #[inline(always)]
+            fn write_be(self, out: &mut [u8]) {
+                assert_eq!(out.len(), 16);
+                let x = self.bswap().x;
+                unsafe {
+                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
+                }
+            }
+        }
+
+        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
+            #[inline(always)]
+            fn default() -> Self {
+                Self::new(unsafe { _mm_setzero_si128() })
+            }
+        }
+
+        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
+            type Output = Self;
+            #[inline(always)]
+            fn not(self) -> Self::Output {
+                unsafe {
+                    let ff = _mm_set1_epi64x(-1i64);
+                    self ^ Self::new(ff)
+                }
+            }
+        }
+
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
+        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
+        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
+        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
+        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
+        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
+        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
+        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
+            type Output = Self;
+            #[inline(always)]
+            fn andnot(self, rhs: Self) -> Self {
+                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
+            }
+        }
+    };
+}
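+
+// `def_vec!` wraps a raw `__m128i` in a newtype whose phantom parameters record,
+// at the type level, which instruction sets the vector may assume (S3 for SSSE3,
+// S4 for SSE4.1 and NI, presumably, for the AES-NI-style native instructions),
+// and generates the storage conversions, unaligned byte I/O, `Default`, `Not`
+// and bitwise operator impls above for that newtype.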
+
+macro_rules! impl_bitops32 {
+    ($vec:ident) => {
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
+            $vec<S3, S4, NI>: RotateEachWord32
+        {
+        }
+    };
+}
+
+macro_rules! impl_bitops64 {
+    ($vec:ident) => {
+        impl_bitops32!($vec);
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
+            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
+        {
+        }
+    };
+}
+
+macro_rules! impl_bitops128 {
+    ($vec:ident) => {
+        impl_bitops64!($vec);
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
+            $vec<S3, S4, NI>: RotateEachWord128
+        {
+        }
+    };
+}
+
+macro_rules! rotr_32_s3 {
+    ($name:ident, $k0:expr, $k1:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
+        }
+    };
+}
+macro_rules! rotr_32 {
+    ($name:ident, $i:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_epi32(self.x, $i as i32),
+                    _mm_slli_epi32(self.x, 32 - $i as i32),
+                )
+            })
+        }
+    };
+}
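+
+// `rotr_32_s3` implements byte-multiple rotations as a single SSSE3 byte shuffle
+// (`_mm_shuffle_epi8` with a constant index vector), while `rotr_32` is the
+// plain-SSE2 form that ORs a logical right shift with the complementary left
+// shift. A hypothetical scalar sketch of one lane (not part of the crate):
+#[allow(dead_code)]
+fn rotr32_scalar_sketch(x: u32, i: u32) -> u32 {
+    // mirrors _mm_or_si128(_mm_srli_epi32(x, i), _mm_slli_epi32(x, 32 - i));
+    // assumes 0 < i < 32
+    (x >> i) | (x << (32 - i))
+}
+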
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
+    rotr_32!(rotate_each_word_right7, 7);
+    rotr_32_s3!(
+        rotate_each_word_right8,
+        0x0c0f_0e0d_080b_0a09,
+        0x0407_0605_0003_0201
+    );
+    rotr_32!(rotate_each_word_right11, 11);
+    rotr_32!(rotate_each_word_right12, 12);
+    rotr_32_s3!(
+        rotate_each_word_right16,
+        0x0d0c_0f0e_0908_0b0a,
+        0x0504_0706_0100_0302
+    );
+    rotr_32!(rotate_each_word_right20, 20);
+    rotr_32_s3!(
+        rotate_each_word_right24,
+        0x0e0d_0c0f_0a09_080b,
+        0x0605_0407_0201_0003
+    );
+    rotr_32!(rotate_each_word_right25, 25);
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
+    rotr_32!(rotate_each_word_right7, 7);
+    rotr_32!(rotate_each_word_right8, 8);
+    rotr_32!(rotate_each_word_right11, 11);
+    rotr_32!(rotate_each_word_right12, 12);
+    #[inline(always)]
+    fn rotate_each_word_right16(self) -> Self {
+        Self::new(swap16_s2(self.x))
+    }
+    rotr_32!(rotate_each_word_right20, 20);
+    rotr_32!(rotate_each_word_right24, 24);
+    rotr_32!(rotate_each_word_right25, 25);
+}
+
+macro_rules! rotr_64_s3 {
+    ($name:ident, $k0:expr, $k1:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
+        }
+    };
+}
+macro_rules! rotr_64 {
+    ($name:ident, $i:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_epi64(self.x, $i as i32),
+                    _mm_slli_epi64(self.x, 64 - $i as i32),
+                )
+            })
+        }
+    };
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
+    rotr_64!(rotate_each_word_right7, 7);
+    rotr_64_s3!(
+        rotate_each_word_right8,
+        0x080f_0e0d_0c0b_0a09,
+        0x0007_0605_0403_0201
+    );
+    rotr_64!(rotate_each_word_right11, 11);
+    rotr_64!(rotate_each_word_right12, 12);
+    rotr_64_s3!(
+        rotate_each_word_right16,
+        0x0908_0f0e_0d0c_0b0a,
+        0x0100_0706_0504_0302
+    );
+    rotr_64!(rotate_each_word_right20, 20);
+    rotr_64_s3!(
+        rotate_each_word_right24,
+        0x0a09_080f_0e0d_0c0b,
+        0x0201_0007_0605_0403
+    );
+    rotr_64!(rotate_each_word_right25, 25);
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
+    rotr_64!(rotate_each_word_right7, 7);
+    rotr_64!(rotate_each_word_right8, 8);
+    rotr_64!(rotate_each_word_right11, 11);
+    rotr_64!(rotate_each_word_right12, 12);
+    #[inline(always)]
+    fn rotate_each_word_right16(self) -> Self {
+        Self::new(swap16_s2(self.x))
+    }
+    rotr_64!(rotate_each_word_right20, 20);
+    rotr_64!(rotate_each_word_right24, 24);
+    rotr_64!(rotate_each_word_right25, 25);
+}
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn rotate_each_word_right32(self) -> Self {
+        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
+    }
+}
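+
+// Rotating every 64-bit word by 32 bits is just swapping its two 32-bit halves;
+// the `_mm_shuffle_epi32` immediate 0b1011_0001 selects dwords (1, 0, 3, 2) and
+// does exactly that in one instruction.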
+
+macro_rules! rotr_128 {
+    ($name:ident, $i:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_si128(self.x, $i as i32),
+                    _mm_slli_si128(self.x, 128 - $i as i32),
+                )
+            })
+        }
+    };
+}
+// TODO: completely unoptimized
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
+    rotr_128!(rotate_each_word_right7, 7);
+    rotr_128!(rotate_each_word_right8, 8);
+    rotr_128!(rotate_each_word_right11, 11);
+    rotr_128!(rotate_each_word_right12, 12);
+    rotr_128!(rotate_each_word_right16, 16);
+    rotr_128!(rotate_each_word_right20, 20);
+    rotr_128!(rotate_each_word_right24, 24);
+    rotr_128!(rotate_each_word_right25, 25);
+}
+// TODO: completely unoptimized
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
+    rotr_128!(rotate_each_word_right32, 32);
+}
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
+
+def_vec!(u32x4_sse2, u32);
+def_vec!(u64x2_sse2, u64);
+def_vec!(u128x1_sse2, u128);
+
+impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u32; 4] {
+        unsafe {
+            let x = _mm_cvtsi128_si64(self.x) as u64;
+            let y = _mm_extract_epi64(self.x, 1) as u64;
+            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u32; 4]) -> Self {
+        unsafe {
+            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
+            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
+            Self::new(x)
+        }
+    }
+}
+impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u32; 4] {
+        unsafe {
+            let x = _mm_cvtsi128_si64(self.x) as u64;
+            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
+            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u32; 4]) -> Self {
+        unsafe {
+            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
+            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
+            let x = _mm_cvtsi64_si128(x);
+            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
+            Self::new(_mm_or_si128(x, y))
+        }
+    }
+}
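+
+// With SSE4.1 (`YesS4`) lanes move in and out via `_mm_extract_epi64` /
+// `_mm_insert_epi64`; the `NoS4` impl above rebuilds the same `[u32; 4]` view
+// from SSE2-only moves (`_mm_cvtsi128_si64` / `_mm_cvtsi64_si128`) plus
+// shuffles and byte shifts. The `u64x2` impls below follow the same split.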
+impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 2] {
+        unsafe {
+            [
+                _mm_cvtsi128_si64(self.x) as u64,
+                _mm_extract_epi64(self.x, 1) as u64,
+            ]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 2]) -> Self {
+        unsafe {
+            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
+            x = _mm_insert_epi64(x, xs[1] as i64, 1);
+            Self::new(x)
+        }
+    }
+}
+impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 2] {
+        unsafe {
+            [
+                _mm_cvtsi128_si64(self.x) as u64,
+                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
+            ]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 2]) -> Self {
+        unsafe {
+            let x = _mm_cvtsi64_si128(xs[0] as i64);
+            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
+            Self::new(_mm_or_si128(x, y))
+        }
+    }
+}
+impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u128; 1] {
+        unimplemented!()
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u128; 1]) -> Self {
+        unimplemented!("{:?}", xs)
+    }
+}
+
+impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
+{
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 4] {
+        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
+        [a[0], a[1], b[0], b[1]]
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 4]) -> Self {
+        let (a, b) = (
+            u64x2_sse2::from_lanes([xs[0], xs[1]]),
+            u64x2_sse2::from_lanes([xs[2], xs[3]]),
+        );
+        x2::new([a, b])
+    }
+}
+
+macro_rules! impl_into {
+    ($from:ident, $to:ident) => {
+        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
+            #[inline(always)]
+            fn from(x: $from<S3, S4, NI>) -> Self {
+                $to::new(x.x)
+            }
+        }
+    };
+}
+
+impl_into!(u128x1_sse2, u32x4_sse2);
+impl_into!(u128x1_sse2, u64x2_sse2);
+
+impl_bitops32!(u32x4_sse2);
+impl_bitops64!(u64x2_sse2);
+impl_bitops128!(u128x1_sse2);
+
+impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
+    u32x4_sse2<S3, S4, NI>: BSwap
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
+    u64x2_sse2<S3, S4, NI>: BSwap
+{
+}
+impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
+impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
+impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
+impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
+
+impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
+where
+    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
+    Machine86<S3, S4, NI>: Machine,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>:
+        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
+    Machine86<S3, S4, NI>: Machine,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
+where
+    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
+{
+}
+
+impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
+where
+    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
+    Machine86<YesS3, YesS4, NI>: Machine,
+{
+}
+impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
+where
+    u64x2_sse2<YesS3, YesS4, NI>:
+        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
+    Machine86<YesS3, YesS4, NI>: Machine,
+{
+}
+impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
+where
+    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<YesS3, YesS4, NI>: Machine,
+    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
+    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
+{
+}
+
+impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
+        Self::new(_mm_set_epi32(
+            xs[3] as i32,
+            xs[2] as i32,
+            xs[1] as i32,
+            xs[0] as i32,
+        ))
+    }
+}
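+
+// `_mm_set_epi32` takes its arguments from the most-significant element down,
+// so passing xs[3]..xs[0] places xs[0] in the lowest 32-bit lane.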
+
+impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
+where
+    Self: MultiLane<[u32; 4]>,
+{
+    #[inline(always)]
+    fn extract(self, i: u32) -> u32 {
+        self.to_lanes()[i as usize]
+    }
+    #[inline(always)]
+    fn insert(self, v: u32, i: u32) -> Self {
+        Self::new(unsafe {
+            match i {
+                0 => _mm_insert_epi32(self.x, v as i32, 0),
+                1 => _mm_insert_epi32(self.x, v as i32, 1),
+                2 => _mm_insert_epi32(self.x, v as i32, 2),
+                3 => _mm_insert_epi32(self.x, v as i32, 3),
+                _ => unreachable!(),
+            }
+        })
+    }
+}
+impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
+where
+    Self: MultiLane<[u32; 4]>,
+{
+    #[inline(always)]
+    fn extract(self, i: u32) -> u32 {
+        self.to_lanes()[i as usize]
+    }
+    #[inline(always)]
+    fn insert(self, v: u32, i: u32) -> Self {
+        Self::new(unsafe {
+            match i {
+                0 => {
+                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
+                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
+                }
+                1 => {
+                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
+                    x = _mm_slli_si128(x, 4);
+                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
+                    _mm_shuffle_epi32(x, 0b1110_0001)
+                }
+                2 => {
+                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
+                    x = _mm_slli_si128(x, 4);
+                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
+                    _mm_shuffle_epi32(x, 0b1100_1001)
+                }
+                3 => {
+                    let mut x = _mm_slli_si128(self.x, 4);
+                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
+                    _mm_shuffle_epi32(x, 0b0011_1001)
+                }
+                _ => unreachable!(),
+            }
+        })
+    }
+}
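+
+// Without SSE4.1's `_mm_insert_epi32`, insertion, roughly speaking, shuffles
+// the vector so the target position becomes the low dword, shifts the remaining
+// lanes up, ORs in the new value via `_mm_cvtsi32_si128`, and shuffles the
+// lanes back into order; lane 0 instead clears the low dword with an andnot
+// mask first.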
+
+impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn shuffle_lane_words2301(self) -> Self {
+        self.shuffle2301()
+    }
+    #[inline(always)]
+    fn shuffle_lane_words1230(self) -> Self {
+        self.shuffle1230()
+    }
+    #[inline(always)]
+    fn shuffle_lane_words3012(self) -> Self {
+        self.shuffle3012()
+    }
+}
+
+impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn shuffle2301(self) -> Self {
+        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
+    }
+    #[inline(always)]
+    fn shuffle1230(self) -> Self {
+        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
+    }
+    #[inline(always)]
+    fn shuffle3012(self) -> Self {
+        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
+    }
+}
+
+impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn shuffle2301(self) -> Self {
+        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
+    }
+    #[inline(always)]
+    fn shuffle3012(self) -> Self {
+        unsafe {
+            x2::new([
+                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
+                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
+            ])
+        }
+    }
+    #[inline(always)]
+    fn shuffle1230(self) -> Self {
+        unsafe {
+            x2::new([
+                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
+                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
+            ])
+        }
+    }
+}
+impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn shuffle2301(self) -> Self {
+        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
+    }
+    #[inline(always)]
+    fn shuffle3012(self) -> Self {
+        unsafe {
+            let a = _mm_srli_si128(self.0[0].x, 8);
+            let b = _mm_slli_si128(self.0[0].x, 8);
+            let c = _mm_srli_si128(self.0[1].x, 8);
+            let d = _mm_slli_si128(self.0[1].x, 8);
+            let da = _mm_or_si128(d, a);
+            let bc = _mm_or_si128(b, c);
+            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
+        }
+    }
+    #[inline(always)]
+    fn shuffle1230(self) -> Self {
+        unsafe {
+            let a = _mm_srli_si128(self.0[0].x, 8);
+            let b = _mm_slli_si128(self.0[0].x, 8);
+            let c = _mm_srli_si128(self.0[1].x, 8);
+            let d = _mm_slli_si128(self.0[1].x, 8);
+            let da = _mm_or_si128(d, a);
+            let bc = _mm_or_si128(b, c);
+            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
+        }
+    }
+}
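+
+// The `NoS3` shuffles above emulate SSSE3's `_mm_alignr_epi8` by building each
+// crossing 128-bit result from a byte-wise right shift of one half OR'd with a
+// left shift of the other.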
+
+impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
+    #[inline(always)]
+    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
+        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
+    }
+}
+
+impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
+    #[inline(always)]
+    fn extract(self, i: u32) -> u64 {
+        unsafe {
+            match i {
+                0 => _mm_cvtsi128_si64(self.x) as u64,
+                1 => _mm_extract_epi64(self.x, 1) as u64,
+                _ => unreachable!(),
+            }
+        }
+    }
+    #[inline(always)]
+    fn insert(self, x: u64, i: u32) -> Self {
+        Self::new(unsafe {
+            match i {
+                0 => _mm_insert_epi64(self.x, x as i64, 0),
+                1 => _mm_insert_epi64(self.x, x as i64, 1),
+                _ => unreachable!(),
+            }
+        })
+    }
+}
+impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
+    #[inline(always)]
+    fn extract(self, i: u32) -> u64 {
+        unsafe {
+            match i {
+                0 => _mm_cvtsi128_si64(self.x) as u64,
+                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
+                _ => unreachable!(),
+            }
+        }
+    }
+    #[inline(always)]
+    fn insert(self, x: u64, i: u32) -> Self {
+        Self::new(unsafe {
+            match i {
+                0 => _mm_or_si128(
+                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
+                    _mm_cvtsi64_si128(x as i64),
+                ),
+                1 => _mm_or_si128(
+                    _mm_move_epi64(self.x),
+                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
+                ),
+                _ => unreachable!(),
+            }
+        })
+    }
+}
+
+impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(unsafe {
+            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+}
+#[inline(always)]
+fn bswap32_s2(x: __m128i) -> __m128i {
+    unsafe {
+        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
+        y = _mm_shufflehi_epi16(y, 0b0001_1011);
+        y = _mm_shufflelo_epi16(y, 0b0001_1011);
+        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
+        z = _mm_shufflehi_epi16(z, 0b0001_1011);
+        z = _mm_shufflelo_epi16(z, 0b0001_1011);
+        _mm_packus_epi16(y, z)
+    }
+}
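+
+// `bswap32_s2` byte-swaps each 32-bit word using only SSE2: the bytes are
+// widened to 16-bit lanes (unpack against zero), every group of four 16-bit
+// lanes is reversed with shufflelo/shufflehi (immediate 0b00_01_10_11), and the
+// halves are re-packed with `_mm_packus_epi16`, which is lossless since every
+// lane is <= 0xff. Per lane this matches `u32::swap_bytes`; it backs the
+// SSE2-only (`NoS3`) BSwap impls below.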
+impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(bswap32_s2(self.x))
+    }
+}
+
+impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(unsafe {
+            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+}
+impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
+    }
+}
+
+impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(unsafe {
+            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+}
+impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        unimplemented!()
+    }
+}
+
+macro_rules! swapi {
+    ($x:expr, $i:expr, $k:expr) => {
+        unsafe {
+            const K: u8 = $k;
+            let k = _mm_set1_epi8(K as i8);
+            u128x1_sse2::new(_mm_or_si128(
+                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
+                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
+            ))
+        }
+    };
+}
+#[inline(always)]
+fn swap16_s2(x: __m128i) -> __m128i {
+    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
+}
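+
+// `swapi!` swaps adjacent $i-bit fields: the mask picks out the upper field of
+// each pair and shifts it down, while the lower field is shifted up behind the
+// same mask (0xaa/1 swaps bits, 0xcc/2 swaps bit pairs, 0xf0/4 swaps nibbles,
+// all within each byte). `swap16_s2` swaps the 16-bit halves of every 32-bit
+// word using only SSE2 word shuffles; both back the Swap64 impls below.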
+impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn swap1(self) -> Self {
+        swapi!(self, 1, 0xaa)
+    }
+    #[inline(always)]
+    fn swap2(self) -> Self {
+        swapi!(self, 2, 0xcc)
+    }
+    #[inline(always)]
+    fn swap4(self) -> Self {
+        swapi!(self, 4, 0xf0)
+    }
+    #[inline(always)]
+    fn swap8(self) -> Self {
+        u128x1_sse2::new(unsafe {
+            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+    #[inline(always)]
+    fn swap16(self) -> Self {
+        u128x1_sse2::new(unsafe {
+            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+    #[inline(always)]
+    fn swap32(self) -> Self {
+        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
+    }
+    #[inline(always)]
+    fn swap64(self) -> Self {
+        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
+    }
+}
+impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn swap1(self) -> Self {
+        swapi!(self, 1, 0xaa)
+    }
+    #[inline(always)]
+    fn swap2(self) -> Self {
+        swapi!(self, 2, 0xcc)
+    }
+    #[inline(always)]
+    fn swap4(self) -> Self {
+        swapi!(self, 4, 0xf0)
+    }
+    #[inline(always)]
+    fn swap8(self) -> Self {
+        u128x1_sse2::new(unsafe {
+            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
+        })
+    }
+    #[inline(always)]
+    fn swap16(self) -> Self {
+        u128x1_sse2::new(swap16_s2(self.x))
+    }
+    #[inline(always)]
+    fn swap32(self) -> Self {
+        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
+    }
+    #[inline(always)]
+    fn swap64(self) -> Self {
+        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct G0;
+#[derive(Copy, Clone)]
+pub struct G1;
+
+#[allow(non_camel_case_types)]
+pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
+#[allow(non_camel_case_types)]
+pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
+#[allow(non_camel_case_types)]
+pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
+#[allow(non_camel_case_types)]
+pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;
+
+#[allow(non_camel_case_types)]
+pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
+#[allow(non_camel_case_types)]
+pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
+#[allow(non_camel_case_types)]
+pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
+
+impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn to_scalars(self) -> [u32; 16] {
+        unsafe { core::mem::transmute(self) }
+    }
+}
+
+impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
+where
+    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
+    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
+    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
+where
+    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
+    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
+    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
+    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
+    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
+{
+}
+
+impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
+where
+    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
+    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
+{
+}
+impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
+where
+    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
+    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
+{
+}
+impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
+where
+    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
+{
+}
+impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
+where
+    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
+    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
+    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
+    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
+    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
+{
+}
+
+impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
+{
+    #[inline(always)]
+    fn extract(self, i: u32) -> u64 {
+        match i {
+            0 => self.0[0].extract(0),
+            1 => self.0[0].extract(1),
+            2 => self.0[1].extract(0),
+            3 => self.0[1].extract(1),
+            _ => panic!(),
+        }
+    }
+    #[inline(always)]
+    fn insert(mut self, w: u64, i: u32) -> Self {
+        match i {
+            0 => self.0[0] = self.0[0].insert(w, 0),
+            1 => self.0[0] = self.0[0].insert(w, 1),
+            2 => self.0[1] = self.0[1].insert(w, 0),
+            3 => self.0[1] = self.0[1].insert(w, 1),
+            _ => panic!(),
+        };
+        self
+    }
+}
+
+impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
+where
+    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
+    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
+    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
+where
+    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
+    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
+    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
+    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
+{
+}
+
+impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
+where
+    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
+    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
+{
+}
+impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
+where
+    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
+    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
+    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
+    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
+{
+}
+
+macro_rules! impl_into_x {
+    ($from:ident, $to:ident) => {
+        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
+            for x2<$to<S3, S4, NI>, Gt>
+        {
+            #[inline(always)]
+            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
+                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
+            }
+        }
+        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
+            #[inline(always)]
+            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
+                x4::new([
+                    $to::from(x.0[0]),
+                    $to::from(x.0[1]),
+                    $to::from(x.0[2]),
+                    $to::from(x.0[3]),
+                ])
+            }
+        }
+    };
+}
+impl_into_x!(u128x1_sse2, u64x2_sse2);
+impl_into_x!(u128x1_sse2, u32x4_sse2);
+
+///// Debugging
+
+use core::fmt::{Debug, Formatter, Result};
+
+impl<W: PartialEq, G> PartialEq for x2<W, G> {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
+    }
+}
+
+#[allow(unused)]
+#[inline(always)]
+unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
+    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
+    _mm_cvtsi128_si64(q) == -1
+}
+
+#[inline(always)]
+unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
+    let q = _mm_cmpeq_epi32(x, y);
+    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
+    let q = _mm_cvtsi128_si64(q);
+    (p & q) == -1
+}
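+
+// `eq128_s2` compares all four 32-bit lanes with `_mm_cmpeq_epi32` (each equal
+// lane becomes all-ones) and then requires both 64-bit halves of the mask to be
+// -1; `eq128_s4` is the SSE4.1 variant built on `_mm_cmpeq_epi64`.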
+
+impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { eq128_s2(self.x, rhs.x) }
+    }
+}
+impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
+where
+    Self: Copy + MultiLane<[u32; 4]>,
+{
+    #[cold]
+    fn fmt(&self, fmt: &mut Formatter) -> Result {
+        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
+    }
+}
+
+impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { eq128_s2(self.x, rhs.x) }
+    }
+}
+impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
+where
+    Self: Copy + MultiLane<[u64; 2]>,
+{
+    #[cold]
+    fn fmt(&self, fmt: &mut Formatter) -> Result {
+        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
+    }
+}
+
+impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
+{
+    #[cold]
+    fn fmt(&self, fmt: &mut Formatter) -> Result {
+        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
+        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
+    }
+}
+
+#[cfg(test)]
+#[cfg(target_arch = "x86_64")]
+mod test {
+    use super::*;
+    use crate::x86_64::{SSE2, SSE41, SSSE3};
+    use crate::Machine;
+
+    #[test]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
+    fn test_bswap32_s2_vs_s3() {
+        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
+        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+            x_s2.bswap()
+        };
+
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
+            x_s3.bswap()
+        };
+
+        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s2, s2.vec(ys));
+    }
+
+    #[test]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
+    fn test_bswap64_s2_vs_s3() {
+        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
+        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
+            x_s2.bswap()
+        };
+
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
+            x_s3.bswap()
+        };
+
+        assert_eq!(x_s2, s2.vec(ys));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+    }
+
+    #[test]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
+    fn test_shuffle32_s2_vs_s3() {
+        let xs = [0x0, 0x1, 0x2, 0x3];
+        let ys = [0x2, 0x3, 0x0, 0x1];
+        let zs = [0x1, 0x2, 0x3, 0x0];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+            x_s2.shuffle2301()
+        };
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
+            x_s3.shuffle2301()
+        };
+        assert_eq!(x_s2, s2.vec(ys));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+            x_s2.shuffle3012()
+        };
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
+            x_s3.shuffle3012()
+        };
+        assert_eq!(x_s2, s2.vec(zs));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+
+        let x_s2 = x_s2.shuffle1230();
+        let x_s3 = x_s3.shuffle1230();
+        assert_eq!(x_s2, s2.vec(xs));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+    }
+
+    #[test]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
+    fn test_shuffle64_s2_vs_s3() {
+        let xs = [0x0, 0x1, 0x2, 0x3];
+        let ys = [0x2, 0x3, 0x0, 0x1];
+        let zs = [0x1, 0x2, 0x3, 0x0];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
+            x_s2.shuffle2301()
+        };
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
+            x_s3.shuffle2301()
+        };
+        assert_eq!(x_s2, s2.vec(ys));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
+            x_s2.shuffle3012()
+        };
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
+            x_s3.shuffle3012()
+        };
+        assert_eq!(x_s2, s2.vec(zs));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+
+        let x_s2 = x_s2.shuffle1230();
+        let x_s3 = x_s3.shuffle1230();
+        assert_eq!(x_s2, s2.vec(xs));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+    }
+
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
+    #[test]
+    fn test_lanes_u32x4() {
+        let xs = [0x1, 0x2, 0x3, 0x4];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+        let s4 = unsafe { SSE41::instance() };
+
+        {
+            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
+            assert_eq!(x_s2, y_s2);
+            assert_eq!(xs, y_s2.to_lanes());
+        }
+
+        {
+            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
+            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
+            assert_eq!(x_s3, y_s3);
+            assert_eq!(xs, y_s3.to_lanes());
+        }
+
+        {
+            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
+            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
+            assert_eq!(x_s4, y_s4);
+            assert_eq!(xs, y_s4.to_lanes());
+        }
+    }
+
+    #[test]
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
+    fn test_lanes_u64x2() {
+        let xs = [0x1, 0x2];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+        let s4 = unsafe { SSE41::instance() };
+
+        {
+            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
+            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
+            assert_eq!(x_s2, y_s2);
+            assert_eq!(xs, y_s2.to_lanes());
+        }
+
+        {
+            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
+            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
+            assert_eq!(x_s3, y_s3);
+            assert_eq!(xs, y_s3.to_lanes());
+        }
+
+        {
+            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
+            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
+            assert_eq!(x_s4, y_s4);
+            assert_eq!(xs, y_s4.to_lanes());
+        }
+    }
+
+    #[test]
+    fn test_vec4_u32x4_s2() {
+        let xs = [1, 2, 3, 4];
+        let s2 = unsafe { SSE2::instance() };
+        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+        assert_eq!(x_s2.extract(0), 1);
+        assert_eq!(x_s2.extract(1), 2);
+        assert_eq!(x_s2.extract(2), 3);
+        assert_eq!(x_s2.extract(3), 4);
+        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
+        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
+        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
+        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
+    }
+
+    #[test]
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
+    fn test_vec4_u32x4_s4() {
+        let xs = [1, 2, 3, 4];
+        let s4 = unsafe { SSE41::instance() };
+        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
+        assert_eq!(x_s4.extract(0), 1);
+        assert_eq!(x_s4.extract(1), 2);
+        assert_eq!(x_s4.extract(2), 3);
+        assert_eq!(x_s4.extract(3), 4);
+        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
+        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
+        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
+        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
+    }
+
+    #[test]
+    fn test_vec2_u64x2_s2() {
+        let xs = [0x1, 0x2];
+        let s2 = unsafe { SSE2::instance() };
+        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
+        assert_eq!(x_s2.extract(0), 1);
+        assert_eq!(x_s2.extract(1), 2);
+        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
+        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
+    }
+
+    #[test]
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
+    fn test_vec4_u64x2_s4() {
+        let xs = [0x1, 0x2];
+        let s4 = unsafe { SSE41::instance() };
+        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
+        assert_eq!(x_s4.extract(0), 1);
+        assert_eq!(x_s4.extract(1), 2);
+        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
+        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
+    }
+}
+
+pub mod avx2 {
+    #![allow(non_camel_case_types)]
+    use crate::soft::{x2, x4};
+    use crate::types::*;
+    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
+    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
+    use core::arch::x86_64::*;
+    use core::marker::PhantomData;
+    use core::ops::*;
+
+    #[derive(Copy, Clone)]
+    pub struct u32x4x2_avx2<NI> {
+        x: __m256i,
+        ni: PhantomData<NI>,
+    }
+
+    impl<NI> u32x4x2_avx2<NI> {
+        #[inline(always)]
+        fn new(x: __m256i) -> Self {
+            Self { x, ni: PhantomData }
+        }
+    }
+
+    impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
+        #[inline(always)]
+        unsafe fn unpack(p: vec256_storage) -> Self {
+            Self::new(p.avx)
+        }
+    }
+    impl<NI> StoreBytes for u32x4x2_avx2<NI> {
+        #[inline(always)]
+        unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+            assert_eq!(input.len(), 32);
+            Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
+        }
+        #[inline(always)]
+        unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+            Self::unsafe_read_le(input).bswap()
+        }
+        #[inline(always)]
+        fn write_le(self, out: &mut [u8]) {
+            unsafe {
+                assert_eq!(out.len(), 32);
+                _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
+            }
+        }
+        #[inline(always)]
+        fn write_be(self, out: &mut [u8]) {
+            self.bswap().write_le(out)
+        }
+    }
+    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
+        #[inline(always)]
+        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
+            unsafe {
+                [
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
+                ]
+            }
+        }
+        #[inline(always)]
+        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
+            Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
+        }
+    }
+    impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
+        #[inline(always)]
+        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
+            unsafe {
+                match i {
+                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
+                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
+                    _ => panic!(),
+                }
+            }
+        }
+        #[inline(always)]
+        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
+            Self::new(unsafe {
+                match i {
+                    0 => _mm256_inserti128_si256(self.x, w.x, 0),
+                    1 => _mm256_inserti128_si256(self.x, w.x, 1),
+                    _ => panic!(),
+                }
+            })
+        }
+    }
+    impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
+    macro_rules! shuf_lane_bytes {
+        ($name:ident, $k0:expr, $k1:expr) => {
+            #[inline(always)]
+            fn $name(self) -> Self {
+                Self::new(unsafe {
+                    _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
+                })
+            }
+        };
+    }
+    macro_rules! rotr_32 {
+        ($name:ident, $i:expr) => {
+            #[inline(always)]
+            fn $name(self) -> Self {
+                Self::new(unsafe {
+                    _mm256_or_si256(
+                        _mm256_srli_epi32(self.x, $i as i32),
+                        _mm256_slli_epi32(self.x, 32 - $i as i32),
+                    )
+                })
+            }
+        };
+    }
+    impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
+        rotr_32!(rotate_each_word_right7, 7);
+        shuf_lane_bytes!(
+            rotate_each_word_right8,
+            0x0c0f_0e0d_080b_0a09,
+            0x0407_0605_0003_0201
+        );
+        rotr_32!(rotate_each_word_right11, 11);
+        rotr_32!(rotate_each_word_right12, 12);
+        shuf_lane_bytes!(
+            rotate_each_word_right16,
+            0x0d0c_0f0e_0908_0b0a,
+            0x0504_0706_0100_0302
+        );
+        rotr_32!(rotate_each_word_right20, 20);
+        shuf_lane_bytes!(
+            rotate_each_word_right24,
+            0x0e0d_0c0f_0a09_080b,
+            0x0605_0407_0201_0003
+        );
+        rotr_32!(rotate_each_word_right25, 25);
+    }
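+
+    // These mirror the 128-bit rotations earlier in this file: `shuf_lane_bytes`
+    // handles byte-aligned rotations with one `_mm256_shuffle_epi8` (the 128-bit
+    // index constant duplicated into both lanes), and `rotr_32` covers the
+    // remaining amounts with the shift-and-OR form on 256-bit registers.
+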
+    impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
+        #[inline(always)]
+        fn from(x: u32x4x2_avx2<NI>) -> Self {
+            Self { avx: x.x }
+        }
+    }
+
+    macro_rules! impl_assign {
+        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
+            impl<NI> $Assign for $vec<NI>
+            where
+                NI: Copy,
+            {
+                #[inline(always)]
+                fn $assign_fn(&mut self, rhs: Self) {
+                    *self = self.$bin_fn(rhs);
+                }
+            }
+        };
+    }
+    impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
+    impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
+    impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
+    impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);
+
+    macro_rules! impl_bitop {
+        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
+            impl<NI> $Op for $vec<NI> {
+                type Output = Self;
+                #[inline(always)]
+                fn $op_fn(self, rhs: Self) -> Self::Output {
+                    Self::new(unsafe { $impl_fn(self.x, rhs.x) })
+                }
+            }
+        };
+    }
+    impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
+    impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
+    impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
+    impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
+    impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);
+
+    impl<NI> Not for u32x4x2_avx2<NI> {
+        type Output = Self;
+        #[inline(always)]
+        fn not(self) -> Self::Output {
+            unsafe {
+                let f = _mm256_set1_epi8(-0x7f);
+                Self::new(f) ^ self
+            }
+        }
+    }
+
+    impl<NI> BSwap for u32x4x2_avx2<NI> {
+        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
+    }
+
+    impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
+    where
+        NI: Copy,
+    {
+        #[inline(always)]
+        fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
+            Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
+        }
+    }
+
+    impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
+        #[inline(always)]
+        fn shuffle_lane_words1230(self) -> Self {
+            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words2301(self) -> Self {
+            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words3012(self) -> Self {
+            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
+        }
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////////////////
+
+    pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
+    impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}
+
+    impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        unsafe fn unpack(p: vec512_storage) -> Self {
+            Self::new([
+                u32x4x2_avx2::unpack(p.avx[0]),
+                u32x4x2_avx2::unpack(p.avx[1]),
+            ])
+        }
+    }
+    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
+            let [a, b] = self.0[0].to_lanes();
+            let [c, d] = self.0[1].to_lanes();
+            [a, b, c, d]
+        }
+        #[inline(always)]
+        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
+            let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
+            let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
+            Self::new([ab, cd])
+        }
+    }
+    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
+            match i {
+                0 => self.0[0].extract(0),
+                1 => self.0[0].extract(1),
+                2 => self.0[1].extract(0),
+                3 => self.0[1].extract(1),
+                _ => panic!(),
+            }
+        }
+        #[inline(always)]
+        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
+            Self::new(match i {
+                0 | 1 => [self.0[0].insert(w, i), self.0[1]],
+                2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
+                _ => panic!(),
+            })
+        }
+    }
+    impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
+            /*
+             * a00:a01 a10:a11
+             * b00:b01 b10:b11
+             * c00:c01 c10:c11
+             * d00:d01 d10:d11
+             *       =>
+             * a00:b00 c00:d00
+             * a01:b01 c01:d01
+             * a10:b10 c10:d10
+             * a11:b11 c11:d11
+             */
+            unsafe {
+                let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
+                let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
+                let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
+                let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
+                let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
+                let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
+                let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
+                let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
+                (
+                    Self::new([ab00, cd00]),
+                    Self::new([ab01, cd01]),
+                    Self::new([ab10, cd10]),
+                    Self::new([ab11, cd11]),
+                )
+            }
+        }
+    }
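+
+    // `_mm256_permute2x128_si256` with immediate 0x20 concatenates the low
+    // 128-bit halves of its two operands and 0x31 the high halves, so each
+    // output above collects the matching 128-bit block from a, b, c and d,
+    // realising the block transpose sketched in the comment.
+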
+    impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn to_scalars(self) -> [u32; 16] {
+            unsafe { core::mem::transmute(self) }
+        }
+    }
+    impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
+        #[inline(always)]
+        fn from(x: u32x4x4_avx2<NI>) -> Self {
+            Self {
+                avx: [
+                    vec256_storage { avx: x.0[0].x },
+                    vec256_storage { avx: x.0[1].x },
+                ],
+            }
+        }
+    }
+    impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
+            Self::new(unsafe {
+                [
+                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
+                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
+                ]
+            })
+        }
+    }
+}
+
+
\ No newline at end of file
--
cgit v1.2.3-70-g09d2