Source file src/simd/archsimd/internal/simd_test/simd_test.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && amd64
     6  
     7  package simd_test
     8  
     9  import (
    10  	"reflect"
    11  	"simd/archsimd"
    12  	"slices"
    13  	"testing"
    14  )
    15  
    16  var sink any
    17  
    18  func TestType(t *testing.T) {
    19  	// Testing:
    20  	// - Defined as another struct's field is ok
    21  	// - Pointer is ok
    22  	// - Type defition is ok
    23  	// - Type alias is ok
    24  	// - Type conversion is ok
    25  	// - Conversion to interface is ok
    26  	type alias = archsimd.Int32x4
    27  	type maskT archsimd.Mask32x4
    28  	type myStruct struct {
    29  		x alias
    30  		y *archsimd.Int32x4
    31  		z maskT
    32  	}
    33  	vals := [4]int32{1, 2, 3, 4}
    34  	v := myStruct{x: archsimd.LoadInt32x4(&vals)}
    35  	// masking elements 1 and 2.
    36  	want := []int32{2, 4, 0, 0}
    37  	y := archsimd.LoadInt32x4(&vals)
    38  	v.y = &y
    39  	sink = y
    40  
    41  	if !archsimd.X86.AVX512GFNI() {
    42  		t.Skip("Test requires X86.AVX512, not available on this hardware")
    43  		return
    44  	}
    45  	v.z = maskT(archsimd.Mask32x4FromBits(0b0011))
    46  	*v.y = v.y.Add(v.x).Masked(archsimd.Mask32x4(v.z))
    47  
    48  	got := [4]int32{}
    49  	v.y.Store(&got)
    50  	checkSlices(t, got[:], want)
    51  }
    52  
    53  func TestUncomparable(t *testing.T) {
    54  	// Test that simd vectors are not comparable
    55  	var x, y any = archsimd.LoadUint32x4(&[4]uint32{1, 2, 3, 4}), archsimd.LoadUint32x4(&[4]uint32{5, 6, 7, 8})
    56  	shouldPanic := func(fn func()) {
    57  		defer func() {
    58  			if recover() == nil {
    59  				panic("did not panic")
    60  			}
    61  		}()
    62  		fn()
    63  	}
    64  	shouldPanic(func() { _ = x == y })
    65  }
    66  
    67  func TestFuncValue(t *testing.T) {
    68  	// Test that simd intrinsic can be used as a function value.
    69  	xv := [4]int32{1, 2, 3, 4}
    70  	yv := [4]int32{5, 6, 7, 8}
    71  	want := []int32{6, 8, 10, 12}
    72  	x := archsimd.LoadInt32x4(&xv)
    73  	y := archsimd.LoadInt32x4(&yv)
    74  	fn := archsimd.Int32x4.Add
    75  	sink = fn
    76  	x = fn(x, y)
    77  	got := [4]int32{}
    78  	x.Store(&got)
    79  	checkSlices(t, got[:], want)
    80  }
    81  
    82  func TestReflectMethod(t *testing.T) {
    83  	// Test that simd intrinsic can be accessed via reflection.
    84  	// NOTE: we don't yet support reflect method.Call.
    85  	xv := [4]int32{1, 2, 3, 4}
    86  	yv := [4]int32{5, 6, 7, 8}
    87  	want := []int32{6, 8, 10, 12}
    88  	x := archsimd.LoadInt32x4(&xv)
    89  	y := archsimd.LoadInt32x4(&yv)
    90  	m, ok := reflect.TypeOf(x).MethodByName("Add")
    91  	if !ok {
    92  		t.Fatal("Add method not found")
    93  	}
    94  	fn := m.Func.Interface().(func(x, y archsimd.Int32x4) archsimd.Int32x4)
    95  	x = fn(x, y)
    96  	got := [4]int32{}
    97  	x.Store(&got)
    98  	checkSlices(t, got[:], want)
    99  }
   100  
   101  func TestVectorConversion(t *testing.T) {
   102  	if !archsimd.X86.AVX512GFNI() {
   103  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   104  		return
   105  	}
   106  	xv := [4]int32{1, 2, 3, 4}
   107  	x := archsimd.LoadInt32x4(&xv)
   108  	xPromoted := x.AsInt64x2()
   109  	xPromotedDemoted := xPromoted.AsInt32x4()
   110  	got := [4]int32{}
   111  	xPromotedDemoted.Store(&got)
   112  	for i := range 4 {
   113  		if xv[i] != got[i] {
   114  			t.Errorf("Result at %d incorrect: want %d, got %d", i, xv[i], got[i])
   115  		}
   116  	}
   117  }
   118  
   119  func TestMaskConversion(t *testing.T) {
   120  	if !archsimd.X86.AVX512GFNI() {
   121  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   122  		return
   123  	}
   124  	x := archsimd.LoadInt32x4Slice([]int32{5, 0, 7, 0})
   125  	mask := archsimd.Int32x4{}.Sub(x).ToMask()
   126  	y := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4}).Add(x).Masked(mask)
   127  	want := [4]int32{6, 0, 10, 0}
   128  	got := make([]int32, 4)
   129  	y.StoreSlice(got)
   130  	checkSlices(t, got[:], want[:])
   131  }
   132  
   133  func TestPermute(t *testing.T) {
   134  	if !archsimd.X86.AVX512() {
   135  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   136  		return
   137  	}
   138  	x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
   139  	indices := []uint64{7, 6, 5, 4, 3, 2, 1, 0}
   140  	want := []int64{8, 7, 6, 5, 4, 3, 2, 1}
   141  	got := make([]int64, 8)
   142  	archsimd.LoadInt64x8Slice(x).Permute(archsimd.LoadUint64x8Slice(indices)).StoreSlice(got)
   143  	checkSlices(t, got, want)
   144  }
   145  
   146  func TestPermuteOrZero(t *testing.T) {
   147  	x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
   148  	indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
   149  	want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
   150  	got := make([]uint8, len(x))
   151  	archsimd.LoadUint8x16Slice(x).PermuteOrZero(archsimd.LoadInt8x16Slice(indices)).StoreSlice(got)
   152  	checkSlices(t, got, want)
   153  }
   154  
   155  func TestConcatPermute(t *testing.T) {
   156  	if !archsimd.X86.AVX512() {
   157  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   158  		return
   159  	}
   160  	x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
   161  	y := []int64{-1, -2, -3, -4, -5, -6, -7, -8}
   162  	indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
   163  	want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
   164  	got := make([]int64, 8)
   165  	archsimd.LoadInt64x8Slice(x).ConcatPermute(archsimd.LoadInt64x8Slice(y), archsimd.LoadUint64x8Slice(indices)).StoreSlice(got)
   166  	checkSlices(t, got, want)
   167  }
   168  
   169  func TestCompress(t *testing.T) {
   170  	if !archsimd.X86.AVX512() {
   171  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   172  		return
   173  	}
   174  	v1234 := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
   175  	v2400 := v1234.Compress(archsimd.Mask32x4FromBits(0b1010))
   176  	got := make([]int32, 4)
   177  	v2400.StoreSlice(got)
   178  	want := []int32{2, 4, 0, 0}
   179  	if !slices.Equal(got, want) {
   180  		t.Errorf("want and got differ, want=%v, got=%v", want, got)
   181  	}
   182  }
   183  
   184  func TestExpand(t *testing.T) {
   185  	if !archsimd.X86.AVX512() {
   186  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   187  		return
   188  	}
   189  	v3400 := archsimd.LoadInt32x4Slice([]int32{3, 4, 0, 0})
   190  	v2400 := v3400.Expand(archsimd.Mask32x4FromBits(0b1010))
   191  	got := make([]int32, 4)
   192  	v2400.StoreSlice(got)
   193  	want := []int32{0, 3, 0, 4}
   194  	if !slices.Equal(got, want) {
   195  		t.Errorf("want and got differ, want=%v, got=%v", want, got)
   196  	}
   197  }
   198  
   199  var testShiftAllVal uint64 = 3
   200  
   201  func TestShiftAll(t *testing.T) {
   202  	got := make([]int32, 4)
   203  	archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(2).StoreSlice(got)
   204  	for _, v := range got {
   205  		if v != 0b1100 {
   206  			t.Errorf("expect 0b1100, got %b", v)
   207  		}
   208  	}
   209  	archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(testShiftAllVal).StoreSlice(got)
   210  	for _, v := range got {
   211  		if v != 0b11000 {
   212  			t.Errorf("expect 0b11000, got %b", v)
   213  		}
   214  	}
   215  }
   216  
   217  func TestSlicesInt8(t *testing.T) {
   218  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   219  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   220  	v := archsimd.LoadInt8x32Slice(a)
   221  	b := make([]int8, 32, 32)
   222  	v.StoreSlice(b)
   223  	checkSlices(t, a, b)
   224  }
   225  
   226  func TestSlicesInt8SetElem(t *testing.T) {
   227  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   228  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   229  	v := archsimd.LoadInt8x16Slice(a)
   230  
   231  	v = v.SetElem(3, 13)
   232  	a[3] = 13
   233  
   234  	b := make([]int8, 16, 16)
   235  	v.StoreSlice(b)
   236  	checkSlices(t, a, b)
   237  }
   238  
   239  func TestSlicesInt8GetElem(t *testing.T) {
   240  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   241  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   242  	v := archsimd.LoadInt8x16Slice(a)
   243  	e := v.GetElem(2)
   244  	if e != a[2] {
   245  		t.Errorf("GetElem(2) = %d != a[2] = %d", e, a[2])
   246  	}
   247  
   248  }
   249  
   250  func TestSlicesInt8TooShortLoad(t *testing.T) {
   251  	defer func() {
   252  		if r := recover(); r != nil {
   253  			t.Logf("Saw EXPECTED panic %v", r)
   254  		} else {
   255  			t.Errorf("Did not see expected panic")
   256  		}
   257  	}()
   258  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   259  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31} // TOO SHORT, should panic
   260  	v := archsimd.LoadInt8x32Slice(a)
   261  	b := make([]int8, 32, 32)
   262  	v.StoreSlice(b)
   263  	checkSlices(t, a, b)
   264  }
   265  
   266  func TestSlicesInt8TooShortStore(t *testing.T) {
   267  	defer func() {
   268  		if r := recover(); r != nil {
   269  			t.Logf("Saw EXPECTED panic %v", r)
   270  		} else {
   271  			t.Errorf("Did not see expected panic")
   272  		}
   273  	}()
   274  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   275  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   276  	v := archsimd.LoadInt8x32Slice(a)
   277  	b := make([]int8, 31) // TOO SHORT, should panic
   278  	v.StoreSlice(b)
   279  	checkSlices(t, a, b)
   280  }
   281  
   282  func TestSlicesFloat64(t *testing.T) {
   283  	a := []float64{1, 2, 3, 4, 5, 6, 7, 8} // too long, should be fine
   284  	v := archsimd.LoadFloat64x4Slice(a)
   285  	b := make([]float64, 4, 4)
   286  	v.StoreSlice(b)
   287  	for i := range b {
   288  		if a[i] != b[i] {
   289  			t.Errorf("a and b differ at index %d, a=%f, b=%f", i, a[i], b[i])
   290  		}
   291  	}
   292  }
   293  
   294  // TODO: try to reduce this test to be smaller.
   295  func TestMergeLocals(t *testing.T) {
   296  	testMergeLocalswrapper(t, archsimd.Int64x4.Add)
   297  }
   298  
   299  //go:noinline
   300  func forceSpill() {}
   301  
   302  func testMergeLocalswrapper(t *testing.T, op func(archsimd.Int64x4, archsimd.Int64x4) archsimd.Int64x4) {
   303  	t.Helper()
   304  	s0 := []int64{0, 1, 2, 3}
   305  	s1 := []int64{-1, 0, -1, 0}
   306  	want := []int64{-1, 1, 1, 3}
   307  	v := archsimd.LoadInt64x4Slice(s0)
   308  	m := archsimd.LoadInt64x4Slice(s1)
   309  	forceSpill()
   310  	got := make([]int64, 4)
   311  	gotv := op(v, m)
   312  	gotv.StoreSlice(got)
   313  	for i := range len(want) {
   314  		if !(got[i] == want[i]) {
   315  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i])
   316  		}
   317  	}
   318  }
   319  
   320  func TestBitMaskFromBits(t *testing.T) {
   321  	if !archsimd.X86.AVX512() {
   322  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   323  		return
   324  	}
   325  	results := [2]int64{}
   326  	want := [2]int64{0, 6}
   327  	m := archsimd.Mask64x2FromBits(0b10)
   328  	archsimd.LoadInt64x2Slice([]int64{1, 2}).Add(archsimd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
   329  	for i := range 2 {
   330  		if results[i] != want[i] {
   331  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
   332  		}
   333  	}
   334  }
   335  
   336  var maskForTestBitMaskFromBitsLoad = uint8(0b10)
   337  
   338  func TestBitMaskFromBitsLoad(t *testing.T) {
   339  	if !archsimd.X86.AVX512() {
   340  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   341  		return
   342  	}
   343  	results := [2]int64{}
   344  	want := [2]int64{0, 6}
   345  	m := archsimd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
   346  	archsimd.LoadInt64x2Slice([]int64{1, 2}).Add(archsimd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
   347  	for i := range 2 {
   348  		if results[i] != want[i] {
   349  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
   350  		}
   351  	}
   352  }
   353  
   354  func TestBitMaskToBits(t *testing.T) {
   355  	if !archsimd.X86.AVX512() {
   356  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   357  		return
   358  	}
   359  	if v := archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits(); v != 0b101 {
   360  		t.Errorf("Want 0b101, got %b", v)
   361  	}
   362  }
   363  
   364  var maskForTestBitMaskFromBitsStore uint8
   365  
   366  func TestBitMaskToBitsStore(t *testing.T) {
   367  	if !archsimd.X86.AVX512() {
   368  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   369  		return
   370  	}
   371  	maskForTestBitMaskFromBitsStore = archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
   372  	if maskForTestBitMaskFromBitsStore != 0b101 {
   373  		t.Errorf("Want 0b101, got %b", maskForTestBitMaskFromBitsStore)
   374  	}
   375  }
   376  
   377  func TestMergeFloat(t *testing.T) {
   378  	k := make([]int64, 4, 4)
   379  	s := make([]float64, 4, 4)
   380  
   381  	a := archsimd.LoadFloat64x4Slice([]float64{1, 2, 3, 4})
   382  	b := archsimd.LoadFloat64x4Slice([]float64{4, 2, 3, 1})
   383  	g := a.Greater(b)
   384  	g.ToInt64x4().StoreSlice(k)
   385  	c := a.Merge(b, g)
   386  
   387  	c.StoreSlice(s)
   388  
   389  	checkSlices[int64](t, k, []int64{0, 0, 0, -1})
   390  	checkSlices[float64](t, s, []float64{4, 2, 3, 4})
   391  }
   392  
   393  func TestMergeFloat512(t *testing.T) {
   394  	if !archsimd.X86.AVX512() {
   395  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   396  		return
   397  	}
   398  
   399  	k := make([]int64, 8, 8)
   400  	s := make([]float64, 8, 8)
   401  
   402  	a := archsimd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
   403  	b := archsimd.LoadFloat64x8Slice([]float64{8, 7, 6, 5, 4, 2, 3, 1})
   404  	g := a.Greater(b)
   405  	g.ToInt64x8().StoreSlice(k)
   406  	c := a.Merge(b, g)
   407  	d := a.Masked(g)
   408  
   409  	checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1})
   410  
   411  	c.StoreSlice(s)
   412  	checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8})
   413  
   414  	d.StoreSlice(s)
   415  	checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8})
   416  }
   417  
   418  var ro uint8 = 2
   419  
   420  func TestRotateAllVariable(t *testing.T) {
   421  	if !archsimd.X86.AVX512() {
   422  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   423  		return
   424  	}
   425  	got := make([]int32, 4)
   426  	archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(ro).StoreSlice(got)
   427  	for _, v := range got {
   428  		if v != 0b1100 {
   429  			t.Errorf("Want 0b1100, got %b", v)
   430  		}
   431  	}
   432  }
   433  
   434  func TestBroadcastUint32x4(t *testing.T) {
   435  	s := make([]uint32, 4, 4)
   436  	archsimd.BroadcastUint32x4(123456789).StoreSlice(s)
   437  	checkSlices(t, s, []uint32{123456789, 123456789, 123456789, 123456789})
   438  }
   439  
   440  func TestBroadcastFloat32x8(t *testing.T) {
   441  	s := make([]float32, 8, 8)
   442  	archsimd.BroadcastFloat32x8(123456789).StoreSlice(s)
   443  	checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
   444  }
   445  
   446  func TestBroadcastFloat64x2(t *testing.T) {
   447  	s := make([]float64, 2, 2)
   448  	archsimd.BroadcastFloat64x2(123456789).StoreSlice(s)
   449  	checkSlices(t, s, []float64{123456789, 123456789})
   450  }
   451  
   452  func TestBroadcastUint64x2(t *testing.T) {
   453  	s := make([]uint64, 2, 2)
   454  	archsimd.BroadcastUint64x2(123456789).StoreSlice(s)
   455  	checkSlices(t, s, []uint64{123456789, 123456789})
   456  }
   457  
   458  func TestBroadcastUint16x8(t *testing.T) {
   459  	s := make([]uint16, 8, 8)
   460  	archsimd.BroadcastUint16x8(12345).StoreSlice(s)
   461  	checkSlices(t, s, []uint16{12345, 12345, 12345, 12345})
   462  }
   463  
   464  func TestBroadcastInt8x32(t *testing.T) {
   465  	s := make([]int8, 32, 32)
   466  	archsimd.BroadcastInt8x32(-123).StoreSlice(s)
   467  	checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
   468  		-123, -123, -123, -123, -123, -123, -123, -123,
   469  		-123, -123, -123, -123, -123, -123, -123, -123,
   470  		-123, -123, -123, -123, -123, -123, -123, -123,
   471  	})
   472  }
   473  
   474  func TestMaskOpt512(t *testing.T) {
   475  	if !archsimd.X86.AVX512() {
   476  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   477  		return
   478  	}
   479  
   480  	k := make([]int64, 8, 8)
   481  	s := make([]float64, 8, 8)
   482  
   483  	a := archsimd.LoadFloat64x8Slice([]float64{2, 0, 2, 0, 2, 0, 2, 0})
   484  	b := archsimd.LoadFloat64x8Slice([]float64{1, 1, 1, 1, 1, 1, 1, 1})
   485  	c := archsimd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
   486  	d := archsimd.LoadFloat64x8Slice([]float64{2, 4, 6, 8, 10, 12, 14, 16})
   487  	g := a.Greater(b)
   488  	e := c.Add(d).Masked(g)
   489  	e.StoreSlice(s)
   490  	g.ToInt64x8().StoreSlice(k)
   491  	checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
   492  	checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
   493  }
   494  
   495  // flattenedTranspose tranposes x and y, regarded as a pair of 2x2
   496  // matrices, but then flattens the rows in order, i.e
   497  // x: ABCD ==> a: A1B2
   498  // y: 1234     b: C3D4
   499  func flattenedTranspose(x, y archsimd.Int32x4) (a, b archsimd.Int32x4) {
   500  	return x.InterleaveLo(y), x.InterleaveHi(y)
   501  }
   502  
   503  func TestFlattenedTranspose(t *testing.T) {
   504  	r := make([]int32, 4, 4)
   505  	s := make([]int32, 4, 4)
   506  
   507  	x := archsimd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
   508  	y := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
   509  	a, b := flattenedTranspose(x, y)
   510  
   511  	a.StoreSlice(r)
   512  	b.StoreSlice(s)
   513  
   514  	checkSlices[int32](t, r, []int32{0xA, 1, 0xB, 2})
   515  	checkSlices[int32](t, s, []int32{0xC, 3, 0xD, 4})
   516  
   517  }
   518  
   519  func TestClearAVXUpperBits(t *testing.T) {
   520  	// Test that ClearAVXUpperBits is safe even if there are SIMD values
   521  	// alive (although usually one should not do this).
   522  	if !archsimd.X86.AVX2() {
   523  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   524  		return
   525  	}
   526  
   527  	r := make([]int64, 4)
   528  	s := make([]int64, 4)
   529  
   530  	x := archsimd.LoadInt64x4Slice([]int64{10, 20, 30, 40})
   531  	y := archsimd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
   532  
   533  	x.Add(y).StoreSlice(r)
   534  	archsimd.ClearAVXUpperBits()
   535  	x.Sub(y).StoreSlice(s)
   536  
   537  	checkSlices[int64](t, r, []int64{11, 22, 33, 44})
   538  	checkSlices[int64](t, s, []int64{9, 18, 27, 36})
   539  }
   540  
   541  func TestLeadingZeros(t *testing.T) {
   542  	if !archsimd.X86.AVX512() {
   543  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   544  		return
   545  	}
   546  
   547  	src := []uint64{0b1111, 0}
   548  	want := []uint64{60, 64}
   549  	got := make([]uint64, 2)
   550  	archsimd.LoadUint64x2Slice(src).LeadingZeros().StoreSlice(got)
   551  	for i := range 2 {
   552  		if want[i] != got[i] {
   553  			t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i])
   554  		}
   555  	}
   556  }
   557  
   558  func TestIsZero(t *testing.T) {
   559  	v1 := archsimd.LoadUint64x2Slice([]uint64{0, 1})
   560  	v2 := archsimd.LoadUint64x2Slice([]uint64{0, 0})
   561  	if v1.IsZero() {
   562  		t.Errorf("Result incorrect, want false, got true")
   563  	}
   564  	if !v2.IsZero() {
   565  		t.Errorf("Result incorrect, want true, got false")
   566  	}
   567  	if !v1.And(v2).IsZero() {
   568  		t.Errorf("Result incorrect, want true, got false")
   569  	}
   570  	if v1.AndNot(v2).IsZero() {
   571  		t.Errorf("Result incorrect, want false, got true")
   572  	}
   573  	if !v2.And(v1).IsZero() {
   574  		t.Errorf("Result incorrect, want true, got false")
   575  	}
   576  	if !v2.AndNot(v1).IsZero() {
   577  		t.Errorf("Result incorrect, want true, got false")
   578  	}
   579  }
   580  
   581  func TestSelect4FromPairConst(t *testing.T) {
   582  	x := archsimd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
   583  	y := archsimd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
   584  
   585  	llll := x.SelectFromPair(0, 1, 2, 3, y)
   586  	hhhh := x.SelectFromPair(4, 5, 6, 7, y)
   587  	llhh := x.SelectFromPair(0, 1, 6, 7, y)
   588  	hhll := x.SelectFromPair(6, 7, 0, 1, y)
   589  
   590  	lllh := x.SelectFromPair(0, 1, 2, 7, y)
   591  	llhl := x.SelectFromPair(0, 1, 7, 2, y)
   592  	lhll := x.SelectFromPair(0, 7, 1, 2, y)
   593  	hlll := x.SelectFromPair(7, 0, 1, 2, y)
   594  
   595  	hhhl := x.SelectFromPair(4, 5, 6, 0, y)
   596  	hhlh := x.SelectFromPair(4, 5, 0, 6, y)
   597  	hlhh := x.SelectFromPair(4, 0, 5, 6, y)
   598  	lhhh := x.SelectFromPair(0, 4, 5, 6, y)
   599  
   600  	lhlh := x.SelectFromPair(0, 4, 1, 5, y)
   601  	hlhl := x.SelectFromPair(4, 0, 5, 1, y)
   602  	lhhl := x.SelectFromPair(0, 4, 5, 1, y)
   603  	hllh := x.SelectFromPair(4, 0, 1, 5, y)
   604  
   605  	r := make([]int32, 4, 4)
   606  
   607  	foo := func(v archsimd.Int32x4, a, b, c, d int32) {
   608  		v.StoreSlice(r)
   609  		checkSlices[int32](t, r, []int32{a, b, c, d})
   610  	}
   611  
   612  	foo(llll, 0, 1, 2, 3)
   613  	foo(hhhh, 4, 5, 6, 7)
   614  	foo(llhh, 0, 1, 6, 7)
   615  	foo(hhll, 6, 7, 0, 1)
   616  
   617  	foo(lllh, 0, 1, 2, 7)
   618  	foo(llhl, 0, 1, 7, 2)
   619  	foo(lhll, 0, 7, 1, 2)
   620  	foo(hlll, 7, 0, 1, 2)
   621  
   622  	foo(hhhl, 4, 5, 6, 0)
   623  	foo(hhlh, 4, 5, 0, 6)
   624  	foo(hlhh, 4, 0, 5, 6)
   625  	foo(lhhh, 0, 4, 5, 6)
   626  
   627  	foo(lhlh, 0, 4, 1, 5)
   628  	foo(hlhl, 4, 0, 5, 1)
   629  	foo(lhhl, 0, 4, 5, 1)
   630  	foo(hllh, 4, 0, 1, 5)
   631  }
   632  
   633  //go:noinline
   634  func selectFromPairInt32x4(x archsimd.Int32x4, a, b, c, d uint8, y archsimd.Int32x4) archsimd.Int32x4 {
   635  	return x.SelectFromPair(a, b, c, d, y)
   636  }
   637  
   638  func TestSelect4FromPairVar(t *testing.T) {
   639  	x := archsimd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
   640  	y := archsimd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
   641  
   642  	llll := selectFromPairInt32x4(x, 0, 1, 2, 3, y)
   643  	hhhh := selectFromPairInt32x4(x, 4, 5, 6, 7, y)
   644  	llhh := selectFromPairInt32x4(x, 0, 1, 6, 7, y)
   645  	hhll := selectFromPairInt32x4(x, 6, 7, 0, 1, y)
   646  
   647  	lllh := selectFromPairInt32x4(x, 0, 1, 2, 7, y)
   648  	llhl := selectFromPairInt32x4(x, 0, 1, 7, 2, y)
   649  	lhll := selectFromPairInt32x4(x, 0, 7, 1, 2, y)
   650  	hlll := selectFromPairInt32x4(x, 7, 0, 1, 2, y)
   651  
   652  	hhhl := selectFromPairInt32x4(x, 4, 5, 6, 0, y)
   653  	hhlh := selectFromPairInt32x4(x, 4, 5, 0, 6, y)
   654  	hlhh := selectFromPairInt32x4(x, 4, 0, 5, 6, y)
   655  	lhhh := selectFromPairInt32x4(x, 0, 4, 5, 6, y)
   656  
   657  	lhlh := selectFromPairInt32x4(x, 0, 4, 1, 5, y)
   658  	hlhl := selectFromPairInt32x4(x, 4, 0, 5, 1, y)
   659  	lhhl := selectFromPairInt32x4(x, 0, 4, 5, 1, y)
   660  	hllh := selectFromPairInt32x4(x, 4, 0, 1, 5, y)
   661  
   662  	r := make([]int32, 4, 4)
   663  
   664  	foo := func(v archsimd.Int32x4, a, b, c, d int32) {
   665  		v.StoreSlice(r)
   666  		checkSlices[int32](t, r, []int32{a, b, c, d})
   667  	}
   668  
   669  	foo(llll, 0, 1, 2, 3)
   670  	foo(hhhh, 4, 5, 6, 7)
   671  	foo(llhh, 0, 1, 6, 7)
   672  	foo(hhll, 6, 7, 0, 1)
   673  
   674  	foo(lllh, 0, 1, 2, 7)
   675  	foo(llhl, 0, 1, 7, 2)
   676  	foo(lhll, 0, 7, 1, 2)
   677  	foo(hlll, 7, 0, 1, 2)
   678  
   679  	foo(hhhl, 4, 5, 6, 0)
   680  	foo(hhlh, 4, 5, 0, 6)
   681  	foo(hlhh, 4, 0, 5, 6)
   682  	foo(lhhh, 0, 4, 5, 6)
   683  
   684  	foo(lhlh, 0, 4, 1, 5)
   685  	foo(hlhl, 4, 0, 5, 1)
   686  	foo(lhhl, 0, 4, 5, 1)
   687  	foo(hllh, 4, 0, 1, 5)
   688  }
   689  
   690  func TestSelect4FromPairConstGrouped(t *testing.T) {
   691  	x := archsimd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13})
   692  	y := archsimd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17})
   693  
   694  	llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
   695  	hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
   696  	llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
   697  	hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
   698  
   699  	lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
   700  	llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
   701  	lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
   702  	hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
   703  
   704  	hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
   705  	hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
   706  	hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
   707  	lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
   708  
   709  	lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
   710  	hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
   711  	lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
   712  	hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
   713  
   714  	r := make([]float32, 8, 8)
   715  
   716  	foo := func(v archsimd.Float32x8, a, b, c, d float32) {
   717  		v.StoreSlice(r)
   718  		checkSlices[float32](t, r, []float32{a, b, c, d, 10 + a, 10 + b, 10 + c, 10 + d})
   719  	}
   720  
   721  	foo(llll, 0, 1, 2, 3)
   722  	foo(hhhh, 4, 5, 6, 7)
   723  	foo(llhh, 0, 1, 6, 7)
   724  	foo(hhll, 6, 7, 0, 1)
   725  
   726  	foo(lllh, 0, 1, 2, 7)
   727  	foo(llhl, 0, 1, 7, 2)
   728  	foo(lhll, 0, 7, 1, 2)
   729  	foo(hlll, 7, 0, 1, 2)
   730  
   731  	foo(hhhl, 4, 5, 6, 0)
   732  	foo(hhlh, 4, 5, 0, 6)
   733  	foo(hlhh, 4, 0, 5, 6)
   734  	foo(lhhh, 0, 4, 5, 6)
   735  
   736  	foo(lhlh, 0, 4, 1, 5)
   737  	foo(hlhl, 4, 0, 5, 1)
   738  	foo(lhhl, 0, 4, 5, 1)
   739  	foo(hllh, 4, 0, 1, 5)
   740  }
   741  
   742  func TestSelectFromPairConstGroupedUint32x16(t *testing.T) {
   743  	if !archsimd.X86.AVX512() {
   744  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   745  		return
   746  	}
   747  	x := archsimd.LoadUint32x16Slice([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33})
   748  	y := archsimd.LoadUint32x16Slice([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37})
   749  
   750  	llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
   751  	hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
   752  	llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
   753  	hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
   754  
   755  	lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
   756  	llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
   757  	lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
   758  	hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
   759  
   760  	hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
   761  	hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
   762  	hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
   763  	lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
   764  
   765  	lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
   766  	hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
   767  	lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
   768  	hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
   769  
   770  	r := make([]uint32, 16, 16)
   771  
   772  	foo := func(v archsimd.Uint32x16, a, b, c, d uint32) {
   773  		v.StoreSlice(r)
   774  		checkSlices[uint32](t, r, []uint32{a, b, c, d,
   775  			10 + a, 10 + b, 10 + c, 10 + d,
   776  			20 + a, 20 + b, 20 + c, 20 + d,
   777  			30 + a, 30 + b, 30 + c, 30 + d,
   778  		})
   779  	}
   780  
   781  	foo(llll, 0, 1, 2, 3)
   782  	foo(hhhh, 4, 5, 6, 7)
   783  	foo(llhh, 0, 1, 6, 7)
   784  	foo(hhll, 6, 7, 0, 1)
   785  
   786  	foo(lllh, 0, 1, 2, 7)
   787  	foo(llhl, 0, 1, 7, 2)
   788  	foo(lhll, 0, 7, 1, 2)
   789  	foo(hlll, 7, 0, 1, 2)
   790  
   791  	foo(hhhl, 4, 5, 6, 0)
   792  	foo(hhlh, 4, 5, 0, 6)
   793  	foo(hlhh, 4, 0, 5, 6)
   794  	foo(lhhh, 0, 4, 5, 6)
   795  
   796  	foo(lhlh, 0, 4, 1, 5)
   797  	foo(hlhl, 4, 0, 5, 1)
   798  	foo(lhhl, 0, 4, 5, 1)
   799  	foo(hllh, 4, 0, 1, 5)
   800  }
   801  
   802  func TestSelect128FromPair(t *testing.T) {
   803  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
   804  	y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
   805  
   806  	aa := x.Select128FromPair(0, 0, y)
   807  	ab := x.Select128FromPair(0, 1, y)
   808  	bc := x.Select128FromPair(1, 2, y)
   809  	cd := x.Select128FromPair(2, 3, y)
   810  	da := x.Select128FromPair(3, 0, y)
   811  	dc := x.Select128FromPair(3, 2, y)
   812  
   813  	r := make([]uint64, 4, 4)
   814  
   815  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   816  		a, b = 2*a, 2*b
   817  		v.StoreSlice(r)
   818  		checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
   819  	}
   820  
   821  	foo(aa, 0, 0)
   822  	foo(ab, 0, 1)
   823  	foo(bc, 1, 2)
   824  	foo(cd, 2, 3)
   825  	foo(da, 3, 0)
   826  	foo(dc, 3, 2)
   827  }
   828  
   829  func TestSelect128FromPairError(t *testing.T) {
   830  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
   831  	y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
   832  
   833  	defer func() {
   834  		if r := recover(); r != nil {
   835  			t.Logf("Saw expected panic %v", r)
   836  		}
   837  	}()
   838  	_ = x.Select128FromPair(0, 4, y)
   839  
   840  	t.Errorf("Should have panicked")
   841  }
   842  
   843  //go:noinline
   844  func select128FromPair(x archsimd.Uint64x4, lo, hi uint8, y archsimd.Uint64x4) archsimd.Uint64x4 {
   845  	return x.Select128FromPair(lo, hi, y)
   846  }
   847  
   848  func TestSelect128FromPairVar(t *testing.T) {
   849  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
   850  	y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
   851  
   852  	aa := select128FromPair(x, 0, 0, y)
   853  	ab := select128FromPair(x, 0, 1, y)
   854  	bc := select128FromPair(x, 1, 2, y)
   855  	cd := select128FromPair(x, 2, 3, y)
   856  	da := select128FromPair(x, 3, 0, y)
   857  	dc := select128FromPair(x, 3, 2, y)
   858  
   859  	r := make([]uint64, 4, 4)
   860  
   861  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   862  		a, b = 2*a, 2*b
   863  		v.StoreSlice(r)
   864  		checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
   865  	}
   866  
   867  	foo(aa, 0, 0)
   868  	foo(ab, 0, 1)
   869  	foo(bc, 1, 2)
   870  	foo(cd, 2, 3)
   871  	foo(da, 3, 0)
   872  	foo(dc, 3, 2)
   873  }
   874  
   875  func TestSelect2FromPairConst(t *testing.T) {
   876  	x := archsimd.LoadUint64x2Slice([]uint64{0, 1})
   877  	y := archsimd.LoadUint64x2Slice([]uint64{2, 3})
   878  
   879  	ll := x.SelectFromPair(0, 1, y)
   880  	hh := x.SelectFromPair(3, 2, y)
   881  	lh := x.SelectFromPair(0, 3, y)
   882  	hl := x.SelectFromPair(2, 1, y)
   883  
   884  	r := make([]uint64, 2, 2)
   885  
   886  	foo := func(v archsimd.Uint64x2, a, b uint64) {
   887  		v.StoreSlice(r)
   888  		checkSlices[uint64](t, r, []uint64{a, b})
   889  	}
   890  
   891  	foo(ll, 0, 1)
   892  	foo(hh, 3, 2)
   893  	foo(lh, 0, 3)
   894  	foo(hl, 2, 1)
   895  }
   896  
   897  func TestSelect2FromPairConstGroupedUint(t *testing.T) {
   898  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 10, 11})
   899  	y := archsimd.LoadUint64x4Slice([]uint64{2, 3, 12, 13})
   900  
   901  	ll := x.SelectFromPairGrouped(0, 1, y)
   902  	hh := x.SelectFromPairGrouped(3, 2, y)
   903  	lh := x.SelectFromPairGrouped(0, 3, y)
   904  	hl := x.SelectFromPairGrouped(2, 1, y)
   905  
   906  	r := make([]uint64, 4, 4)
   907  
   908  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   909  		v.StoreSlice(r)
   910  		checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10})
   911  	}
   912  
   913  	foo(ll, 0, 1)
   914  	foo(hh, 3, 2)
   915  	foo(lh, 0, 3)
   916  	foo(hl, 2, 1)
   917  }
   918  
   919  func TestSelect2FromPairConstGroupedFloat(t *testing.T) {
   920  	x := archsimd.LoadFloat64x4Slice([]float64{0, 1, 10, 11})
   921  	y := archsimd.LoadFloat64x4Slice([]float64{2, 3, 12, 13})
   922  
   923  	ll := x.SelectFromPairGrouped(0, 1, y)
   924  	hh := x.SelectFromPairGrouped(3, 2, y)
   925  	lh := x.SelectFromPairGrouped(0, 3, y)
   926  	hl := x.SelectFromPairGrouped(2, 1, y)
   927  
   928  	r := make([]float64, 4, 4)
   929  
   930  	foo := func(v archsimd.Float64x4, a, b float64) {
   931  		v.StoreSlice(r)
   932  		checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10})
   933  	}
   934  
   935  	foo(ll, 0, 1)
   936  	foo(hh, 3, 2)
   937  	foo(lh, 0, 3)
   938  	foo(hl, 2, 1)
   939  }
   940  
   941  func TestSelect2FromPairConstGroupedInt(t *testing.T) {
   942  	x := archsimd.LoadInt64x4Slice([]int64{0, 1, 10, 11})
   943  	y := archsimd.LoadInt64x4Slice([]int64{2, 3, 12, 13})
   944  
   945  	ll := x.SelectFromPairGrouped(0, 1, y)
   946  	hh := x.SelectFromPairGrouped(3, 2, y)
   947  	lh := x.SelectFromPairGrouped(0, 3, y)
   948  	hl := x.SelectFromPairGrouped(2, 1, y)
   949  
   950  	r := make([]int64, 4, 4)
   951  
   952  	foo := func(v archsimd.Int64x4, a, b int64) {
   953  		v.StoreSlice(r)
   954  		checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10})
   955  	}
   956  
   957  	foo(ll, 0, 1)
   958  	foo(hh, 3, 2)
   959  	foo(lh, 0, 3)
   960  	foo(hl, 2, 1)
   961  }
   962  
   963  func TestSelect2FromPairConstGroupedInt512(t *testing.T) {
   964  	if !archsimd.X86.AVX512() {
   965  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   966  		return
   967  	}
   968  
   969  	x := archsimd.LoadInt64x8Slice([]int64{0, 1, 10, 11, 20, 21, 30, 31})
   970  	y := archsimd.LoadInt64x8Slice([]int64{2, 3, 12, 13, 22, 23, 32, 33})
   971  
   972  	ll := x.SelectFromPairGrouped(0, 1, y)
   973  	hh := x.SelectFromPairGrouped(3, 2, y)
   974  	lh := x.SelectFromPairGrouped(0, 3, y)
   975  	hl := x.SelectFromPairGrouped(2, 1, y)
   976  
   977  	r := make([]int64, 8, 8)
   978  
   979  	foo := func(v archsimd.Int64x8, a, b int64) {
   980  		v.StoreSlice(r)
   981  		checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30})
   982  	}
   983  
   984  	foo(ll, 0, 1)
   985  	foo(hh, 3, 2)
   986  	foo(lh, 0, 3)
   987  	foo(hl, 2, 1)
   988  }
   989  
   990  func TestString(t *testing.T) {
   991  	x := archsimd.LoadUint32x4Slice([]uint32{0, 1, 2, 3})
   992  	y := archsimd.LoadInt64x4Slice([]int64{-4, -5, -6, -7})
   993  	z := archsimd.LoadFloat32x4Slice([]float32{0.5, 1.5, -2.5, 3.5e9})
   994  	w := archsimd.LoadFloat64x4Slice([]float64{0.5, 1.5, -2.5, 3.5e9})
   995  
   996  	sx := "{0,1,2,3}"
   997  	sy := "{-4,-5,-6,-7}"
   998  	sz := "{0.5,1.5,-2.5,3.5e+09}"
   999  	sw := sz
  1000  
  1001  	if x.String() != sx {
  1002  		t.Errorf("x=%s wanted %s", x, sx)
  1003  	}
  1004  	if y.String() != sy {
  1005  		t.Errorf("y=%s wanted %s", y, sy)
  1006  	}
  1007  	if z.String() != sz {
  1008  		t.Errorf("z=%s wanted %s", z, sz)
  1009  	}
  1010  	if w.String() != sw {
  1011  		t.Errorf("w=%s wanted %s", w, sw)
  1012  	}
  1013  	t.Logf("w=%s", w)
  1014  	t.Logf("x=%s", x)
  1015  	t.Logf("y=%s", y)
  1016  	t.Logf("z=%s", z)
  1017  }
  1018  
  1019  // a returns an slice of 16 int32
  1020  func a() []int32 {
  1021  	return make([]int32, 16, 16)
  1022  }
  1023  
  1024  // applyTo3 returns a 16-element slice of the results of
  1025  // applying f to the respective elements of vectors x, y, and z.
  1026  func applyTo3(x, y, z archsimd.Int32x16, f func(x, y, z int32) int32) []int32 {
  1027  	ax, ay, az := a(), a(), a()
  1028  	x.StoreSlice(ax)
  1029  	y.StoreSlice(ay)
  1030  	z.StoreSlice(az)
  1031  
  1032  	r := a()
  1033  	for i := range r {
  1034  		r[i] = f(ax[i], ay[i], az[i])
  1035  	}
  1036  	return r
  1037  }
  1038  
  1039  // applyTo3 returns a 16-element slice of the results of
  1040  // applying f to the respective elements of vectors x, y, z, and w.
  1041  func applyTo4(x, y, z, w archsimd.Int32x16, f func(x, y, z, w int32) int32) []int32 {
  1042  	ax, ay, az, aw := a(), a(), a(), a()
  1043  	x.StoreSlice(ax)
  1044  	y.StoreSlice(ay)
  1045  	z.StoreSlice(az)
  1046  	w.StoreSlice(aw)
  1047  
  1048  	r := make([]int32, len(ax), len(ax))
  1049  	for i := range r {
  1050  		r[i] = f(ax[i], ay[i], az[i], aw[i])
  1051  	}
  1052  	return r
  1053  }
  1054  
  1055  func TestSelectTernOptInt32x16(t *testing.T) {
  1056  	if !archsimd.X86.AVX512() {
  1057  		t.Skip("Test requires X86.AVX512, not available on this hardware")
  1058  		return
  1059  	}
  1060  	ax := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
  1061  	ay := []int32{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}
  1062  	az := []int32{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}
  1063  	aw := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
  1064  	am := []int32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
  1065  
  1066  	x := archsimd.LoadInt32x16Slice(ax)
  1067  	y := archsimd.LoadInt32x16Slice(ay)
  1068  	z := archsimd.LoadInt32x16Slice(az)
  1069  	w := archsimd.LoadInt32x16Slice(aw)
  1070  	m := archsimd.LoadInt32x16Slice(am)
  1071  
  1072  	foo := func(v archsimd.Int32x16, s []int32) {
  1073  		r := make([]int32, 16, 16)
  1074  		v.StoreSlice(r)
  1075  		checkSlices[int32](t, r, s)
  1076  	}
  1077  
  1078  	t0 := w.Xor(y).Xor(z)
  1079  	ft0 := func(w, y, z int32) int32 {
  1080  		return w ^ y ^ z
  1081  	}
  1082  	foo(t0, applyTo3(w, y, z, ft0))
  1083  
  1084  	t1 := m.And(w.Xor(y).Xor(z.Not()))
  1085  	ft1 := func(m, w, y, z int32) int32 {
  1086  		return m & (w ^ y ^ ^z)
  1087  	}
  1088  	foo(t1, applyTo4(m, w, y, z, ft1))
  1089  
  1090  	t2 := x.Xor(y).Xor(z).And(x.Xor(y).Xor(z.Not()))
  1091  	ft2 := func(x, y, z int32) int32 {
  1092  		return (x ^ y ^ z) & (x ^ y ^ ^z)
  1093  	}
  1094  	foo(t2, applyTo3(x, y, z, ft2))
  1095  }
  1096  
  1097  func TestMaskedMerge(t *testing.T) {
  1098  	x := archsimd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
  1099  	y := archsimd.LoadInt64x4Slice([]int64{5, 6, 1, 1})
  1100  	z := archsimd.LoadInt64x4Slice([]int64{-1, -2, -3, -4})
  1101  	res := make([]int64, 4)
  1102  	expected := []int64{6, 8, -3, -4}
  1103  	mask := x.Less(y)
  1104  	if archsimd.X86.AVX512() {
  1105  		x.Add(y).Merge(z, mask).StoreSlice(res)
  1106  	} else {
  1107  		x.Add(y).Merge(z, mask).StoreSlice(res)
  1108  	}
  1109  	for i := range 4 {
  1110  		if res[i] != expected[i] {
  1111  			t.Errorf("got %d wanted %d", res[i], expected[i])
  1112  		}
  1113  	}
  1114  }
  1115  
  1116  func TestDotProductQuadruple(t *testing.T) {
  1117  	if !archsimd.X86.AVXVNNI() {
  1118  		t.Skip("Test requires X86.AVXVNNI, not available on this hardware")
  1119  		return
  1120  	}
  1121  	xd := make([]int8, 16)
  1122  	yd := make([]uint8, 16)
  1123  	zd := make([]int32, 4)
  1124  	wanted1 := make([]int32, 4)
  1125  	wanted2 := make([]int32, 4)
  1126  	res1 := make([]int32, 4)
  1127  	res2 := make([]int32, 4)
  1128  	for i := range 4 {
  1129  		xd[i] = 5
  1130  		yd[i] = 6
  1131  		zd[i] = 3
  1132  		wanted1[i] = 30
  1133  		wanted2[i] = 30
  1134  	}
  1135  	x := archsimd.LoadInt8x16Slice(xd)
  1136  	y := archsimd.LoadUint8x16Slice(yd)
  1137  	z := archsimd.LoadInt32x4Slice(zd)
  1138  	x.DotProductQuadruple(y).StoreSlice(res1)
  1139  	x.DotProductQuadruple(y).Add(z).StoreSlice(res1)
  1140  	for i := range 4 {
  1141  		if res1[i] != wanted1[i] {
  1142  			t.Errorf("got %d wanted %d", res1[i], wanted1[i])
  1143  		}
  1144  		if res2[i] != wanted2[i] {
  1145  			t.Errorf("got %d wanted %d", res2[i], wanted2[i])
  1146  		}
  1147  	}
  1148  }
  1149  
  1150  func TestPermuteScalars(t *testing.T) {
  1151  	x := []int32{11, 12, 13, 14}
  1152  	want := []int32{12, 13, 14, 11}
  1153  	got := make([]int32, 4)
  1154  	archsimd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got)
  1155  	checkSlices(t, got, want)
  1156  }
  1157  
  1158  func TestPermuteScalarsGrouped(t *testing.T) {
  1159  	x := []int32{11, 12, 13, 14, 21, 22, 23, 24}
  1160  	want := []int32{12, 13, 14, 11, 22, 23, 24, 21}
  1161  	got := make([]int32, 8)
  1162  	archsimd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got)
  1163  	checkSlices(t, got, want)
  1164  }
  1165  
  1166  func TestPermuteScalarsHi(t *testing.T) {
  1167  	x := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
  1168  	want := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
  1169  	got := make([]int16, len(x))
  1170  	archsimd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got)
  1171  	checkSlices(t, got, want)
  1172  }
  1173  
  1174  func TestPermuteScalarsLo(t *testing.T) {
  1175  	x := []int16{11, 12, 13, 14, 4, 5, 6, 7}
  1176  	want := []int16{12, 13, 14, 11, 4, 5, 6, 7}
  1177  	got := make([]int16, len(x))
  1178  	archsimd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got)
  1179  	checkSlices(t, got, want)
  1180  }
  1181  
  1182  func TestPermuteScalarsHiGrouped(t *testing.T) {
  1183  	x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
  1184  	want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
  1185  	got := make([]int16, len(x))
  1186  	archsimd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got)
  1187  	checkSlices(t, got, want)
  1188  }
  1189  
  1190  func TestPermuteScalarsLoGrouped(t *testing.T) {
  1191  	x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
  1192  	want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
  1193  	got := make([]int16, len(x))
  1194  	archsimd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
  1195  	checkSlices(t, got, want)
  1196  }
  1197  
  1198  func TestClMul(t *testing.T) {
  1199  	var x = archsimd.LoadUint64x2Slice([]uint64{1, 5})
  1200  	var y = archsimd.LoadUint64x2Slice([]uint64{3, 9})
  1201  
  1202  	foo := func(v archsimd.Uint64x2, s []uint64) {
  1203  		r := make([]uint64, 2, 2)
  1204  		v.StoreSlice(r)
  1205  		checkSlices[uint64](t, r, s)
  1206  	}
  1207  
  1208  	foo(x.CarrylessMultiply(0, 0, y), []uint64{3, 0})
  1209  	foo(x.CarrylessMultiply(0, 1, y), []uint64{9, 0})
  1210  	foo(x.CarrylessMultiply(1, 0, y), []uint64{15, 0})
  1211  	foo(x.CarrylessMultiply(1, 1, y), []uint64{45, 0})
  1212  	foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
  1213  
  1214  }
  1215  

View as plain text