    return result
```
#### LUT1 zero

Gets elements from `a` located at the indices specified by `idx`.
Elements whose index is out of bounds are set to `0`.

- `vec.v8.lut1_z(idx: vec.v8, a: vec.v8) -> vec.v8`
- `vec.v16.lut1_z(idx: vec.v16, a: vec.v16) -> vec.v16`
- `vec.v32.lut1_z(idx: vec.v32, a: vec.v32) -> vec.v32`
- `vec.v64.lut1_z(idx: vec.v64, a: vec.v64) -> vec.v64`
- `vec.v128.lut1_z(idx: vec.v128, a: vec.v128) -> vec.v128`

```python
def vec.S.lut1_z(idx, a):
    result = vec.S.New()
    for i in range(vec.S.length):
        if idx[i] < vec.S.length:
            result[i] = a[idx[i]]
        else:
            result[i] = 0
    return result
```
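The following worked example is illustrative only (not part of the specification): it models flexible vectors as plain Python lists to show how out-of-bounds indices produce zero.

```python
# Illustrative only: flexible vectors modelled as plain Python lists.
def lut1_z(idx, a):
    return [a[j] if j < len(a) else 0 for j in idx]

a   = [10, 11, 12, 13, 14, 15, 16, 17]
idx = [ 7,  0,  3,  8, 255, 2,  1,  6]      # 8 and 255 are out of bounds
assert lut1_z(idx, a) == [17, 10, 13, 0, 0, 12, 11, 16]
```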
#### LUT1 merge

Gets elements from `a` located at the indices specified by `idx`.
Elements whose index is out of bounds are taken from `fallback`.

- `vec.v8.lut1_m(idx: vec.v8, a: vec.v8, fallback: vec.v8) -> vec.v8`
- `vec.v16.lut1_m(idx: vec.v16, a: vec.v16, fallback: vec.v16) -> vec.v16`
- `vec.v32.lut1_m(idx: vec.v32, a: vec.v32, fallback: vec.v32) -> vec.v32`
- `vec.v64.lut1_m(idx: vec.v64, a: vec.v64, fallback: vec.v64) -> vec.v64`

```python
def vec.S.lut1_m(idx, a, fallback):
    result = vec.S.New()
    for i in range(vec.S.length):
        if idx[i] < vec.S.length:
            result[i] = a[idx[i]]
        else:
            result[i] = fallback[i]
    return result
```
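An analogous illustrative sketch, again with lists standing in for vectors, showing the element-wise fallback:

```python
# Illustrative only: out-of-bounds indices select the corresponding fallback lane.
def lut1_m(idx, a, fallback):
    return [a[j] if j < len(a) else fallback[i] for i, j in enumerate(idx)]

a        = [10, 11, 12, 13, 14, 15, 16, 17]
fallback = [90, 91, 92, 93, 94, 95, 96, 97]
idx      = [ 7,  0,  3,  8, 255, 2,  1,  6]
assert lut1_m(idx, a, fallback) == [17, 10, 13, 93, 94, 12, 11, 16]
```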
#### LUT2 zero

Gets elements from `a` and `b` located at the indices specified by `idx`.
If the index is lower than the length, the element is taken from `a`; if the index is between the length and 2 * length, the element is taken from `b`.
Elements whose index is out of bounds are set to `0`.

- `vec.v8.lut2_z(idx: vec.v8, a: vec.v8, b: vec.v8) -> vec.v8`
- `vec.v16.lut2_z(idx: vec.v16, a: vec.v16, b: vec.v16) -> vec.v16`
- `vec.v32.lut2_z(idx: vec.v32, a: vec.v32, b: vec.v32) -> vec.v32`
- `vec.v64.lut2_z(idx: vec.v64, a: vec.v64, b: vec.v64) -> vec.v64`
- `vec.v128.lut2_z(idx: vec.v128, a: vec.v128, b: vec.v128) -> vec.v128`

```python
def vec.S.lut2_z(idx, a, b):
    result = vec.S.New()
    for i in range(vec.S.length):
        if idx[i] < vec.S.length:
            result[i] = a[idx[i]]
        elif idx[i] < 2 * vec.S.length:
            result[i] = b[idx[i] - vec.S.length]
        else:
            result[i] = 0
    return result
```
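Illustrative sketch of the two-table lookup on hypothetical 4-lane vectors (lists stand in for vectors):

```python
# Illustrative only: idx < len selects from a, len <= idx < 2*len selects from b,
# anything else produces 0.
def lut2_z(idx, a, b):
    table = a + b                     # conceptual double-width table
    return [table[j] if j < len(table) else 0 for j in idx]

a, b = [10, 11, 12, 13], [20, 21, 22, 23]
assert lut2_z([0, 5, 3, 9], a, b) == [10, 21, 13, 0]
```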
#### LUT2 merge

Gets elements from `a` and `b` located at the indices specified by `idx`.
If the index is lower than the length, the element is taken from `a`; if the index is between the length and 2 * length, the element is taken from `b`.
Elements whose index is out of bounds are taken from `fallback`.

- `vec.v8.lut2_m(idx: vec.v8, a: vec.v8, b: vec.v8, fallback: vec.v8) -> vec.v8`
- `vec.v16.lut2_m(idx: vec.v16, a: vec.v16, b: vec.v16, fallback: vec.v16) -> vec.v16`
- `vec.v32.lut2_m(idx: vec.v32, a: vec.v32, b: vec.v32, fallback: vec.v32) -> vec.v32`
- `vec.v64.lut2_m(idx: vec.v64, a: vec.v64, b: vec.v64, fallback: vec.v64) -> vec.v64`
- `vec.v128.lut2_m(idx: vec.v128, a: vec.v128, b: vec.v128, fallback: vec.v128) -> vec.v128`

```python
def vec.S.lut2_m(idx, a, b, fallback):
    result = vec.S.New()
    for i in range(vec.S.length):
        if idx[i] < vec.S.length:
            result[i] = a[idx[i]]
        elif idx[i] < 2 * vec.S.length:
            result[i] = b[idx[i] - vec.S.length]
        else:
            result[i] = fallback[i]
    return result
```
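And the merging variant, illustratively:

```python
# Illustrative only: out-of-bounds indices take the lane from fallback instead of 0.
def lut2_m(idx, a, b, fallback):
    table = a + b
    return [table[j] if j < len(table) else fallback[i] for i, j in enumerate(idx)]

a, b, fb = [10, 11, 12, 13], [20, 21, 22, 23], [90, 91, 92, 93]
assert lut2_m([0, 5, 3, 9], a, b, fb) == [10, 21, 13, 93]
```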
#### V128 shuffle

Applies shuffle to each v128 of the vector.

- `vec.i8x16.shuffle(a: vec.v128, b: vec.v128, imm: ImmLaneIdx32[16]) -> vec.v128`

```python
def vec.i8x16.shuffle(a, b, imm):
    result = vec.v128.New()
    for i in range(vec.v128.length):
        result[i] = i8x16.shuffle(a[i], b[i], imm)
    return result
```
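For intuition only: each element of the flexible vector behaves like a 128-bit SIMD `i8x16.shuffle`. The sketch below models every v128 as a list of 16 byte values; the helper names are illustrative, not part of the proposal.

```python
# Illustrative only: each v128 is modelled as a list of 16 byte values.
def i8x16_shuffle(a, b, imm):
    lanes = a + b                       # imm indices 0..15 pick from a, 16..31 from b
    return [lanes[j] for j in imm]

def vec_i8x16_shuffle(a_vec, b_vec, imm):
    return [i8x16_shuffle(a, b, imm) for a, b in zip(a_vec, b_vec)]

a = [list(range(100, 116))]             # a flexible vector holding one v128
b = [list(range(200, 216))]
imm = [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]
assert vec_i8x16_shuffle(a, b, imm)[0] == [
    100, 200, 101, 201, 102, 202, 103, 203,
    104, 204, 105, 205, 106, 206, 107, 207]
```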
#### V128 swizzle

Applies swizzle to each v128 of the vector.

- `vec.i8x16.swizzle(a: vec.v128, s: vec.v128) -> vec.v128`

```python
def vec.i8x16.swizzle(a, s):
    result = vec.v128.New()
    for i in range(vec.v128.length):
        result[i] = i8x16.swizzle(a[i], s[i])
    return result
```
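Likewise for swizzle, where selector bytes outside 0..15 produce 0 in each 128-bit lane (illustrative sketch, v128 values modelled as 16-byte lists):

```python
# Illustrative only: per-128-bit-lane swizzle; selector bytes >= 16 yield 0.
def i8x16_swizzle(a, s):
    return [a[j] if j < 16 else 0 for j in s]

def vec_i8x16_swizzle(a_vec, s_vec):
    return [i8x16_swizzle(a, s) for a, s in zip(a_vec, s_vec)]

a = [[10 + i for i in range(16)]]       # flexible vector holding one v128
s = [[15, 0, 1, 255] + [2] * 12]
assert vec_i8x16_swizzle(a, s)[0][:4] == [25, 10, 11, 0]
```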
#### Splat lane

Gets a single lane from the vector and broadcasts it to the entire vector.
`idx` is interpreted modulo the length of the vector.

- `vec.v8.splat_lane(v: vec.v8, idx: i32) -> vec.v8`
- `vec.v16.splat_lane(v: vec.v16, idx: i32) -> vec.v16`
- `vec.v32.splat_lane(v: vec.v32, idx: i32) -> vec.v32`
- `vec.v64.splat_lane(v: vec.v64, idx: i32) -> vec.v64`
- `vec.v128.splat_lane(v: vec.v128, idx: i32) -> vec.v128`

```python
def vec.S.splat_lane(v, idx):
    idx = idx % vec.S.length
    result = vec.S.New()
    for i in range(vec.S.length):
        result[i] = v[idx]
    return result
```
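A small illustrative example with lists:

```python
# Illustrative only: broadcast lane idx (mod length) across all lanes.
def splat_lane(v, idx):
    return [v[idx % len(v)]] * len(v)

assert splat_lane([5, 6, 7, 8], 2)  == [7, 7, 7, 7]
assert splat_lane([5, 6, 7, 8], 10) == [7, 7, 7, 7]   # 10 % 4 == 2
```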
#### Concat

Copies elements from vector `a` from the first active element to the last active element.
Inner inactive elements are also copied.
The remaining elements are set from the first elements of `b`.

- `vec.v8.concat(m: vec.m8, a: vec.v8, b: vec.v8) -> vec.v8`
- `vec.v16.concat(m: vec.m16, a: vec.v16, b: vec.v16) -> vec.v16`
- `vec.v32.concat(m: vec.m32, a: vec.v32, b: vec.v32) -> vec.v32`
- `vec.v64.concat(m: vec.m64, a: vec.v64, b: vec.v64) -> vec.v64`
- `vec.v128.concat(m: vec.m128, a: vec.v128, b: vec.v128) -> vec.v128`

```python
def vec.S.concat(m, a, b):
    begin = -1
    end = -1
    for i in range(vec.S.length):
        if m[i]:
            end = i + 1
            if begin < 0:
                begin = i

    result = vec.S.New()
    i = 0
    for j in range(begin, end):
        result[i] = a[j]
        i += 1
    for j in range(0, vec.S.length - i):
        result[i] = b[j]
        i += 1
    return result
```
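A worked example (illustrative only, masks as 0/1 values and vectors as lists): with lanes 2..5 of `a` active, those four elements — including the inactive lane 3 — are packed to the front and the tail is filled from `b`:

```python
# Illustrative only: copy a[first_active .. last_active], then fill from b.
def concat(m, a, b):
    active = [i for i, bit in enumerate(m) if bit]
    head = a[active[0]:active[-1] + 1] if active else []
    return head + b[:len(a) - len(head)]

m = [0, 0, 1, 0, 1, 1, 0, 0]
a = [10, 11, 12, 13, 14, 15, 16, 17]
b = [20, 21, 22, 23, 24, 25, 26, 27]
assert concat(m, a, b) == [12, 13, 14, 15, 20, 21, 22, 23]
```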
#### Lane shift

Concatenates the two input vectors to form a single double-width vector, with `a` in the lower half and `b` in the upper half.
Shifts this double-width vector by `n` lanes to the left (towards the LSB).
Extracts the lower half of the shifted vector.
`n` is interpreted modulo the length of the vector.

- `vec.v8.lane_shift(a: vec.v8, b: vec.v8, n: i32) -> vec.v8`
- `vec.v16.lane_shift(a: vec.v16, b: vec.v16, n: i32) -> vec.v16`
- `vec.v32.lane_shift(a: vec.v32, b: vec.v32, n: i32) -> vec.v32`
- `vec.v64.lane_shift(a: vec.v64, b: vec.v64, n: i32) -> vec.v64`
- `vec.v128.lane_shift(a: vec.v128, b: vec.v128, n: i32) -> vec.v128`

```python
def vec.S.lane_shift(a, b, n):
    result = vec.S.New()
    n = n % vec.S.length
    for i in range(0, vec.S.length - n):
        result[i] = a[i + n]
    for i in range(vec.S.length - n, vec.S.length):
        result[i] = b[i - (vec.S.length - n)]
    return result
```
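A worked example under the ordering assumed above (`a` in the lower half), illustrative only: with 8 lanes and `n = 3`, the result is the last five lanes of `a` followed by the first three lanes of `b`, i.e. a funnel-shift/`EXT`-style extraction.

```python
# Illustrative only: extract length lanes starting at offset n from [a ++ b].
def lane_shift(a, b, n):
    n %= len(a)
    return (a + b)[n:n + len(a)]

a = [0, 1, 2, 3, 4, 5, 6, 7]
b = [10, 11, 12, 13, 14, 15, 16, 17]
assert lane_shift(a, b, 3) == [3, 4, 5, 6, 7, 10, 11, 12]
assert lane_shift(a, b, 0) == a
```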
#### Interleave even

Extracts even elements from both inputs and interleaves them.

- `vec.v8.interleave_even(a: vec.v8, b: vec.v8) -> vec.v8`
- `vec.v16.interleave_even(a: vec.v16, b: vec.v16) -> vec.v16`
- `vec.v32.interleave_even(a: vec.v32, b: vec.v32) -> vec.v32`
- `vec.v64.interleave_even(a: vec.v64, b: vec.v64) -> vec.v64`
- `vec.v128.interleave_even(a: vec.v128, b: vec.v128) -> vec.v128`
- `vec.m8.interleave_even(a: vec.m8, b: vec.m8) -> vec.m8`
- `vec.m16.interleave_even(a: vec.m16, b: vec.m16) -> vec.m16`
- `vec.m32.interleave_even(a: vec.m32, b: vec.m32) -> vec.m32`
- `vec.m64.interleave_even(a: vec.m64, b: vec.m64) -> vec.m64`
- `vec.m128.interleave_even(a: vec.m128, b: vec.m128) -> vec.m128`

```python
def vec.S.interleave_even(a, b):
    result = vec.S.New()
    for i in range(vec.S.length // 2):
        result[2 * i] = a[2 * i]
        result[2 * i + 1] = b[2 * i]
    return result
```
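For example (illustrative, lists in place of vectors):

```python
# Illustrative only: lanes 0, 2, 4, ... of a and b, interleaved (TRN1-like).
def interleave_even(a, b):
    out = []
    for i in range(0, len(a), 2):
        out += [a[i], b[i]]
    return out

a = [0, 1, 2, 3, 4, 5, 6, 7]
b = [10, 11, 12, 13, 14, 15, 16, 17]
assert interleave_even(a, b) == [0, 10, 2, 12, 4, 14, 6, 16]
```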
Note:

> - can be implemented with `TRN1` on Neon/SVE
#### Interleave odd

Extracts odd elements from both inputs and interleaves them.

- `vec.v8.interleave_odd(a: vec.v8, b: vec.v8) -> vec.v8`
- `vec.v16.interleave_odd(a: vec.v16, b: vec.v16) -> vec.v16`
- `vec.v32.interleave_odd(a: vec.v32, b: vec.v32) -> vec.v32`
- `vec.v64.interleave_odd(a: vec.v64, b: vec.v64) -> vec.v64`
- `vec.v128.interleave_odd(a: vec.v128, b: vec.v128) -> vec.v128`
- `vec.m8.interleave_odd(a: vec.m8, b: vec.m8) -> vec.m8`
- `vec.m16.interleave_odd(a: vec.m16, b: vec.m16) -> vec.m16`
- `vec.m32.interleave_odd(a: vec.m32, b: vec.m32) -> vec.m32`
- `vec.m64.interleave_odd(a: vec.m64, b: vec.m64) -> vec.m64`
- `vec.m128.interleave_odd(a: vec.m128, b: vec.m128) -> vec.m128`

```python
def vec.S.interleave_odd(a, b):
    result = vec.S.New()
    for i in range(vec.S.length // 2):
        result[2 * i] = a[2 * i + 1]
        result[2 * i + 1] = b[2 * i + 1]
    return result
```
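Correspondingly for the odd lanes (illustrative):

```python
# Illustrative only: lanes 1, 3, 5, ... of a and b, interleaved (TRN2-like).
def interleave_odd(a, b):
    out = []
    for i in range(1, len(a), 2):
        out += [a[i], b[i]]
    return out

assert interleave_odd([0, 1, 2, 3], [10, 11, 12, 13]) == [1, 11, 3, 13]
```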
Note:

> - can be implemented with `TRN2` on Neon/SVE
#### Concat even

Extracts even elements from both inputs and concatenates them.

- `vec.v8.concat_even(a: vec.v8, b: vec.v8) -> vec.v8`
- `vec.v16.concat_even(a: vec.v16, b: vec.v16) -> vec.v16`
- `vec.v32.concat_even(a: vec.v32, b: vec.v32) -> vec.v32`
- `vec.v64.concat_even(a: vec.v64, b: vec.v64) -> vec.v64`
- `vec.v128.concat_even(a: vec.v128, b: vec.v128) -> vec.v128`
- `vec.m8.concat_even(a: vec.m8, b: vec.m8) -> vec.m8`
- `vec.m16.concat_even(a: vec.m16, b: vec.m16) -> vec.m16`
- `vec.m32.concat_even(a: vec.m32, b: vec.m32) -> vec.m32`
- `vec.m64.concat_even(a: vec.m64, b: vec.m64) -> vec.m64`
- `vec.m128.concat_even(a: vec.m128, b: vec.m128) -> vec.m128`

```python
def vec.S.concat_even(a, b):
    result = vec.S.New()

    for i in range(vec.S.length // 2):
        result[i] = a[2 * i]
    for i in range(vec.S.length // 2):
        result[i + vec.S.length // 2] = b[2 * i]
    return result
```
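A concrete example (illustrative):

```python
# Illustrative only: even lanes of a, then even lanes of b (UZP1-like).
def concat_even(a, b):
    return a[0::2] + b[0::2]

a = [0, 1, 2, 3, 4, 5, 6, 7]
b = [10, 11, 12, 13, 14, 15, 16, 17]
assert concat_even(a, b) == [0, 2, 4, 6, 10, 12, 14, 16]
```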
Note:

> - can be implemented with `UZP1` on Neon/SVE
> - Wrapping narrowing integer conversions could be implemented with this function
#### Concat odd

Extracts odd elements from both inputs and concatenates them.

- `vec.v8.concat_odd(a: vec.v8, b: vec.v8) -> vec.v8`
- `vec.v16.concat_odd(a: vec.v16, b: vec.v16) -> vec.v16`
- `vec.v32.concat_odd(a: vec.v32, b: vec.v32) -> vec.v32`
- `vec.v64.concat_odd(a: vec.v64, b: vec.v64) -> vec.v64`
- `vec.v128.concat_odd(a: vec.v128, b: vec.v128) -> vec.v128`
- `vec.m8.concat_odd(a: vec.m8, b: vec.m8) -> vec.m8`
- `vec.m16.concat_odd(a: vec.m16, b: vec.m16) -> vec.m16`
- `vec.m32.concat_odd(a: vec.m32, b: vec.m32) -> vec.m32`
- `vec.m64.concat_odd(a: vec.m64, b: vec.m64) -> vec.m64`
- `vec.m128.concat_odd(a: vec.m128, b: vec.m128) -> vec.m128`

```python
def vec.S.concat_odd(a, b):
    result = vec.S.New()

    for i in range(vec.S.length // 2):
        result[i] = a[2 * i + 1]
    for i in range(vec.S.length // 2):
        result[i + vec.S.length // 2] = b[2 * i + 1]
    return result
```
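And its odd counterpart (illustrative):

```python
# Illustrative only: odd lanes of a, then odd lanes of b (UZP2-like).
def concat_odd(a, b):
    return a[1::2] + b[1::2]

assert concat_odd([0, 1, 2, 3], [10, 11, 12, 13]) == [1, 3, 11, 13]
```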
Note:

> - can be implemented with `UZP2` on Neon/SVE
#### Interleave low

Extracts the lower half of both inputs and interleaves their elements.

- `vec.v8.interleave_low(a: vec.v8, b: vec.v8) -> vec.v8`
- `vec.v16.interleave_low(a: vec.v16, b: vec.v16) -> vec.v16`
- `vec.v32.interleave_low(a: vec.v32, b: vec.v32) -> vec.v32`
- `vec.v64.interleave_low(a: vec.v64, b: vec.v64) -> vec.v64`
- `vec.v128.interleave_low(a: vec.v128, b: vec.v128) -> vec.v128`
- `vec.m8.interleave_low(a: vec.m8, b: vec.m8) -> vec.m8`
- `vec.m16.interleave_low(a: vec.m16, b: vec.m16) -> vec.m16`
- `vec.m32.interleave_low(a: vec.m32, b: vec.m32) -> vec.m32`
- `vec.m64.interleave_low(a: vec.m64, b: vec.m64) -> vec.m64`
- `vec.m128.interleave_low(a: vec.m128, b: vec.m128) -> vec.m128`

```python
def vec.S.interleave_low(a, b):
    result = vec.S.New()
    for i in range(vec.S.length // 2):
        result[2 * i] = a[i]
        result[2 * i + 1] = b[i]
    return result
```
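For example (illustrative, with 8-lane lists):

```python
# Illustrative only: zip the lower halves of a and b (ZIP1-like).
def interleave_low(a, b):
    half = len(a) // 2
    out = []
    for x, y in zip(a[:half], b[:half]):
        out += [x, y]
    return out

a = [0, 1, 2, 3, 4, 5, 6, 7]
b = [10, 11, 12, 13, 14, 15, 16, 17]
assert interleave_low(a, b) == [0, 10, 1, 11, 2, 12, 3, 13]
```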
Note:

> - can be implemented with `ZIP1` on Neon/SVE
#### Interleave high

Extracts the higher half of both inputs and interleaves their elements.

- `vec.v8.interleave_high(a: vec.v8, b: vec.v8) -> vec.v8`
- `vec.v16.interleave_high(a: vec.v16, b: vec.v16) -> vec.v16`
- `vec.v32.interleave_high(a: vec.v32, b: vec.v32) -> vec.v32`
- `vec.v64.interleave_high(a: vec.v64, b: vec.v64) -> vec.v64`
- `vec.v128.interleave_high(a: vec.v128, b: vec.v128) -> vec.v128`
- `vec.m8.interleave_high(a: vec.m8, b: vec.m8) -> vec.m8`
- `vec.m16.interleave_high(a: vec.m16, b: vec.m16) -> vec.m16`
- `vec.m32.interleave_high(a: vec.m32, b: vec.m32) -> vec.m32`
- `vec.m64.interleave_high(a: vec.m64, b: vec.m64) -> vec.m64`
- `vec.m128.interleave_high(a: vec.m128, b: vec.m128) -> vec.m128`

```python
def vec.S.interleave_high(a, b):
    result = vec.S.New()
    for i in range(vec.S.length // 2):
        result[2 * i] = a[i + vec.S.length // 2]
        result[2 * i + 1] = b[i + vec.S.length // 2]
    return result
```
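And for the upper halves (illustrative):

```python
# Illustrative only: zip the upper halves of a and b (ZIP2-like).
def interleave_high(a, b):
    half = len(a) // 2
    out = []
    for x, y in zip(a[half:], b[half:]):
        out += [x, y]
    return out

a = [0, 1, 2, 3, 4, 5, 6, 7]
b = [10, 11, 12, 13, 14, 15, 16, 17]
assert interleave_high(a, b) == [4, 14, 5, 15, 6, 16, 7, 17]
```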
Note:

> - can be implemented with `ZIP2` on Neon/SVE
### Integer arithmetic
Wrapping integer arithmetic discards the high bits of the result.