mirror of https://github.com/sipwise/rtpengine.git
Benchmarks vs the native C implementation:
C: 123.790894 s
AVX2: 1.567766 s
AVX512: 0.897813 s
Change-Id: Ieeeb7ce3bb2d59dbd3a057ce07e0b35c9f9c73e9
pull/1623/head
parent
be7e810469
commit
adad19fb4f
@ -0,0 +1,78 @@
|
||||
#if defined(__x86_64__)
|
||||
|
||||
.global mvr2s_avx2
|
||||
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
|
||||
.text
|
||||
|
||||
# void mvr2s_avx2(float *in, const uint16_t len, int16_t *out);
#
# Convert an array of floats to int16 with round-to-nearest-even and
# signed 16-bit saturation, 8 samples per iteration plus a scalar tail.
#
# ABI:      System V AMD64, leaf function (4 bytes of red zone are used)
# In:       %rdi = in, %si = len (sample count), %rdx = out
# Out:      nothing
# Clobbers: %rax, %rcx, %rsi, %ymm0, %ymm1, %ymm3, flags
#
# MXCSR is switched to the value stored at csr for the duration of the
# call and put back to the value found on entry before returning.
#
# NOTE: vcvtps2dq produces 0x80000000 for NaN and for values outside the
# int32 range, so such inputs end up as -32768 whatever their sign.
	.type	mvr2s_avx2, @function
mvr2s_avx2:
	movzwl	%si, %esi			# len is 16 bits wide, bits above that are undefined
	stmxcsr	-4(%rsp)			# MXCSR control bits are callee-saved
	ldmxcsr	csr(%rip)			# set "round to nearest"
	vmovdqu	mask(%rip), %ymm3		# dword indices for vpermd

	mov	%esi, %eax
	and	$-8, %eax			# %rax = len rounded down to a multiple of 8
	xor	%ecx, %ecx			# %rcx = index of the next sample

.Lavx2_loop:
	cmp	%rax, %rcx
	jae	.Lavx2_remainder

	vmovups	(%rdi,%rcx,4), %ymm0		# load, 32-bit size

# v8_float = {-4, -3.20000005, -1.70000005, -0.5, 0, 38000, -38000, 0},
# ->
# v8_int32 = {-4, -3, -2, 0, 0, 38000, -38000, 0},
	vcvtps2dq %ymm0, %ymm1

# v8_int32 = {-4, -3, -2, 0, 0, 38000, -38000, 0},
# ->
# v16_int16 = {-4, -3, -2, 0, -4, -3, -2, 0, 0, 32767, -32768, 0, 0, 32767, -32768, 0},
	vpackssdw %ymm1, %ymm1, %ymm0		# packs per 128-bit lane, hence the duplicates

# v16_int16 = {-4, -3, -2, 0, -4, -3, -2, 0, 0, 32767, -32768, 0, 0, 32767, -32768, 0},
# ->
# v16_int16 = {-4, -3, -2, 0, 0, 32767, -32768, 0, -4, -3, -4, -3, -4, -3, -4, -3},
	vpermd	%ymm0, %ymm3, %ymm1		# gather both lane results into the low 128 bits

# v8_int16 = {-4, -3, -2, 0, 0, 32767, -32768, 0},
	vmovdqu	%xmm1, (%rdx,%rcx,2)		# store, 16-bit size

	add	$8, %rcx			# 8 samples at a time
	jmp	.Lavx2_loop

.Lavx2_remainder:
	cmp	%rsi, %rcx
	jae	.Lavx2_done

	vmovss	(%rdi,%rcx,4), %xmm0		# one float, remaining lanes zeroed
	vcvtps2dq %xmm0, %xmm1
	vpackssdw %xmm1, %xmm1, %xmm0
	vpextrw	$0, %xmm0, (%rdx,%rcx,2)	# store the single int16 result

	inc	%rcx
	jmp	.Lavx2_remainder

.Lavx2_done:
	ldmxcsr	-4(%rsp)			# restore the MXCSR seen on entry
	vzeroupper				# leave no dirty upper halves for SSE code in the caller
	ret
	.size	mvr2s_avx2, .-mvr2s_avx2
|
||||
|
||||
# Constants used by mvr2s_avx2. They are only ever read, so they live in
# .rodata instead of writable .data.
	.section .rodata

	.balign 32
# vpermd dword indices: result dwords 0-3 take source dwords 0, 1, 4, 5
# (the packed int16 output of each 128-bit lane); the upper four indices
# are don't-care because only the low 128 bits get stored.
mask:
	.long	0, 1, 4, 5
	.long	0, 0, 0, 0

	.balign 4
# MXCSR image: all six exception mask bits set, rounding control bits
# clear (round to nearest).
csr:
	.long	0x1f80				# [ IM DM ZM OM UM PM ]
|
||||
|
||||
#endif
|
||||
@ -0,0 +1,59 @@
|
||||
#if defined(__x86_64__)
|
||||
|
||||
.global mvr2s_avx512
|
||||
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
|
||||
.text
|
||||
|
||||
# void mvr2s_avx512(float *in, const uint16_t len, int16_t *out);
#
# Convert an array of floats to int16 with round-to-nearest-even and
# signed 16-bit saturation, 16 samples per iteration plus a scalar tail.
#
# ABI:      System V AMD64, leaf function (4 bytes of red zone are used)
# In:       %rdi = in, %si = len (sample count), %rdx = out
# Out:      nothing
# Clobbers: %rax, %rcx, %rsi, %zmm0, %zmm1, flags
#
# MXCSR is switched to the value stored at csr for the duration of the
# call and put back to the value found on entry before returning.
#
# NOTE: vcvtps2dq produces 0x80000000 for NaN and for values outside the
# int32 range, so such inputs end up as -32768 whatever their sign.
	.type	mvr2s_avx512, @function
mvr2s_avx512:
	movzwl	%si, %esi			# len is 16 bits wide, bits above that are undefined
	stmxcsr	-4(%rsp)			# MXCSR control bits are callee-saved
	ldmxcsr	csr(%rip)			# set "round to nearest"

	mov	%esi, %eax
	and	$-16, %eax			# %rax = len rounded down to a multiple of 16
	xor	%ecx, %ecx			# %rcx = index of the next sample

.Lavx512_loop:
	cmp	%rax, %rcx
	jae	.Lavx512_remainder

	vmovups	(%rdi,%rcx,4), %zmm0		# load, 32-bit size

# v16_float = {-2, -2.20000005, -1.70000005, -1.5, 0, 0, 2, 2.20000005, 1.70000005, 1.5, -19187.207, 15405.2158, -4437.91748, -18747.3066, -3701.35034, -19959.6738},
# ->
# v16_int32 = {-2, -2, -2, -2, 0, 0, 2, 2, 2, 2, -19187, 15405, -4438, -18747, -3701, -19960},
	vcvtps2dq %zmm0, %zmm1

# v16_int32 = {-2, -2, -2, -2, 0, 0, 2, 2, 2, 2, -19187, 15405, -4438, -18747, -3701, -19960},
# ->
# v16_int16 = {-2, -2, -2, -2, 0, 0, 2, 2, 2, 2, -19187, 15405, -4438, -18747, -3701, -19960},
	vpmovsdw %zmm1, %ymm0			# narrow with signed saturation

	vmovdqu	%ymm0, (%rdx,%rcx,2)		# store, 16-bit size

	add	$16, %rcx			# 16 samples at a time
	jmp	.Lavx512_loop

.Lavx512_remainder:
	cmp	%rsi, %rcx
	jae	.Lavx512_done

	vmovss	(%rdi,%rcx,4), %xmm0		# one float, remaining lanes zeroed
	vcvtps2dq %ymm0, %ymm1
	vpmovsdw %ymm1, %xmm0			# 256-bit form, needs AVX512VL
	vpextrw	$0, %xmm0, (%rdx,%rcx,2)	# store the single int16 result

	inc	%rcx
	jmp	.Lavx512_remainder

.Lavx512_done:
	ldmxcsr	-4(%rsp)			# restore the MXCSR seen on entry
	vzeroupper				# leave no dirty upper halves for SSE code in the caller
	ret
	.size	mvr2s_avx512, .-mvr2s_avx512
|
||||
|
||||
# Constant used by mvr2s_avx512. It is only ever read, so it lives in
# .rodata instead of writable .data.
	.section .rodata

	.balign 4
# MXCSR image: all six exception mask bits set, rounding control bits
# clear (round to nearest).
csr:
	.long	0x1f80				# [ IM DM ZM OM UM PM ]
|
||||
|
||||
#endif
|
||||
Loading…
Reference in new issue