MT#55447 SIMD implementation for float conversion

Benchmarks vs the native C implementation:

     C: 123.790894 s
  AVX2: 1.567766 s
AVX512: 0.897813 s

Change-Id: Ieeeb7ce3bb2d59dbd3a057ce07e0b35c9f9c73e9
pull/1623/head
Richard Fuchs 3 years ago
parent be7e810469
commit adad19fb4f

2
daemon/.gitignore vendored

@ -20,3 +20,5 @@ dtmflib.c
dtmf_rx_fillin.h
*-test.c
spandsp_logging.h
mvr2s_x64_avx512.S
mvr2s_x64_avx2.S

@ -86,8 +86,9 @@ SRCS= main.c kernel.c poller.c aux.c control_tcp.c call.c control_udp.c redis.c
LIBSRCS= loglib.c auxlib.c rtplib.c str.c socket.c streambuf.c ssllib.c dtmflib.c
ifeq ($(with_transcoding),yes)
LIBSRCS+= codeclib.strhash.c resample.c
LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S
endif
OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o)
OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o)
PODS= rtpengine.pod
MANS= $(PODS:.pod=.8)

@ -36,6 +36,10 @@ static packetizer_f packetizer_passthrough; // pass frames as they arrive in AVP
static packetizer_f packetizer_samplestream; // flat stream of samples
static packetizer_f packetizer_amr;
// Optional SIMD routine converting `len` floats at `in` to int16 samples at `out`.
// Stays NULL unless arch_init() selects an implementation for the running CPU.
static void (*simd_float2int16_array)(float *in, const uint16_t len, int16_t *out);
static void codeclib_key_value_parse(const str *instr, bool need_value,
void (*cb)(str *key, str *value, void *data), void *data);
@ -87,6 +91,14 @@ static void generic_cn_dtx_cleanup(decoder_t *);
static int generic_cn_dtx(decoder_t *, GQueue *, int);
// Hand-written x86-64 assembly routines; only built and declared on x86-64.
// Both convert a float array to an int16 array with rounding and int16 saturation.
#if defined(__x86_64__)
// mvr2s_x64_avx2.S
// AVX2 variant, processes 8 samples per loop iteration plus a scalar remainder.
void mvr2s_avx2(float *in, const uint16_t len, int16_t *out);
// mvr2s_x64_avx512.S
// AVX-512 variant, processes 16 samples per loop iteration plus a scalar remainder.
void mvr2s_avx512(float *in, const uint16_t len, int16_t *out);
#endif
static void *evs_lib_handle;
@ -1149,6 +1161,33 @@ void codeclib_free(void) {
dlclose(evs_lib_handle);
}
#if defined(__x86_64__)
// Executes CPUID for the given leaf/sub-leaf and stores the four result registers.
static void arch_cpuid(uint32_t leaf, uint32_t subleaf, uint32_t *eax, uint32_t *ebx,
		uint32_t *ecx, uint32_t *edx)
{
	__asm__ volatile (
		"cpuid"
		: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
		: "a" (leaf), "c" (subleaf)
	);
}

// Reads extended control register 0 (XCR0), which tells which register state
// the operating system has enabled. Must only be called if CPUID reports OSXSAVE.
static uint64_t arch_xgetbv0(void) {
	uint32_t lo, hi;
	__asm__ volatile (
		"xgetbv"
		: "=a" (lo), "=d" (hi)
		: "c" (0)
	);
	return ((uint64_t) hi << 32) | lo;
}
#endif

// Detects CPU and OS support for the SIMD float->int16 conversion routines and
// points simd_float2int16_array at the best usable one. If nothing suitable is
// found (or on non-x86-64 builds) the pointer is left untouched.
static void arch_init(void) {
#if defined(__x86_64__)
	// CPUID.1:ECX
	const uint32_t cpu_osxsave = 1U << 27;
	const uint32_t cpu_avx = 1U << 28;
	// CPUID.(7,0):EBX
	const uint32_t cpu_avx2 = 1U << 5;
	const uint32_t cpu_avx512f = 1U << 16;
	const uint32_t cpu_avx512bw = 1U << 30;
	const uint32_t cpu_avx512vl = 1U << 31;
	// XCR0: SSE + AVX state, and additionally opmask + ZMM_Hi256 + Hi16_ZMM state
	const uint64_t xcr0_ymm = 0x06;
	const uint64_t xcr0_zmm = 0xe6;

	uint32_t eax, ebx, ecx, edx;

	// leaf 7 must be within the supported range, otherwise its output is meaningless
	arch_cpuid(0, 0, &eax, &ebx, &ecx, &edx);
	if (eax < 7)
		return;

	// the CPU advertising AVX is not enough: the OS must save/restore the wide registers
	arch_cpuid(1, 0, &eax, &ebx, &ecx, &edx);
	if (!(ecx & cpu_osxsave) || !(ecx & cpu_avx))
		return;

	uint64_t xcr0 = arch_xgetbv0();
	bool os_ymm = (xcr0 & xcr0_ymm) == xcr0_ymm;
	bool os_zmm = (xcr0 & xcr0_zmm) == xcr0_zmm;
	if (!os_ymm)
		return;

	arch_cpuid(7, 0, &eax, &ebx, &ecx, &edx);
	bool has_avx2 = !!(ebx & cpu_avx2);
	bool has_avx512f = !!(ebx & cpu_avx512f);
	bool has_avx512bw = !!(ebx & cpu_avx512bw);
	// also insist on AVX512VL so that 128/256-bit EVEX encodings are safe to execute
	bool has_avx512vl = !!(ebx & cpu_avx512vl);

	if (os_zmm && has_avx512f && has_avx512bw && has_avx512vl)
		simd_float2int16_array = mvr2s_avx512;
	else if (has_avx2)
		simd_float2int16_array = mvr2s_avx2;
#endif
}
void codeclib_init(int print) {
#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(58, 9, 100)
av_register_all();
@ -1158,6 +1197,8 @@ void codeclib_init(int print) {
avformat_network_init();
av_log_set_callback(avlog_ilog);
arch_init();
codecs_ht = g_hash_table_new(str_case_hash, str_case_equal);
codecs_ht_by_av = g_hash_table_new(g_direct_hash, g_direct_equal);
@ -4224,7 +4265,6 @@ static int evs_decoder_input(decoder_t *dec, const str *data, GQueue *out) {
else
evs_amr_dec_out(dec->u.evs, tmp);
evs_syn_output(tmp, n_samples, (void *) frame->extended_data[0]);
// XXX ^ use something SIMD accelerated? ffmpeg?
}
else {
if (!is_amr)
@ -4341,7 +4381,10 @@ static void evs_load_so(const char *path) {
evs_dec_out = dlsym(evs_lib_handle, "evs_dec");
if (!evs_dec_out)
goto err;
evs_syn_output = dlsym(evs_lib_handle, "syn_output");
if (simd_float2int16_array)
evs_syn_output = simd_float2int16_array;
else
evs_syn_output = dlsym(evs_lib_handle, "syn_output");
if (!evs_syn_output)
goto err;
evs_amr_dec_out = dlsym(evs_lib_handle, "amr_wb_dec");

@ -12,7 +12,7 @@ debug:
BUILD_TEST_ALTS = fix_frame_channel_layout.h dtmf_rx_fillin.h spandsp_logging.h
clean:
rm -f $(OBJS) $(TARGET) $(LIBSRCS) $(DAEMONSRCS) $(MANS) $(ADD_CLEAN) core core.*
rm -f $(OBJS) $(TARGET) $(LIBSRCS) $(LIBASM) $(DAEMONSRCS) $(MANS) $(ADD_CLEAN) core core.*
rm -f $(BUILD_TEST_ALTS) $(BUILD_TEST_ALTS:.h=-test.c) $(BUILD_TEST_ALTS:.h=-test) *.strhash.c $(HASHSRCS)
install:
@ -24,6 +24,11 @@ $(LIBSRCS): $(patsubst %,../lib/%,$(LIBSRCS))
echo '#line 1' && \
cat ../lib/"$@" ) > "$@"
$(LIBASM): $(patsubst %,../lib/%,$(LIBASM))
( echo '/******** GENERATED FILE ********/' && \
echo '#line 1' && \
cat ../lib/"$@" ) > "$@"
$(DAEMONSRCS) $(HASHSRCS): $(patsubst %,../daemon/%,$(DAEMONSRCS)) $(patsubst %,../daemon/%,$(HASHSRCS))
( echo '/******** GENERATED FILE ********/' && \
echo '#line 1' && \

@ -0,0 +1,78 @@
#if defined(__x86_64__)

.global mvr2s_avx2
.type mvr2s_avx2, @function

.section .note.GNU-stack,"",@progbits

.text

# void mvr2s_avx2(float *in, const uint16_t len, int16_t *out);
# convert float array to int16 array with rounding and int16 saturation
#   rdi = in, si = len (number of samples), rdx = out
#   clobbers rax, rcx, rsi, ymm0, ymm1, ymm3
mvr2s_avx2:
	movzwl	%si, %esi			# len is 16 bits wide, upper register bits are unspecified
	stmxcsr	-4(%rsp)			# MXCSR control bits are callee-saved: keep a copy in the red zone
	ldmxcsr	.Lcsr(%rip)			# set "round to nearest"
	vmovdqu	.Lmask(%rip), %ymm3		# mask for vpermd
	mov	%esi, %eax
	and	$-8, %eax			# 8 samples at a time
	xor	%ecx, %ecx

.Lloop:
	cmp	%rax, %rcx
	jae	.Lremainder

	vmovups	(%rdi,%rcx,4), %ymm0		# load, 32-bit size

	# v8_float = {-4, -3.20000005, -1.70000005, -0.5, 0, 38000, -38000, 0},
	# ->
	# v8_int32 = {-4, -3, -2, 0, 0, 38000, -38000, 0},
	vcvtps2dq	%ymm0, %ymm1

	# v8_int32 = {-4, -3, -2, 0, 0, 38000, -38000, 0},
	# ->
	# v16_int16 = {-4, -3, -2, 0, -4, -3, -2, 0, 0, 32767, -32768, 0, 0, 32767, -32768, 0},
	vpackssdw	%ymm1, %ymm1, %ymm0

	# v16_int16 = {-4, -3, -2, 0, -4, -3, -2, 0, 0, 32767, -32768, 0, 0, 32767, -32768, 0},
	# ->
	# v16_int16 = {-4, -3, -2, 0, 0, 32767, -32768, 0, -4, -3, -4, -3, -4, -3, -4, -3},
	vpermd	%ymm0, %ymm3, %ymm1

	# v8_int16 = {-4, -3, -2, 0, 0, 32767, -32768, 0},
	vmovdqu	%xmm1, (%rdx,%rcx,2)		# store, 16-bit size

	add	$8, %rcx			# 8 samples at a time
	jmp	.Lloop

.Lremainder:
	cmp	%rsi, %rcx
	jae	.Ldone

	# one sample at a time, VEX encoded to avoid SSE/AVX transition stalls
	vmovss	(%rdi,%rcx,4), %xmm0
	vcvtps2dq	%xmm0, %xmm1
	vpackssdw	%xmm1, %xmm1, %xmm0
	vmovd	%xmm0, %eax
	mov	%ax, (%rdx,%rcx,2)

	inc	%rcx
	jmp	.Lremainder

.Ldone:
	ldmxcsr	-4(%rsp)			# restore the MXCSR saved on entry
	vzeroupper				# leave the upper YMM halves clean for SSE code in the caller
	ret

.size mvr2s_avx2, . - mvr2s_avx2

.section .rodata
.balign 32

# dword indices for vpermd: gather the two packed 64-bit halves into the low 128 bits
.Lmask:
	.long	0, 1, 4, 5, 0, 0, 0, 0

.Lcsr:
	.long	0x1f80				# [ IM DM ZM OM UM PM ]

#endif

@ -0,0 +1,59 @@
#if defined(__x86_64__)

.global mvr2s_avx512
.type mvr2s_avx512, @function

.section .note.GNU-stack,"",@progbits

.text

# void mvr2s_avx512(float *in, const uint16_t len, int16_t *out);
# convert float array to int16 array with rounding and int16 saturation
#   rdi = in, si = len (number of samples), rdx = out
#   clobbers rax, rcx, rsi, zmm0, zmm1
mvr2s_avx512:
	movzwl	%si, %esi			# len is 16 bits wide, upper register bits are unspecified
	stmxcsr	-4(%rsp)			# MXCSR control bits are callee-saved: keep a copy in the red zone
	ldmxcsr	.Lcsr(%rip)			# set "round to nearest"
	mov	%esi, %eax
	and	$-16, %eax			# 16 samples at a time
	xor	%ecx, %ecx

.Lloop:
	cmp	%rax, %rcx
	jae	.Lremainder

	vmovups	(%rdi,%rcx,4), %zmm0		# load, 32-bit size

	# v16_float = {-2, -2.20000005, -1.70000005, -1.5, 0, 0, 2, 2.20000005, 1.70000005, 1.5, -19187.207, 15405.2158, -4437.91748, -18747.3066, -3701.35034, -19959.6738},
	# ->
	# v16_int32 = {-2, -2, -2, -2, 0, 0, 2, 2, 2, 2, -19187, 15405, -4438, -18747, -3701, -19960},
	vcvtps2dq	%zmm0, %zmm1

	# v16_int32 = {-2, -2, -2, -2, 0, 0, 2, 2, 2, 2, -19187, 15405, -4438, -18747, -3701, -19960},
	# ->
	# v16_int16 = {-2, -2, -2, -2, 0, 0, 2, 2, 2, 2, -19187, 15405, -4438, -18747, -3701, -19960},
	vpmovsdw	%zmm1, %ymm0

	vmovdqu	%ymm0, (%rdx,%rcx,2)		# store, 16-bit size

	add	$16, %rcx			# 16 samples at a time
	jmp	.Lloop

.Lremainder:
	cmp	%rsi, %rcx
	jae	.Ldone

	# one sample at a time; the ymm -> xmm form of vpmovsdw needs AVX512VL
	vmovss	(%rdi,%rcx,4), %xmm0
	vcvtps2dq	%ymm0, %ymm1
	vpmovsdw	%ymm1, %xmm0
	vpextrw	$0, %xmm0, (%rdx,%rcx,2)

	inc	%rcx
	jmp	.Lremainder

.Ldone:
	ldmxcsr	-4(%rsp)			# restore the MXCSR saved on entry
	vzeroupper				# leave the upper vector halves clean for SSE code in the caller
	ret

.size mvr2s_avx512, . - mvr2s_avx512

.section .rodata
.balign 4

.Lcsr:
	.long	0x1f80				# [ IM DM ZM OM UM PM ]

#endif

@ -18,3 +18,5 @@ dtmflib.c
*-test
*-test.c
*.8
mvr2s_x64_avx512.S
mvr2s_x64_avx2.S

@ -35,7 +35,8 @@ SRCS= epoll.c garbage.c inotify.c main.c metafile.c stream.c recaux.c packet.c
decoder.c output.c mix.c db.c log.c forward.c tag.c poller.c notify.c
LIBSRCS= loglib.c auxlib.c rtplib.c codeclib.strhash.c resample.c str.c socket.c streambuf.c ssllib.c \
dtmflib.c
OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o)
LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S
OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o)
PODS= rtpengine-recording.pod
MANS= $(PODS:.pod=.8)

2
t/.gitignore vendored

@ -74,3 +74,5 @@ websocket.c
test-stats
ssllib.c
time-fudge-preload.so
mvr2s_x64_avx2.S
mvr2s_x64_avx512.S

@ -80,9 +80,10 @@ DAEMONSRCS+= codec.c call.c ice.c kernel.c media_socket.c stun.c bencode.c polle
cookie_cache.c udp_listener.c homer.c load.c cdr.c dtmf.c timerthread.c \
media_player.c jitter_buffer.c t38.c tcp_listener.c mqtt.c websocket.c cli.c
HASHSRCS+= call_interfaces.c control_ng.c sdp.c janus.c
LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S
endif
OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(DAEMONSRCS:.c=.o) $(HASHSRCS:.c=.strhash.o)
OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(DAEMONSRCS:.c=.o) $(HASHSRCS:.c=.strhash.o) $(LIBASM:.S=.o)
COMMONOBJS= str.o auxlib.o rtplib.o loglib.o ssllib.o
@ -264,7 +265,7 @@ test-stats: test-stats.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssr
control_ng.strhash.o graphite.o \
streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o \
websocket.o cli.o
websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o
test-transcode: test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssrc.o call.o ice.o aux.o \
kernel.o media_socket.o stun.o bencode.o socket.o poller.o dtls.o recording.o statistics.o \
@ -272,12 +273,13 @@ test-transcode: test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o cod
control_ng.strhash.o \
streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o websocket.o \
cli.o
cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o
test-resample: test-resample.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o
test-resample: test-resample.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o mvr2s_x64_avx2.o \
mvr2s_x64_avx512.o
test-payload-tracker: test-payload-tracker.o $(COMMONOBJS) ssrc.o aux.o auxlib.o rtp.o crypto.o codeclib.strhash.o \
resample.o dtmflib.o
resample.o dtmflib.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o
test-kernel-module: test-kernel-module.o $(COMMONOBJS) kernel.o

Loading…
Cancel
Save