MT#55447 SIMD implementation for float conversion

Benchmarks vs. the native C implementation:

    C:      123.790894 s
    AVX2:     1.567766 s
    AVX512:   0.897813 s

Change-Id: Ieeeb7ce3bb2d59dbd3a057ce07e0b35c9f9c73e9
3 years ago · adad19fb4f
parent be7e810469
commit adad19fb4f
10 changed files with 205 additions and 10 deletions
--- a/daemon/.gitignore
+++ b/daemon/.gitignore
@ -20,3 +20,5 @@ dtmflib.c
 dtmf_rx_fillin.h
 *-test.c
 spandsp_logging.h
+mvr2s_x64_avx512.S
+mvr2s_x64_avx2.S
--- a/daemon/Makefile
+++ b/daemon/Makefile
@ -86,8 +86,9 @@ SRCS=		main.c kernel.c poller.c aux.c control_tcp.c call.c control_udp.c redis.c
 LIBSRCS=	loglib.c auxlib.c rtplib.c str.c socket.c streambuf.c ssllib.c dtmflib.c
 ifeq ($(with_transcoding),yes)
 LIBSRCS+=	codeclib.strhash.c resample.c
+LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S
 endif
-OBJS=		$(SRCS:.c=.o) $(LIBSRCS:.c=.o)
+OBJS=		$(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o)

 PODS=		rtpengine.pod
 MANS=		$(PODS:.pod=.8)
--- a/lib/codeclib.c
+++ b/lib/codeclib.c
@ -36,6 +36,10 @@ static packetizer_f packetizer_passthrough; // pass frames as they arrive in AVP
 static packetizer_f packetizer_samplestream; // flat stream of samples
 static packetizer_f packetizer_amr;

+static void (*simd_float2int16_array)(float *in, const uint16_t len, int16_t *out);
+
+
+
 static void codeclib_key_value_parse(const str *instr, bool need_value,
 		void (*cb)(str *key, str *value, void *data), void *data);

@ -87,6 +91,14 @@ static void generic_cn_dtx_cleanup(decoder_t *);
 static int generic_cn_dtx(decoder_t *, GQueue *, int);


+#if defined(__x86_64__)
+// mvr2s_x64_avx2.S
+void mvr2s_avx2(float *in, const uint16_t len, int16_t *out);
+
+// mvr2s_x64_avx512.S
+void mvr2s_avx512(float *in, const uint16_t len, int16_t *out);
+#endif
+


 static void *evs_lib_handle;
@ -1149,6 +1161,33 @@ void codeclib_free(void) {
 		dlclose(evs_lib_handle);
 }

+
+/*
+ * Pick a SIMD float -> int16 conversion routine for the running CPU and store
+ * it in simd_float2int16_array. The pointer is left untouched (NULL) on
+ * non-x86-64 builds and when no suitable instruction set is usable.
+ *
+ * The compiler's CPU-feature builtins are used instead of a hand-written
+ * CPUID query: besides reading the feature bits they also verify that the
+ * queried CPUID leaf exists and that the operating system has enabled the
+ * wide register state, so a routine is never selected that would fault with
+ * an illegal-instruction trap.
+ */
+static void arch_init(void) {
+#if defined(__x86_64__)
+	/* harmless if the runtime has already initialised its CPU model data */
+	__builtin_cpu_init();
+
+	/* same preference order as before: AVX512 (F + BW) first, then AVX2 */
+	if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw"))
+		simd_float2int16_array = mvr2s_avx512;
+	else if (__builtin_cpu_supports("avx2"))
+		simd_float2int16_array = mvr2s_avx2;
+#endif
+}
+
+
 void codeclib_init(int print) {
 #if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(58, 9, 100)
 	av_register_all();
@ -1158,6 +1197,8 @@ void codeclib_init(int print) {
 	avformat_network_init();
 	av_log_set_callback(avlog_ilog);

+	arch_init();
+
 	codecs_ht = g_hash_table_new(str_case_hash, str_case_equal);
 	codecs_ht_by_av = g_hash_table_new(g_direct_hash, g_direct_equal);

@ -4224,7 +4265,6 @@ static int evs_decoder_input(decoder_t *dec, const str *data, GQueue *out) {
 				else
 					evs_amr_dec_out(dec->u.evs, tmp);
 				evs_syn_output(tmp, n_samples, (void *) frame->extended_data[0]);
-				// XXX ^ use something SIMD accelerated? ffmpeg?
 			}
 			else {
 				if (!is_amr)
@ -4341,7 +4381,10 @@ static void evs_load_so(const char *path) {
 		evs_dec_out = dlsym(evs_lib_handle, "evs_dec");
 		if (!evs_dec_out)
 			goto err;
-		evs_syn_output = dlsym(evs_lib_handle, "syn_output");
+		if (simd_float2int16_array)
+			evs_syn_output = simd_float2int16_array;
+		else
+			evs_syn_output = dlsym(evs_lib_handle, "syn_output");
 		if (!evs_syn_output)
 			goto err;
 		evs_amr_dec_out = dlsym(evs_lib_handle, "amr_wb_dec");
--- a/lib/common.Makefile
+++ b/lib/common.Makefile
@ -12,7 +12,7 @@ debug:
 BUILD_TEST_ALTS = fix_frame_channel_layout.h dtmf_rx_fillin.h spandsp_logging.h

 clean:
-	rm -f $(OBJS) $(TARGET) $(LIBSRCS) $(DAEMONSRCS) $(MANS) $(ADD_CLEAN) core core.*
+	rm -f $(OBJS) $(TARGET) $(LIBSRCS) $(LIBASM) $(DAEMONSRCS) $(MANS) $(ADD_CLEAN) core core.*
 	rm -f $(BUILD_TEST_ALTS) $(BUILD_TEST_ALTS:.h=-test.c) $(BUILD_TEST_ALTS:.h=-test) *.strhash.c $(HASHSRCS)

 install:
@ -24,6 +24,11 @@ $(LIBSRCS):	$(patsubst %,../lib/%,$(LIBSRCS))
 		echo '#line 1' && \
 		cat ../lib/"$@" ) > "$@"

+$(LIBASM):	$(patsubst %,../lib/%,$(LIBASM))
+		( echo '/******** GENERATED FILE ********/' && \
+		echo '#line 1' && \
+		cat ../lib/"$@" ) > "$@"
+
 $(DAEMONSRCS) $(HASHSRCS):	$(patsubst %,../daemon/%,$(DAEMONSRCS)) $(patsubst %,../daemon/%,$(HASHSRCS))
 		( echo '/******** GENERATED FILE ********/' && \
 		echo '#line 1' && \
--- a/lib/mvr2s_x64_avx2.S
+++ b/lib/mvr2s_x64_avx2.S
@ -0,0 +1,78 @@
+#if defined(__x86_64__)
+
+/* GNU as, AT&T syntax, System V AMD64 ABI. Requires AVX2. */
+
+.section	.note.GNU-stack,"",@progbits
+
+.text
+
+.global mvr2s_avx2
+.type mvr2s_avx2, @function
+
+/*
+ * void mvr2s_avx2(float *in, const uint16_t len, int16_t *out);
+ *
+ * Convert an array of floats to an array of int16 with round-to-nearest
+ * and int16 saturation.
+ *
+ * In:      %rdi = in, %si = len (number of samples), %rdx = out
+ * Out:     nothing
+ * Clobber: %rax, %rcx, %rsi, %ymm0, %ymm1, flags
+ * Stack:   4 bytes of the red zone hold the caller MXCSR (leaf function)
+ *
+ * The MXCSR control bits are callee-saved, so the caller value is saved on
+ * entry and restored before returning.
+ *
+ * NOTE: vcvtps2dq produces 0x80000000 for NaN and for inputs outside the
+ * int32 range, so such inputs are stored as -32768 regardless of sign.
+ */
+mvr2s_avx2:
+	movzwl %si, %esi		# len is 16 bits wide, upper bits of %rsi are not guaranteed
+
+	stmxcsr -4(%rsp)		# save caller MXCSR in the red zone
+	ldmxcsr .Lcsr(%rip)		# round to nearest, all exceptions masked
+
+	mov %esi, %eax
+	and $-8, %eax			# %rax = len rounded down to a multiple of 8
+
+	xor %ecx, %ecx			# %rcx = sample index
+	jmp .Lvec_check
+
+.Lvec_loop:
+	vmovups (%rdi,%rcx,4), %ymm0	# load 8 floats
+
+	/*
+	 * v8_float = {-4, -3.20000005, -1.70000005, -0.5, 0, 38000, -38000, 0}
+	 *    ->
+	 * v8_int32 = {-4, -3, -2, 0, 0, 38000, -38000, 0}
+	 */
+	vcvtps2dq %ymm0, %ymm1
+
+	/*
+	 * Saturating pack, done separately in each 128-bit lane:
+	 * v8_int32 = {-4, -3, -2, 0, 0, 38000, -38000, 0}
+	 *    ->
+	 * v16_int16 = {-4, -3, -2, 0, -4, -3, -2, 0, 0, 32767, -32768, 0, 0, 32767, -32768, 0}
+	 */
+	vpackssdw %ymm1, %ymm1, %ymm0
+
+	/*
+	 * Gather quadwords 0 and 2 into the low 128 bits:
+	 * v16_int16 = {-4, -3, -2, 0, 0, 32767, -32768, 0, -4, -3, -2, 0, -4, -3, -2, 0}
+	 */
+	vpermq $0x08, %ymm0, %ymm1
+
+	vmovdqu %xmm1, (%rdx,%rcx,2)	# store 8 int16
+
+	add $8, %rcx
+.Lvec_check:
+	cmp %rax, %rcx
+	jb .Lvec_loop			# unsigned: index < vectorised length
+
+	jmp .Ltail_check
+
+.Ltail_loop:
+	/* one sample at a time, same convert + saturate sequence as above */
+	vmovss (%rdi,%rcx,4), %xmm0
+	vcvtps2dq %xmm0, %xmm1
+	vpackssdw %xmm1, %xmm1, %xmm0
+	vmovd %xmm0, %eax
+	movw %ax, (%rdx,%rcx,2)
+
+	inc %rcx
+.Ltail_check:
+	cmp %rsi, %rcx
+	jb .Ltail_loop			# unsigned: index < len
+
+	vzeroupper			# avoid AVX to SSE transition penalties in the caller
+	ldmxcsr -4(%rsp)		# restore caller MXCSR
+	ret
+
+.size mvr2s_avx2, . - mvr2s_avx2
+
+.section	.rodata
+
+.balign 4
+.Lcsr:
+	.long 0x1f80			# [ IM DM ZM OM UM PM ], round to nearest
+
+#endif
--- a/lib/mvr2s_x64_avx512.S
+++ b/lib/mvr2s_x64_avx512.S
@ -0,0 +1,59 @@
+#if defined(__x86_64__)
+
+/* GNU as, AT&T syntax, System V AMD64 ABI. Requires AVX512F (and AVX). */
+
+.section	.note.GNU-stack,"",@progbits
+
+.text
+
+.global mvr2s_avx512
+.type mvr2s_avx512, @function
+
+/*
+ * void mvr2s_avx512(float *in, const uint16_t len, int16_t *out);
+ *
+ * Convert an array of floats to an array of int16 with round-to-nearest
+ * and int16 saturation.
+ *
+ * In:      %rdi = in, %si = len (number of samples), %rdx = out
+ * Out:     nothing
+ * Clobber: %rax, %rcx, %rsi, %zmm0, %zmm1, flags
+ * Stack:   4 bytes of the red zone hold the caller MXCSR (leaf function)
+ *
+ * The MXCSR control bits are callee-saved, so the caller value is saved on
+ * entry and restored before returning.
+ *
+ * The single-sample tail uses VEX-encoded instructions only, so no AVX512VL
+ * support is needed.
+ *
+ * NOTE: vcvtps2dq produces 0x80000000 for NaN and for inputs outside the
+ * int32 range, so such inputs are stored as -32768 regardless of sign.
+ */
+mvr2s_avx512:
+	movzwl %si, %esi		# len is 16 bits wide, upper bits of %rsi are not guaranteed
+
+	stmxcsr -4(%rsp)		# save caller MXCSR in the red zone
+	ldmxcsr .Lcsr(%rip)		# round to nearest, all exceptions masked
+
+	mov %esi, %eax
+	and $-16, %eax			# %rax = len rounded down to a multiple of 16
+
+	xor %ecx, %ecx			# %rcx = sample index
+	jmp .Lvec_check
+
+.Lvec_loop:
+	vmovups (%rdi,%rcx,4), %zmm0	# load 16 floats
+
+	/*
+	 * v16_float = {-2, -2.20000005, -1.70000005, -1.5, 0, 0, 2, 2.20000005, 1.70000005, 1.5, -19187.207, 15405.2158, -4437.91748, -18747.3066, -3701.35034, -19959.6738}
+	 *    ->
+	 * v16_int32 = {-2, -2, -2, -2, 0, 0, 2, 2, 2, 2, -19187, 15405, -4438, -18747, -3701, -19960}
+	 */
+	vcvtps2dq %zmm0, %zmm1
+
+	/*
+	 * Saturating narrow from 32 to 16 bits:
+	 * v16_int32 = {-2, -2, -2, -2, 0, 0, 2, 2, 2, 2, -19187, 15405, -4438, -18747, -3701, -19960}
+	 *    ->
+	 * v16_int16 = {-2, -2, -2, -2, 0, 0, 2, 2, 2, 2, -19187, 15405, -4438, -18747, -3701, -19960}
+	 */
+	vpmovsdw %zmm1, %ymm0
+
+	vmovdqu %ymm0, (%rdx,%rcx,2)	# store 16 int16
+
+	add $16, %rcx
+.Lvec_check:
+	cmp %rax, %rcx
+	jb .Lvec_loop			# unsigned: index < vectorised length
+
+	jmp .Ltail_check
+
+.Ltail_loop:
+	/* one sample at a time: convert, saturate to int16, store */
+	vmovss (%rdi,%rcx,4), %xmm0
+	vcvtps2dq %xmm0, %xmm1
+	vpackssdw %xmm1, %xmm1, %xmm0
+	vmovd %xmm0, %eax
+	movw %ax, (%rdx,%rcx,2)
+
+	inc %rcx
+.Ltail_check:
+	cmp %rsi, %rcx
+	jb .Ltail_loop			# unsigned: index < len
+
+	vzeroupper			# avoid AVX to SSE transition penalties in the caller
+	ldmxcsr -4(%rsp)		# restore caller MXCSR
+	ret
+
+.size mvr2s_avx512, . - mvr2s_avx512
+
+.section	.rodata
+
+.balign 4
+.Lcsr:
+	.long 0x1f80			# [ IM DM ZM OM UM PM ], round to nearest
+
+#endif
--- a/recording-daemon/.gitignore
+++ b/recording-daemon/.gitignore
@ -18,3 +18,5 @@ dtmflib.c
 *-test
 *-test.c
 *.8
+mvr2s_x64_avx512.S
+mvr2s_x64_avx2.S
--- a/recording-daemon/Makefile
+++ b/recording-daemon/Makefile
@ -35,7 +35,8 @@ SRCS=		epoll.c garbage.c inotify.c main.c metafile.c stream.c recaux.c packet.c
 		decoder.c output.c mix.c db.c log.c forward.c tag.c poller.c notify.c
 LIBSRCS=	loglib.c auxlib.c rtplib.c codeclib.strhash.c resample.c str.c socket.c streambuf.c ssllib.c \
 		dtmflib.c
-OBJS=		$(SRCS:.c=.o) $(LIBSRCS:.c=.o)
+LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S
+OBJS=		$(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o)

 PODS=		rtpengine-recording.pod
 MANS=		$(PODS:.pod=.8)
--- a/t/.gitignore
+++ b/t/.gitignore
@ -74,3 +74,5 @@ websocket.c
 test-stats
 ssllib.c
 time-fudge-preload.so
+mvr2s_x64_avx2.S
+mvr2s_x64_avx512.S
--- a/t/Makefile
+++ b/t/Makefile
@ -80,9 +80,10 @@ DAEMONSRCS+=	codec.c call.c ice.c kernel.c media_socket.c stun.c bencode.c polle
 		cookie_cache.c udp_listener.c homer.c load.c cdr.c dtmf.c timerthread.c \
 		media_player.c jitter_buffer.c t38.c tcp_listener.c mqtt.c websocket.c cli.c
 HASHSRCS+=	call_interfaces.c control_ng.c sdp.c janus.c
+LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S
 endif

-OBJS=		$(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(DAEMONSRCS:.c=.o) $(HASHSRCS:.c=.strhash.o)
+OBJS=		$(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(DAEMONSRCS:.c=.o) $(HASHSRCS:.c=.strhash.o) $(LIBASM:.S=.o)

 COMMONOBJS=	str.o auxlib.o rtplib.o loglib.o ssllib.o

@ -264,7 +265,7 @@ test-stats:	test-stats.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssr
 	control_ng.strhash.o graphite.o \
 	streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
 	media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o \
-	websocket.o cli.o
+	websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o

 test-transcode:	test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssrc.o call.o ice.o aux.o \
 	kernel.o media_socket.o stun.o bencode.o socket.o poller.o dtls.o recording.o statistics.o \
@ -272,12 +273,13 @@ test-transcode:	test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o cod
 	control_ng.strhash.o \
 	streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
 	media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o websocket.o \
-	cli.o
+	cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o

-test-resample:	test-resample.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o
+test-resample:	test-resample.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o mvr2s_x64_avx2.o \
+	mvr2s_x64_avx512.o

 test-payload-tracker: test-payload-tracker.o $(COMMONOBJS) ssrc.o aux.o auxlib.o rtp.o crypto.o codeclib.strhash.o \
-	resample.o dtmflib.o
+	resample.o dtmflib.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o

 test-kernel-module: test-kernel-module.o $(COMMONOBJS) kernel.o