MT#56471 add SIMD implementation of s16_mix_in

Change-Id: Ia7b47f7b32ca1042f9f32828da476ff5360a1c72
2 years ago · 9fc1b547e3
parent 7247ef4027
commit 9fc1b547e3
10 changed files with 170 additions and 8 deletions
--- a/daemon/.gitignore
+++ b/daemon/.gitignore
@ -23,3 +23,6 @@ spandsp_logging.h
 mvr2s_x64_avx512.S
 mvr2s_x64_avx2.S
 mix_buffer.c
+mix_in_x64_avx2.S
+mix_in_x64_avx512bw.S
+mix_in_x64_sse2.S
--- a/daemon/Makefile
+++ b/daemon/Makefile
@ -87,7 +87,7 @@ SRCS=		main.c kernel.c poller.c helpers.c control_tcp.c call.c control_udp.c red
 LIBSRCS=	loglib.c auxlib.c rtplib.c str.c socket.c streambuf.c ssllib.c dtmflib.c mix_buffer.c
 ifeq ($(with_transcoding),yes)
 LIBSRCS+=	codeclib.strhash.c resample.c
-LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S
+LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S mix_in_x64_avx2.S mix_in_x64_avx512bw.S mix_in_x64_sse2.S
 endif
 OBJS=		$(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o)

--- a/lib/mix_buffer.c
+++ b/lib/mix_buffer.c
@ -23,6 +23,20 @@ struct mix_buffer_ssrc_source {
 };


+
+#if defined(__x86_64__)
+// mix_in_x64_sse2.S
+mix_in_fn_t s16_mix_in_sse2;
+
+// mix_in_x64_avx2.S
+mix_in_fn_t s16_mix_in_avx2;
+
+// mix_in_x64_avx512bw.S
+mix_in_fn_t s16_mix_in_avx512;
+#endif
+
+
+
 static void s16_mix_in_c(void *restrict dst, const void *restrict src, unsigned int samples) {
 	int16_t *d = dst;
 	const int16_t *s = src;
@ -39,17 +53,28 @@ static void s16_mix_in_c(void *restrict dst, const void *restrict src, unsigned
 }


+#ifndef ASAN_BUILD	// non-ASAN builds: s16_mix_in is bound through the ifunc resolver below
 static mix_in_fn_t *resolve_s16_mix_in(void) {
+#if defined(__x86_64__)	// the SIMD variants are only declared for x86-64
+	if (rtpe_has_cpu_flag(RTPE_CPU_FLAG_AVX512BW))	// checked first: 512-bit variant, 32 samples per step
+		return s16_mix_in_avx512;
+	if (rtpe_has_cpu_flag(RTPE_CPU_FLAG_AVX2))	// 256-bit variant, 16 samples per step
+		return s16_mix_in_avx2;
+	if (rtpe_has_cpu_flag(RTPE_CPU_FLAG_SSE2))	// 128-bit variant, 8 samples per step
+		return s16_mix_in_sse2;
+#endif	// __x86_64__
 	return s16_mix_in_c;
 }
 static mix_in_fn_t s16_mix_in __attribute__ ((ifunc ("resolve_s16_mix_in")));
+#else	// ASAN_BUILD: no ifunc, s16_mix_in is a plain macro alias for the C implementation
+#define s16_mix_in s16_mix_in_c
+#endif	// !ASAN_BUILD


 const struct mix_buffer_impl impl_s16_c = {
 	.sample_size = sizeof(int16_t),
 	.mix_in = s16_mix_in,
 };
-// TODO: SIMD-accelerated implementations


 // must be locked already
--- a/lib/mix_in_x64_avx2.S
+++ b/lib/mix_in_x64_avx2.S
@ -0,0 +1,41 @@
+#if defined(__linux__) && defined(__ELF__)
+.section	.note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__x86_64__)
+
+.global s16_mix_in_avx2
+
+.text
+
+# 16 bits in 256 bits = 16 samples at a time
+s16_mix_in_avx2:
+	# arguments: %rdi = dst, %rsi = src, %edx = number of 16-bit samples
+	mov %edx, %edx			# zero-extend: the count is a 32-bit argument, upper half of %rdx is undefined
+	mov %rdx, %rax
+	and $-16, %al			# %rax = count rounded down to a multiple of 16
+	xor %ecx, %ecx			# %rcx = index of the next sample
+.Lavx2_loop:
+	cmp %rax, %rcx
+	jae .Lavx2_tail
+	vmovdqu (%rdi,%rcx,2), %ymm0	# 16-bit size
+	vpaddsw (%rsi,%rcx,2), %ymm0, %ymm1	# signed saturating add of 16 samples
+	vmovdqu %ymm1, (%rdi,%rcx,2)	# 16-bit size
+	add $16, %rcx			# 16 samples at a time
+	jmp .Lavx2_loop
+.Lavx2_tail:
+	vzeroupper			# clear upper ymm state before the legacy SSE code below and the return
+.Lavx2_remainder:
+	cmp %rdx, %rcx
+	jae .Lavx2_done
+	movzwl (%rsi,%rcx,2), %r8d	# 16-bit size
+	movzwl (%rdi,%rcx,2), %r9d	# 16-bit size
+	movd %r8d, %xmm0
+	movd %r9d, %xmm1
+	paddsw %xmm0, %xmm1		# same saturating add, one sample in the low lane
+	movd %xmm1, %r8d
+	mov %r8w, (%rdi,%rcx,2)		# 16-bit size
+	inc %rcx
+	jmp .Lavx2_remainder
+.Lavx2_done:
+	ret
+
+#endif
--- a/lib/mix_in_x64_avx512bw.S
+++ b/lib/mix_in_x64_avx512bw.S
@ -0,0 +1,41 @@
+#if defined(__linux__) && defined(__ELF__)
+.section	.note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__x86_64__)
+
+.global s16_mix_in_avx512
+
+.text
+
+# 16 bits in 512 bits = 32 samples at a time
+s16_mix_in_avx512:
+	# arguments: %rdi = dst, %rsi = src, %edx = number of 16-bit samples
+	mov %edx, %edx			# zero-extend: the count is a 32-bit argument, upper half of %rdx is undefined
+	mov %rdx, %rax
+	and $-32, %al			# %rax = count rounded down to a multiple of 32
+	xor %ecx, %ecx			# %rcx = index of the next sample
+.Lavx512_loop:
+	cmp %rax, %rcx
+	jae .Lavx512_tail
+	vmovdqu16 (%rdi,%rcx,2), %zmm0	# 16-bit size
+	vpaddsw (%rsi,%rcx,2), %zmm0, %zmm1	# signed saturating add of 32 samples
+	vmovdqu16 %zmm1, (%rdi,%rcx,2)	# 16-bit size
+	add $32, %rcx			# 32 samples at a time
+	jmp .Lavx512_loop
+.Lavx512_tail:
+	vzeroupper			# clear upper zmm state before the legacy SSE code below and the return
+.Lavx512_remainder:
+	cmp %rdx, %rcx
+	jae .Lavx512_done
+	movzwl (%rsi,%rcx,2), %r8d	# 16-bit size
+	movzwl (%rdi,%rcx,2), %r9d	# 16-bit size
+	movd %r8d, %xmm0
+	movd %r9d, %xmm1
+	paddsw %xmm0, %xmm1		# same saturating add, one sample in the low lane
+	movd %xmm1, %r8d
+	mov %r8w, (%rdi,%rcx,2)		# 16-bit size
+	inc %rcx
+	jmp .Lavx512_remainder
+.Lavx512_done:
+	ret
+
+#endif
--- a/lib/mix_in_x64_sse2.S
+++ b/lib/mix_in_x64_sse2.S
@ -0,0 +1,42 @@
+#if defined(__linux__) && defined(__ELF__)
+.section	.note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__x86_64__)
+
+.global s16_mix_in_sse2
+
+.text
+
+# 16 bits in 128 bits = 8 samples at a time
+s16_mix_in_sse2:
+	# arguments: %rdi = dst, %rsi = src, %edx = number of 16-bit samples
+	mov %edx, %edx			# zero-extend: the count is a 32-bit argument, upper half of %rdx is undefined
+	mov %rdx, %rax
+	and $-8, %al			# %rax = count rounded down to a multiple of 8
+	xor %ecx, %ecx			# %rcx = index of the next sample
+.Lsse2_loop:
+	cmp %rax, %rcx
+	jae .Lsse2_remainder
+	movdqu (%rdi,%rcx,2), %xmm0	# 16-bit size
+	movdqu (%rsi,%rcx,2), %xmm1	# 16-bit size
+	paddsw %xmm0, %xmm1		# signed saturating add of 8 samples
+	movdqu %xmm1, (%rdi,%rcx,2)	# 16-bit size
+	add $8, %rcx			# 8 samples at a time
+	jmp .Lsse2_loop
+.Lsse2_remainder:
+	cmp %rdx, %rcx
+	jae .Lsse2_done
+	movzwl (%rsi,%rcx,2), %r8d	# 16-bit size
+	movzwl (%rdi,%rcx,2), %r9d	# 16-bit size
+	movd %r8d, %xmm0
+	movd %r9d, %xmm1
+	paddsw %xmm0, %xmm1		# same saturating add, one sample in the low lane
+	movd %xmm1, %r8d
+	mov %r8w, (%rdi,%rcx,2)		# 16-bit size
+	inc %rcx
+	jmp .Lsse2_remainder
+.Lsse2_done:
+	ret
+
+#endif
--- a/recording-daemon/.gitignore
+++ b/recording-daemon/.gitignore
@ -20,3 +20,6 @@ dtmflib.c
 *.8
 mvr2s_x64_avx512.S
 mvr2s_x64_avx2.S
+mix_in_x64_avx2.S
+mix_in_x64_avx512bw.S
+mix_in_x64_sse2.S
--- a/recording-daemon/Makefile
+++ b/recording-daemon/Makefile
@ -36,7 +36,7 @@ SRCS=		epoll.c garbage.c inotify.c main.c metafile.c stream.c recaux.c packet.c
 		decoder.c output.c mix.c db.c log.c forward.c tag.c poller.c notify.c
 LIBSRCS=	loglib.c auxlib.c rtplib.c codeclib.strhash.c resample.c str.c socket.c streambuf.c ssllib.c \
 		dtmflib.c
-LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S
+LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S mix_in_x64_avx2.S mix_in_x64_avx512bw.S mix_in_x64_sse2.S
 OBJS=		$(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o)

 MDS=		rtpengine-recording.ronn
--- a/t/.gitignore
+++ b/t/.gitignore
@ -79,3 +79,6 @@ mvr2s_x64_avx512.S
 test-mix-buffer
 mix_buffer.c
 audio_player.c
+mix_in_x64_avx2.S
+mix_in_x64_avx512bw.S
+mix_in_x64_sse2.S
--- a/t/Makefile
+++ b/t/Makefile
@ -82,7 +82,7 @@ DAEMONSRCS+=	codec.c call.c ice.c kernel.c media_socket.c stun.c bencode.c polle
 		media_player.c jitter_buffer.c t38.c tcp_listener.c mqtt.c websocket.c cli.c \
 		audio_player.c
 HASHSRCS+=	call_interfaces.c control_ng.c sdp.c janus.c
-LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S
+LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S mix_in_x64_avx2.S mix_in_x64_avx512bw.S mix_in_x64_sse2.S
 endif

 OBJS=		$(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(DAEMONSRCS:.c=.o) $(HASHSRCS:.c=.strhash.o) $(LIBASM:.S=.o)
@ -258,7 +258,9 @@ daemon-tests-audio-player-play-media:	daemon-test-deps

 test-bitstr:	test-bitstr.o

-test-mix-buffer:	test-mix-buffer.o $(COMMONOBJS) mix_buffer.o ssrc.o rtp.o crypto.o helpers.o
+test-mix-buffer:	test-mix-buffer.o $(COMMONOBJS) mix_buffer.o ssrc.o rtp.o crypto.o helpers.o \
+	mix_in_x64_avx2.o mix_in_x64_sse2.o mix_in_x64_avx512bw.o codeclib.strhash.o dtmflib.o \
+	mvr2s_x64_avx2.o mvr2s_x64_avx512.o resample.o

 spandsp_send_fax_pcm:	spandsp_send_fax_pcm.o

@ -270,7 +272,7 @@ spandsp_recv_fax_t38:	spandsp_recv_fax_t38.o

 spandsp_raw_fax_tests: spandsp_send_fax_pcm spandsp_recv_fax_pcm spandsp_send_fax_t38 spandsp_recv_fax_t38

-test-amr-decode: test-amr-decode-test.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o
+test-amr-decode: test-amr-decode-test.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o resample.o

 test-amr-encode: test-amr-encode-test.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o

@ -286,7 +288,8 @@ test-stats:	test-stats.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssr
 	control_ng.strhash.o graphite.o \
 	streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
 	media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o \
-	websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o
+	websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o \
+	mix_in_x64_avx2.o mix_in_x64_sse2.o mix_in_x64_avx512bw.o

 test-transcode:	test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssrc.o call.o ice.o helpers.o \
 	kernel.o media_socket.o stun.o bencode.o socket.o poller.o dtls.o recording.o statistics.o \
@ -294,7 +297,8 @@ test-transcode:	test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o cod
 	control_ng.strhash.o \
 	streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
 	media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o websocket.o \
-	cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o
+	cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o \
+	mix_in_x64_avx2.o mix_in_x64_sse2.o mix_in_x64_avx512bw.o

 test-resample:	test-resample.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o mvr2s_x64_avx2.o \
 	mvr2s_x64_avx512.o