MT#56471 add SIMD implementation of s16_mix_in

Change-Id: Ia7b47f7b32ca1042f9f32828da476ff5360a1c72
pull/1692/head
Richard Fuchs 2 years ago
parent 7247ef4027
commit 9fc1b547e3

3
daemon/.gitignore vendored

@ -23,3 +23,6 @@ spandsp_logging.h
mvr2s_x64_avx512.S
mvr2s_x64_avx2.S
mix_buffer.c
mix_in_x64_avx2.S
mix_in_x64_avx512bw.S
mix_in_x64_sse2.S

@ -87,7 +87,7 @@ SRCS= main.c kernel.c poller.c helpers.c control_tcp.c call.c control_udp.c red
LIBSRCS= loglib.c auxlib.c rtplib.c str.c socket.c streambuf.c ssllib.c dtmflib.c mix_buffer.c
ifeq ($(with_transcoding),yes)
LIBSRCS+= codeclib.strhash.c resample.c
LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S
LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S mix_in_x64_avx2.S mix_in_x64_avx512bw.S mix_in_x64_sse2.S
endif
OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o)

@ -23,6 +23,20 @@ struct mix_buffer_ssrc_source {
};
#if defined(__x86_64__)
// x86-64 only: assembly implementations of the s16 mix-in routine, one per
// instruction set. Each comment names the .S file that defines the symbol.
// mix_in_x64_sse2.S
mix_in_fn_t s16_mix_in_sse2;
// mix_in_x64_avx2.S
mix_in_fn_t s16_mix_in_avx2;
// mix_in_x64_avx512bw.S
mix_in_fn_t s16_mix_in_avx512;
#endif
static void s16_mix_in_c(void *restrict dst, const void *restrict src, unsigned int samples) {
int16_t *d = dst;
const int16_t *s = src;
@ -39,17 +53,28 @@ static void s16_mix_in_c(void *restrict dst, const void *restrict src, unsigned
}
#ifndef ASAN_BUILD
// Resolver for s16_mix_in: returns the implementation to use on this CPU.
// On x86-64 the CPU flags are probed from widest to narrowest (AVX512BW,
// then AVX2, then SSE2) and the first match wins; otherwise, and on every
// other architecture, the portable C version is returned.
static mix_in_fn_t *resolve_s16_mix_in(void) {
	mix_in_fn_t *chosen = s16_mix_in_c;
#if defined(__x86_64__)
	if (rtpe_has_cpu_flag(RTPE_CPU_FLAG_AVX512BW))
		chosen = s16_mix_in_avx512;
	else if (rtpe_has_cpu_flag(RTPE_CPU_FLAG_AVX2))
		chosen = s16_mix_in_avx2;
	else if (rtpe_has_cpu_flag(RTPE_CPU_FLAG_SSE2))
		chosen = s16_mix_in_sse2;
#endif
	return chosen;
}
// s16_mix_in is an ifunc symbol: its target is whatever resolve_s16_mix_in() returns
static mix_in_fn_t s16_mix_in __attribute__ ((ifunc ("resolve_s16_mix_in")));
#else
// ASAN_BUILD: no ifunc, s16_mix_in is simply an alias for the plain C implementation
#define s16_mix_in s16_mix_in_c
#endif
// Mix buffer implementation descriptor for signed 16-bit samples:
// mix_in is s16_mix_in, sample_size is the size of one int16_t.
const struct mix_buffer_impl impl_s16_c = {
	.mix_in = s16_mix_in,
	.sample_size = sizeof(int16_t),
};
// TODO: SIMD-accelerated implementations
// must be locked already

@ -0,0 +1,41 @@
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#if defined(__x86_64__)
.global s16_mix_in_avx2
.text
# Mix src into dst in place: dst[i] = signed-saturating dst[i] + src[i], 16-bit samples.
# Arguments (System V x86-64): %rdi = dst, %rsi = src, %edx = samples (32-bit unsigned).
# 16 bits in 256 bits = 16 samples at a time
s16_mix_in_avx2:
	mov %edx, %edx			# zero-extend the count: upper half of %rdx is unspecified for a 32-bit argument
	mov %rdx, %rax
	and $-16, %al			# %rax = count rounded down to a multiple of 16 samples
	xor %ecx, %ecx			# %rcx = current sample index
.Ls16_mix_in_avx2_loop:
	cmp %rax, %rcx
	jae .Ls16_mix_in_avx2_tail_setup	# unsigned compare, the count is unsigned
	vmovdqu (%rdi,%rcx,2), %ymm0	# 16-bit size
	vpaddsw (%rsi,%rcx,2), %ymm0, %ymm1
	vmovdqu %ymm1, (%rdi,%rcx,2)	# 16-bit size
	add $16, %rcx			# 16 samples at a time
	jmp .Ls16_mix_in_avx2_loop
.Ls16_mix_in_avx2_tail_setup:
	vzeroupper			# clear upper YMM halves before the legacy SSE code below and before returning
.Ls16_mix_in_avx2_tail:
	cmp %rdx, %rcx
	jae .Ls16_mix_in_avx2_done
	movzwl (%rsi,%rcx,2), %r8d	# one src sample, zero-extended
	movzwl (%rdi,%rcx,2), %r9d	# one dst sample, zero-extended
	movd %r8d, %xmm0
	movd %r9d, %xmm1
	paddsw %xmm0, %xmm1		# saturating add, only the low 16-bit lane is stored
	movd %xmm1, %r8d
	mov %r8w, (%rdi,%rcx,2)		# 16-bit size
	inc %rcx
	jmp .Ls16_mix_in_avx2_tail
.Ls16_mix_in_avx2_done:
	ret
#endif

@ -0,0 +1,41 @@
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#if defined(__x86_64__)
.global s16_mix_in_avx512
.text
# Mix src into dst in place: dst[i] = signed-saturating dst[i] + src[i], 16-bit samples.
# Arguments (System V x86-64): %rdi = dst, %rsi = src, %edx = samples (32-bit unsigned).
# 16 bits in 512 bits = 32 samples at a time
s16_mix_in_avx512:
	mov %edx, %edx			# zero-extend the count: upper half of %rdx is unspecified for a 32-bit argument
	mov %rdx, %rax
	and $-32, %al			# %rax = count rounded down to a multiple of 32 samples
	xor %ecx, %ecx			# %rcx = current sample index
.Ls16_mix_in_avx512_loop:
	cmp %rax, %rcx
	jae .Ls16_mix_in_avx512_tail_setup	# unsigned compare, the count is unsigned
	vmovdqu16 (%rdi,%rcx,2), %zmm0	# 16-bit size
	vpaddsw (%rsi,%rcx,2), %zmm0, %zmm1
	vmovdqu16 %zmm1, (%rdi,%rcx,2)	# 16-bit size
	add $32, %rcx			# 32 samples at a time
	jmp .Ls16_mix_in_avx512_loop
.Ls16_mix_in_avx512_tail_setup:
	vzeroupper			# clear upper vector state before the legacy SSE code below and before returning
.Ls16_mix_in_avx512_tail:
	cmp %rdx, %rcx
	jae .Ls16_mix_in_avx512_done
	movzwl (%rsi,%rcx,2), %r8d	# one src sample, zero-extended
	movzwl (%rdi,%rcx,2), %r9d	# one dst sample, zero-extended
	movd %r8d, %xmm0
	movd %r9d, %xmm1
	paddsw %xmm0, %xmm1		# saturating add, only the low 16-bit lane is stored
	movd %xmm1, %r8d
	mov %r8w, (%rdi,%rcx,2)		# 16-bit size
	inc %rcx
	jmp .Ls16_mix_in_avx512_tail
.Ls16_mix_in_avx512_done:
	ret
#endif

@ -0,0 +1,42 @@
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#if defined(__x86_64__)
.global s16_mix_in_sse2
.text
# Mix src into dst in place: dst[i] = signed-saturating dst[i] + src[i], 16-bit samples.
# Arguments (System V x86-64): %rdi = dst, %rsi = src, %edx = samples (32-bit unsigned).
# 16 bits in 128 bits = 8 samples at a time
s16_mix_in_sse2:
	mov %edx, %edx			# zero-extend the count: upper half of %rdx is unspecified for a 32-bit argument
	mov %rdx, %rax
	and $-8, %al			# %rax = count rounded down to a multiple of 8 samples
	xor %ecx, %ecx			# %rcx = current sample index
.Ls16_mix_in_sse2_loop:
	cmp %rax, %rcx
	jae .Ls16_mix_in_sse2_tail	# unsigned compare, the count is unsigned
	movdqu (%rdi,%rcx,2), %xmm0	# 16-bit size
	movdqu (%rsi,%rcx,2), %xmm1	# 16-bit size
	paddsw %xmm0, %xmm1
	movdqu %xmm1, (%rdi,%rcx,2)	# 16-bit size
	add $8, %rcx			# 8 samples at a time
	jmp .Ls16_mix_in_sse2_loop
.Ls16_mix_in_sse2_tail:
	cmp %rdx, %rcx
	jae .Ls16_mix_in_sse2_done
	movzwl (%rsi,%rcx,2), %r8d	# one src sample, zero-extended
	movzwl (%rdi,%rcx,2), %r9d	# one dst sample, zero-extended
	movd %r8d, %xmm0
	movd %r9d, %xmm1
	paddsw %xmm0, %xmm1		# saturating add, only the low 16-bit lane is stored
	movd %xmm1, %r8d
	mov %r8w, (%rdi,%rcx,2)		# 16-bit size
	inc %rcx
	jmp .Ls16_mix_in_sse2_tail
.Ls16_mix_in_sse2_done:
	ret
#endif

@ -20,3 +20,6 @@ dtmflib.c
*.8
mvr2s_x64_avx512.S
mvr2s_x64_avx2.S
mix_in_x64_avx2.S
mix_in_x64_avx512bw.S
mix_in_x64_sse2.S

@ -36,7 +36,7 @@ SRCS= epoll.c garbage.c inotify.c main.c metafile.c stream.c recaux.c packet.c
decoder.c output.c mix.c db.c log.c forward.c tag.c poller.c notify.c
LIBSRCS= loglib.c auxlib.c rtplib.c codeclib.strhash.c resample.c str.c socket.c streambuf.c ssllib.c \
dtmflib.c
LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S
LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S mix_in_x64_avx2.S mix_in_x64_avx512bw.S mix_in_x64_sse2.S
OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o)
MDS= rtpengine-recording.ronn

3
t/.gitignore vendored

@ -79,3 +79,6 @@ mvr2s_x64_avx512.S
test-mix-buffer
mix_buffer.c
audio_player.c
mix_in_x64_avx2.S
mix_in_x64_avx512bw.S
mix_in_x64_sse2.S

@ -82,7 +82,7 @@ DAEMONSRCS+= codec.c call.c ice.c kernel.c media_socket.c stun.c bencode.c polle
media_player.c jitter_buffer.c t38.c tcp_listener.c mqtt.c websocket.c cli.c \
audio_player.c
HASHSRCS+= call_interfaces.c control_ng.c sdp.c janus.c
LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S
LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S mix_in_x64_avx2.S mix_in_x64_avx512bw.S mix_in_x64_sse2.S
endif
OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(DAEMONSRCS:.c=.o) $(HASHSRCS:.c=.strhash.o) $(LIBASM:.S=.o)
@ -258,7 +258,9 @@ daemon-tests-audio-player-play-media: daemon-test-deps
test-bitstr: test-bitstr.o
test-mix-buffer: test-mix-buffer.o $(COMMONOBJS) mix_buffer.o ssrc.o rtp.o crypto.o helpers.o
test-mix-buffer: test-mix-buffer.o $(COMMONOBJS) mix_buffer.o ssrc.o rtp.o crypto.o helpers.o \
mix_in_x64_avx2.o mix_in_x64_sse2.o mix_in_x64_avx512bw.o codeclib.strhash.o dtmflib.o \
mvr2s_x64_avx2.o mvr2s_x64_avx512.o resample.o
spandsp_send_fax_pcm: spandsp_send_fax_pcm.o
@ -270,7 +272,7 @@ spandsp_recv_fax_t38: spandsp_recv_fax_t38.o
spandsp_raw_fax_tests: spandsp_send_fax_pcm spandsp_recv_fax_pcm spandsp_send_fax_t38 spandsp_recv_fax_t38
test-amr-decode: test-amr-decode-test.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o
# Prerequisite objects for test-amr-decode; each object is listed once.
test-amr-decode: test-amr-decode-test.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o
test-amr-encode: test-amr-encode-test.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o
@ -286,7 +288,8 @@ test-stats: test-stats.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssr
control_ng.strhash.o graphite.o \
streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o \
websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o
websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o \
mix_in_x64_avx2.o mix_in_x64_sse2.o mix_in_x64_avx512bw.o
test-transcode: test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssrc.o call.o ice.o helpers.o \
kernel.o media_socket.o stun.o bencode.o socket.o poller.o dtls.o recording.o statistics.o \
@ -294,7 +297,8 @@ test-transcode: test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o cod
control_ng.strhash.o \
streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o websocket.o \
cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o
cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o \
mix_in_x64_avx2.o mix_in_x64_sse2.o mix_in_x64_avx512bw.o
test-resample: test-resample.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o mvr2s_x64_avx2.o \
mvr2s_x64_avx512.o

Loading…
Cancel
Save