Implement Configurable TCP Keepalive Settings in PJSIP Transports

This commit introduces configurable TCP keepalive settings for both TCP and TLS transports. The changes allow finer control over TCP connection keepalives, improving stability and reliability in environments prone to connection timeouts or where intermediate devices may prematurely close idle connections. These keepalive and timeout mechanisms have proven necessary, and have already been tested in production, in several specialized environments where the underlying transport is unreliable in ways that are not directly visible to the operating system. Fixes #657
1 year ago · c8ab570c6f
parent 2de1a68339
commit c8ab570c6f
6 changed files with 184 additions and 10 deletions
--- a/configs/samples/pjsip.conf.sample
+++ b/configs/samples/pjsip.conf.sample
@ -171,6 +171,32 @@
 ;type=transport
 ;protocol=flow

+; Example IPv4 TCP transport with Keepalive options
+;
+;[transport-tcp]
+;type=transport
+;protocol=tcp
+;bind=0.0.0.0
+;tcp_keepalive_enable=yes        ; Enable TCP keepalive (yes/no)
+;tcp_keepalive_idle_time=30      ; Time in seconds the connection needs to remain idle before TCP starts sending keepalive probes
+;tcp_keepalive_interval_time=10  ; The time in seconds between individual keepalive probes
+;tcp_keepalive_probe_count=5     ; The maximum number of keepalive probes TCP should send before dropping the connection
+
+; Example IPv4 TLS transport with Keepalive options
+;
+;[transport-tls]
+;type=transport
+;protocol=tls
+;bind=0.0.0.0
+;cert_file=/path/to/mycert.crt
+;priv_key_file=/path/to/mykey.key
+;cipher=ADH-AES256-SHA,ADH-AES128-SHA
+;method=tlsv1
+;tcp_keepalive_enable=yes        ; Enable TCP keepalive (yes/no)
+;tcp_keepalive_idle_time=30      ; Time in seconds the connection needs to remain idle before TCP starts sending keepalive probes
+;tcp_keepalive_interval_time=10  ; The time in seconds between individual keepalive probes
+;tcp_keepalive_probe_count=5     ; The maximum number of keepalive probes TCP should send before dropping the connection
+
 ;===============OUTBOUND REGISTRATION WITH OUTBOUND AUTHENTICATION============
 ;
 ; This is a simple registration that works with some SIP trunking providers.
--- a/contrib/ast-db-manage/config/versions/8fce8496f03e_add_tcp_keepalive_settings_to_ps_.py
+++ b/contrib/ast-db-manage/config/versions/8fce8496f03e_add_tcp_keepalive_settings_to_ps_.py
@ -0,0 +1,28 @@
+"""Add TCP keepalive settings to ps_transports
+
+Revision ID: 8fce8496f03e
+Revises: 74dc751dfe8e
+Create Date: 2024-03-18 17:00:17.148018
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '8fce8496f03e'
+down_revision = '74dc751dfe8e'
+
+from alembic import op
+import sqlalchemy as sa
+
+def upgrade():  # Apply this revision: add the TCP keepalive columns to ps_transports.
+    with op.batch_alter_table('ps_transports') as batch_op:  # all four columns are added inside one batch context
+        batch_op.add_column(sa.Column('tcp_keepalive_enable', sa.Boolean(), nullable=True))  # on/off switch; nullable
+        batch_op.add_column(sa.Column('tcp_keepalive_idle_time', sa.Integer(), nullable=True))  # nullable integer
+        batch_op.add_column(sa.Column('tcp_keepalive_interval_time', sa.Integer(), nullable=True))  # nullable integer
+        batch_op.add_column(sa.Column('tcp_keepalive_probe_count', sa.Integer(), nullable=True))  # nullable integer
+
+def downgrade():  # Revert this revision: drop the same four columns that upgrade() adds.
+    with op.batch_alter_table('ps_transports') as batch_op:  # all four drops happen inside one batch context
+        batch_op.drop_column('tcp_keepalive_enable')
+        batch_op.drop_column('tcp_keepalive_idle_time')
+        batch_op.drop_column('tcp_keepalive_interval_time')
+        batch_op.drop_column('tcp_keepalive_probe_count')
--- a/include/asterisk/res_pjsip.h
+++ b/include/asterisk/res_pjsip.h
@ -299,6 +299,14 @@ struct ast_sip_transport {
 	int symmetric_transport;
 	/*! This is a flow to another target */
 	int flow;
+	/*! Enable TCP keepalive */
+	int tcp_keepalive_enable;
+	/*! Time in seconds the connection needs to remain idle before TCP starts sending keepalive probes */
+	int tcp_keepalive_idle_time;
+	/*! The time in seconds between individual keepalive probes */
+	int tcp_keepalive_interval_time;
+	/*! The maximum number of keepalive probes TCP should send before dropping the connection */
+	int tcp_keepalive_probe_count;
 };

 #define SIP_SORCERY_DOMAIN_ALIAS_TYPE "domain_alias"
--- a/res/res_pjsip/config_transport.c
+++ b/res/res_pjsip/config_transport.c
@ -828,17 +828,55 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj)
 	} else if (transport->type == AST_TRANSPORT_TCP) {
 		pjsip_tcp_transport_cfg cfg;
 		static int option = 1;
+		int sockopt_count = 0;

 		pjsip_tcp_transport_cfg_default(&cfg, temp_state->state->host.addr.sa_family);
 		cfg.bind_addr = temp_state->state->host;
 		cfg.async_cnt = transport->async_operations;
 		set_qos(transport, &cfg.qos_params);
+
 		/* sockopt_params.options is copied to each newly connected socket */
-		cfg.sockopt_params.options[0].level = pj_SOL_TCP();
-		cfg.sockopt_params.options[0].optname = pj_TCP_NODELAY();
-		cfg.sockopt_params.options[0].optval = &option;
-		cfg.sockopt_params.options[0].optlen = sizeof(option);
-		cfg.sockopt_params.cnt = 1;
+		cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+		cfg.sockopt_params.options[sockopt_count].optname = pj_TCP_NODELAY();
+		cfg.sockopt_params.options[sockopt_count].optval = &option;
+		cfg.sockopt_params.options[sockopt_count].optlen = sizeof(option);
+		sockopt_count++;
+
+		if (transport->tcp_keepalive_enable) {
+#if defined(PJ_MAX_SOCKOPT_PARAMS) && PJ_MAX_SOCKOPT_PARAMS >= 5
+			ast_log(LOG_DEBUG, "TCP Keepalive enabled for transport '%s'. Idle Time: %d, Interval: %d, Count: %d\n",
+				ast_sorcery_object_get_id(obj), transport->tcp_keepalive_idle_time, transport->tcp_keepalive_interval_time, transport->tcp_keepalive_probe_count);
+
+			cfg.sockopt_params.options[sockopt_count].level = pj_SOL_SOCKET();
+			cfg.sockopt_params.options[sockopt_count].optname = SO_KEEPALIVE;
+			cfg.sockopt_params.options[sockopt_count].optval = &option;
+			cfg.sockopt_params.options[sockopt_count].optlen = sizeof(option);
+			sockopt_count++;
+
+			cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPIDLE;
+			cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_idle_time;
+			cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_idle_time);
+			sockopt_count++;
+
+			cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPINTVL;
+			cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_interval_time;
+			cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_interval_time);
+			sockopt_count++;
+
+			cfg.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			cfg.sockopt_params.options[sockopt_count].optname = TCP_KEEPCNT;
+			cfg.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_probe_count;
+			cfg.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_probe_count);
+			sockopt_count++;
+#else
+			ast_log(LOG_WARNING, "TCP keepalive settings for '%s' not set due to PJSIP built without support for setting all options. Consider using bundled PJSIP.\n",
+				ast_sorcery_object_get_id(obj));
+#endif
+		}
+
+		cfg.sockopt_params.cnt = sockopt_count;

 		for (i = 0; i < BIND_TRIES && res != PJ_SUCCESS; i++) {
 			if (perm_state && perm_state->state && perm_state->state->factory
@ -853,6 +891,7 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj)
 	} else if (transport->type == AST_TRANSPORT_TLS) {
 #if defined(PJ_HAS_SSL_SOCK) && PJ_HAS_SSL_SOCK != 0
 		static int option = 1;
+		int sockopt_count = 0;

 		if (transport->async_operations > 1 && ast_compare_versions(pj_get_version(), "2.5.0") < 0) {
 			ast_log(LOG_ERROR, "Transport: %s: When protocol=tls and pjproject version < 2.5.0, async_operations can't be > 1\n",
@ -864,11 +903,47 @@ static int transport_apply(const struct ast_sorcery *sorcery, void *obj)
 		set_qos(transport, &temp_state->state->tls.qos_params);

 		/* sockopt_params.options is copied to each newly connected socket */
-		temp_state->state->tls.sockopt_params.options[0].level = pj_SOL_TCP();
-		temp_state->state->tls.sockopt_params.options[0].optname = pj_TCP_NODELAY();
-		temp_state->state->tls.sockopt_params.options[0].optval = &option;
-		temp_state->state->tls.sockopt_params.options[0].optlen = sizeof(option);
-		temp_state->state->tls.sockopt_params.cnt = 1;
+		temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+		temp_state->state->tls.sockopt_params.options[sockopt_count].optname = pj_TCP_NODELAY();
+		temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &option;
+		temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(option);
+		sockopt_count++;
+
+		if (transport->tcp_keepalive_enable) {
+#if defined(PJ_MAX_SOCKOPT_PARAMS) && PJ_MAX_SOCKOPT_PARAMS >= 5
+			ast_log(LOG_DEBUG, "TCP Keepalive enabled for transport '%s'. Idle Time: %d, Interval: %d, Count: %d\n",
+				ast_sorcery_object_get_id(obj), transport->tcp_keepalive_idle_time, transport->tcp_keepalive_interval_time, transport->tcp_keepalive_probe_count);
+
+			temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_SOCKET();
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optname = SO_KEEPALIVE;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &option;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(option);
+			sockopt_count++;
+
+			temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPIDLE;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_idle_time;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_idle_time);
+			sockopt_count++;
+
+			temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPINTVL;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_interval_time;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_interval_time);
+			sockopt_count++;
+
+			temp_state->state->tls.sockopt_params.options[sockopt_count].level = pj_SOL_TCP();
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optname = TCP_KEEPCNT;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optval = &transport->tcp_keepalive_probe_count;
+			temp_state->state->tls.sockopt_params.options[sockopt_count].optlen = sizeof(transport->tcp_keepalive_probe_count);
+			sockopt_count++;
+#else
+			ast_log(LOG_WARNING, "TCP keepalive settings for '%s' not set due to PJSIP built without support for setting all options. Consider using bundled PJSIP.\n",
+				ast_sorcery_object_get_id(obj));
+#endif
+		}
+
+		temp_state->state->tls.sockopt_params.cnt = sockopt_count;

 		for (i = 0; i < BIND_TRIES && res != PJ_SUCCESS; i++) {
 			if (perm_state && perm_state->state && perm_state->state->factory
@ -1760,6 +1835,10 @@ int ast_sip_initialize_sorcery_transport(void)
 	ast_sorcery_object_field_register_custom(sorcery, "transport", "require_client_cert", "", transport_tls_bool_handler, require_client_cert_to_str, NULL, 0, 0);
 	ast_sorcery_object_field_register_custom(sorcery, "transport", "allow_wildcard_certs", "", transport_tls_bool_handler, allow_wildcard_certs_to_str, NULL, 0, 0);
 	ast_sorcery_object_field_register_custom(sorcery, "transport", "method", "", transport_tls_method_handler, tls_method_to_str, NULL, 0, 0);
+	/* TCP keepalive options. Defaults (no / 30 / 10 / 5) must stay in sync with pjsip_config.xml and pjsip.conf.sample. */
+	ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_enable", "no", OPT_BOOL_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_enable));
+	ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_idle_time", "30", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_idle_time));
+	ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_interval_time", "10", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_interval_time));
+	ast_sorcery_object_field_register(sorcery, "transport", "tcp_keepalive_probe_count", "5", OPT_INT_T, 0, FLDSET(struct ast_sip_transport, tcp_keepalive_probe_count));
 #if defined(PJ_HAS_SSL_SOCK) && PJ_HAS_SSL_SOCK != 0
 	ast_sorcery_object_field_register_custom(sorcery, "transport", "cipher", "", transport_tls_cipher_handler, transport_tls_cipher_to_str, NULL, 0, 0);
 #endif
--- a/res/res_pjsip/pjsip_config.xml
+++ b/res/res_pjsip/pjsip_config.xml
@ -1798,6 +1798,30 @@
 				<configOption name="require_client_cert" default="false">
 					<synopsis>Require client certificate (TLS ONLY, not WSS)</synopsis>
 				</configOption>
+				<configOption name="tcp_keepalive_enable" default="no">
+					<synopsis>Enable TCP keepalive</synopsis>
+					<description><para>
+						When set to 'yes', TCP keepalive messages are sent to verify that the endpoint is still reachable. This can help detect dead TCP connections in environments where connections may be silently dropped (e.g., NAT timeouts).
+					</para></description>
+				</configOption>
+				<configOption name="tcp_keepalive_idle_time" default="30">
+					<synopsis>Idle time before the first TCP keepalive probe is sent</synopsis>
+					<description><para>
+						Specifies the amount of time in seconds that the connection must be idle before the first TCP keepalive probe is sent. An idle connection is defined as a connection in which no data has been sent or received by the application.
+					</para></description>
+				</configOption>
+				<configOption name="tcp_keepalive_interval_time" default="10">
+					<synopsis>Interval between TCP keepalive probes</synopsis>
+					<description><para>
+						Specifies the interval in seconds between individual TCP keepalive probes, once the first probe is sent. This interval is used for subsequent probes if the peer does not respond to the previous probe.
+					</para></description>
+				</configOption>
+				<configOption name="tcp_keepalive_probe_count" default="5">
+					<synopsis>Maximum number of TCP keepalive probes</synopsis>
+					<description><para>
+						Specifies the maximum number of TCP keepalive probes to send before considering the connection dead and notifying the application. If the peer does not respond after this many probes, the connection is considered broken.
+					</para></description>
+				</configOption>
 				<configOption name="type">
 					<synopsis>Must be of type 'transport'.</synopsis>
 				</configOption>
--- a/third-party/pjproject/patches/config_site.h
+++ b/third-party/pjproject/patches/config_site.h
@ -35,6 +35,15 @@
 #define PJ_IOQUEUE_HAS_SAFE_UNREG 1
 #define PJ_IOQUEUE_MAX_EVENTS_IN_SINGLE_POLL (16)

+/*
+ * Increase the number of socket options available. This adjustment is necessary
+ * to accommodate additional TCP keepalive settings required for optimizing SIP
+ * transport stability, especially in environments prone to connection timeouts.
+ * The default limit is insufficient when configuring all desired keepalive
+ * parameters along with standard socket options.
+ */
+#define PJ_MAX_SOCKOPT_PARAMS 5
+
 #define PJ_SCANNER_USE_BITWISE	0
 #define PJ_OS_HAS_CHECK_STACK	0