mgcp-client: Add keepalive feature

The `keepalive` feature in libosmo-mgcp-client allows scheduling periodical
queries on the MGCP layer in order to make sure it is reachable and hence
obtain information on the state of the MGCP link.
This patch only uses it to print the status on the VTY, but it will be used
too in a follow-up commit by the MGW Pool when picking an MGW from the pool:
MGWs whose link is considered to be DOWN are skipped.

The feature consists of:
- A `keepalive request-interval` which will trigger a transmission of an MGCP
  AuditEndpoint command targeting endpoint with name `keepalive request-endpoint`.
  This interval is updated every time any message is transmitted in the MGCP
  link, meaning the MGCP AuditEndpoint message is only triggered if no message
  has been transmitted since `keepalive request-interval` seconds ago.
- A `keepalive timeout` considering the MGW to be non-reachable (link DOWN) if
  no message is received over that amount of time.

The `keepalive` parameters are to be preferrably configured so that
"keepalive request-interval" * 2 < "keepalive timeout".

Example VTY configuration of `keepalive` feature in libosmo-mgcp-client:
----
 mgw 0
  ...
  keepalive request-interval 20 <1>
  keepalive request-endpoint null <2>
  keepalive timeout 50 <3>
----

<1> Transmit an MGCP AuditEndpoint message to the MGW if no message has been
    sent to it over last 10 seconds
<2> The MGCP AuditEndpoint targets the `null` endpoint. This is a special
    endpoint available at OsmoMGW for those purposes, but any available
    endpoint can be configured and used instead.
<3> Consider the MGCP link to be DOWN if no message is received from the
    MGW over the last 50 seconds

NOTE: The `keepalive` feature is disabled by default, and must be explicitly
      configured in order to enable it.

Related: SYS#6481
Change-Id: I3dc74c78548d017f272da863d5282dc5e0020ca3
diff --git a/include/osmocom/mgcp_client/mgcp_client.h b/include/osmocom/mgcp_client/mgcp_client.h
index 46ec210..9a0611a 100644
--- a/include/osmocom/mgcp_client/mgcp_client.h
+++ b/include/osmocom/mgcp_client/mgcp_client.h
@@ -11,7 +11,7 @@
 #define MGCP_CLIENT_LOCAL_PORT_DEFAULT 0
 #define MGCP_CLIENT_REMOTE_ADDR_DEFAULT "127.0.0.1"
 #define MGCP_CLIENT_REMOTE_PORT_DEFAULT 2427
-
+#define MGCP_CLIENT_KEEPALIVE_DEFAULT_ENDP "null"
 #define MGCP_CLIENT_MGW_STR "Configure MGCP connection to Media Gateway\n"
 
 struct msgb;
@@ -36,6 +36,12 @@
 
 	/* human readable name / description */
 	char *description;
+
+	struct {
+		uint32_t timeout_sec;
+		uint32_t req_interval_sec;
+		char req_endpoint_name[MGCP_ENDPOINT_MAXLEN];
+	} keepalive;
 };
 
 typedef unsigned int mgcp_trans_id_t;
diff --git a/include/osmocom/mgcp_client/mgcp_client_internal.h b/include/osmocom/mgcp_client/mgcp_client_internal.h
index 4e97a48..d0ae76b 100644
--- a/include/osmocom/mgcp_client/mgcp_client_internal.h
+++ b/include/osmocom/mgcp_client/mgcp_client_internal.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <osmocom/core/write_queue.h>
+#include <osmocom/core/timer.h>
 
 #define MSGB_CB_MGCP_TRANS_ID 0
 
@@ -16,6 +17,9 @@
 	mgcp_trans_id_t next_trans_id;
 	struct llist_head responses_pending;
 	struct mgcp_client_pool_member *pool_member;
+	struct osmo_timer_list keepalive_tx_timer;
+	struct osmo_timer_list keepalive_rx_timer;
+	bool conn_up;
 };
 
 struct mgcp_response_pending {
diff --git a/src/libosmo-mgcp-client/mgcp_client.c b/src/libosmo-mgcp-client/mgcp_client.c
index f74255c..677a4f3 100644
--- a/src/libosmo-mgcp-client/mgcp_client.c
+++ b/src/libosmo-mgcp-client/mgcp_client.c
@@ -202,6 +202,11 @@
 		.local_port = -1,
 		.remote_addr = NULL,
 		.remote_port = -1,
+		.keepalive = {
+			.timeout_sec = 0, /* disabled */
+			.req_interval_sec = 0, /* disabled */
+			.req_endpoint_name = MGCP_CLIENT_KEEPALIVE_DEFAULT_ENDP,
+		},
 	};
 
 	INIT_LLIST_HEAD(&conf->reset_epnames);
@@ -696,6 +701,14 @@
 	r = talloc_zero(mgcp, struct mgcp_response);
 	OSMO_ASSERT(r);
 
+	/* Re-arm keepalive timer if enabled */
+	if (OSMO_UNLIKELY(mgcp->conn_up == false)) {
+		LOGPMGW(mgcp, LOGL_NOTICE, "MGCP link to MGW now considered UP\n");
+		mgcp->conn_up = true;
+	}
+	if (mgcp->actual.keepalive.timeout_sec > 0)
+		osmo_timer_schedule(&mgcp->keepalive_rx_timer, mgcp->actual.keepalive.timeout_sec, 0);
+
 	rc = mgcp_response_parse_head(r, msg);
 	if (rc) {
 		LOGPMGW(mgcp, LOGL_ERROR, "Cannot parse MGCP response (head)\n");
@@ -766,10 +779,14 @@
 		osmo_escape_str((const char *)msg->data, OSMO_MIN(42, msg->len)));
 
 	ret = write(fd->fd, msg->data, msg->len);
-	if (ret != msg->len)
+	if (OSMO_UNLIKELY(ret != msg->len))
 		LOGPMGW(mgcp, LOGL_ERROR, "Failed to Tx MGCP: %s: %d='%s'; msg: len=%u '%s'...\n",
 			osmo_sock_get_name2(fd->fd), errno, strerror(errno),
 			msg->len, osmo_escape_str((const char *)msg->data, OSMO_MIN(42, msg->len)));
+
+	/* Re-arm the keepalive Tx timer: */
+	if (mgcp->actual.keepalive.req_interval_sec > 0)
+		osmo_timer_schedule(&mgcp->keepalive_tx_timer, mgcp->actual.keepalive.req_interval_sec, 0);
 	return ret;
 }
 
@@ -807,6 +824,39 @@
 	mgcp_client_tx(mgcp, msgb_dlcx, &_ignore_mgcp_response, NULL);
 }
 
+/* Format AuditEndpoint message (fire and forget) and send it off to the MGW */
+static void _mgcp_client_send_auep(struct mgcp_client *mgcp, const char *epname)
+{
+	struct msgb *msgb_auep;
+	struct mgcp_msg mgcp_msg_auep = {
+		.verb = MGCP_VERB_AUEP,
+		.presence = MGCP_MSG_PRESENCE_ENDPOINT,
+	};
+	OSMO_STRLCPY_ARRAY(mgcp_msg_auep.endpoint, epname);
+	msgb_auep = mgcp_msg_gen(mgcp, &mgcp_msg_auep);
+	mgcp_client_tx(mgcp, msgb_auep, &_ignore_mgcp_response, NULL);
+}
+
+static void mgcp_client_keepalive_tx_timer_cb(void *data)
+{
+	struct mgcp_client *mgcp = (struct mgcp_client *)data;
+	LOGPMGW(mgcp, LOGL_INFO, "Triggering keepalive MGCP request\n");
+	const char *epname = _mgcp_client_name_append_domain(mgcp, mgcp->actual.keepalive.req_endpoint_name);
+	_mgcp_client_send_auep(mgcp, epname);
+
+	/* Re-arm the timer: */
+	osmo_timer_schedule(&mgcp->keepalive_tx_timer, mgcp->actual.keepalive.req_interval_sec, 0);
+}
+
+static void mgcp_client_keepalive_rx_timer_cb(void *data)
+{
+	struct mgcp_client *mgcp = (struct mgcp_client *)data;
+	LOGPMGW(mgcp, LOGL_ERROR, "MGCP link to MGW now considered DOWN (keepalive timeout, more than %u seconds with no answer from MGW)\n",
+		mgcp->actual.keepalive.timeout_sec);
+	mgcp->conn_up = false;
+	/* TODO: Potentially time out all ongoing transactions for that MGW. Maybe based on VTY cfg? */
+}
+
 struct mgcp_client *mgcp_client_init(void *ctx,
 				     struct mgcp_client_conf *conf)
 {
@@ -851,6 +901,15 @@
 	if (conf->description)
 		mgcp->actual.description = talloc_strdup(mgcp, conf->description);
 
+	osmo_wqueue_init(&mgcp->wq, 1024);
+	mgcp->wq.read_cb = mgcp_do_read;
+	mgcp->wq.write_cb = mgcp_do_write;
+	osmo_fd_setup(&mgcp->wq.bfd, -1, OSMO_FD_READ, osmo_wqueue_bfd_cb, mgcp, 0);
+
+	memcpy(&mgcp->actual.keepalive, &conf->keepalive, sizeof(conf->keepalive));
+	osmo_timer_setup(&mgcp->keepalive_tx_timer, mgcp_client_keepalive_tx_timer_cb, mgcp);
+	osmo_timer_setup(&mgcp->keepalive_rx_timer, mgcp_client_keepalive_rx_timer_cb, mgcp);
+
 	return mgcp;
 }
 
@@ -859,24 +918,17 @@
  *  \returns 0 on success, -EINVAL on error. */
 int mgcp_client_connect(struct mgcp_client *mgcp)
 {
-	struct osmo_wqueue *wq;
 	int rc;
 	struct reset_ep *reset_ep;
 	const char *epname;
+	bool some_dlcx_sent = false;
 
 	if (!mgcp) {
 		LOGPMGW(mgcp, LOGL_FATAL, "Client not initialized properly\n");
 		return -EINVAL;
 	}
 
-	wq = &mgcp->wq;
-	osmo_wqueue_init(wq, 1024);
-	wq->read_cb = mgcp_do_read;
-	wq->write_cb = mgcp_do_write;
-
-	osmo_fd_setup(&wq->bfd, -1, OSMO_FD_READ, osmo_wqueue_bfd_cb, mgcp, 0);
-
-	rc = osmo_sock_init2_ofd(&wq->bfd, AF_UNSPEC, SOCK_DGRAM, IPPROTO_UDP, mgcp->actual.local_addr,
+	rc = osmo_sock_init2_ofd(&mgcp->wq.bfd, AF_UNSPEC, SOCK_DGRAM, IPPROTO_UDP, mgcp->actual.local_addr,
 				 mgcp->actual.local_port, mgcp->actual.remote_addr, mgcp->actual.remote_port,
 				 OSMO_SOCK_F_BIND | OSMO_SOCK_F_CONNECT);
 	if (rc < 0) {
@@ -888,7 +940,7 @@
 		goto error_close_fd;
 	}
 
-	LOGPMGW(mgcp, LOGL_INFO, "MGW connection: %s\n", osmo_sock_get_name2(wq->bfd.fd));
+	LOGPMGW(mgcp, LOGL_INFO, "MGW connection: %s\n", osmo_sock_get_name2(mgcp->wq.bfd.fd));
 
 	/* If configured, send a DLCX message to the endpoints that are configured to
 	 * be reset on startup. Usually this is a wildcarded endpoint. */
@@ -896,11 +948,27 @@
 		epname = _mgcp_client_name_append_domain(mgcp, reset_ep->name);
 		LOGPMGW(mgcp, LOGL_INFO, "Sending DLCX to: %s\n", epname);
 		_mgcp_client_send_dlcx(mgcp, epname);
+		some_dlcx_sent = true;
 	}
+
+	if (!some_dlcx_sent) {
+		if (mgcp->actual.keepalive.req_interval_sec > 0) {
+			/* Attempt an immediate probe to find out if link is UP or DOWN: */
+			osmo_timer_schedule(&mgcp->keepalive_tx_timer, 0, 0);
+		} else {
+			/* Assume link is UP by default, so that this MGW can be selected: */
+			mgcp->conn_up = true;
+		}
+	}
+	/* else: keepalive_tx_timer was already scheduled (if needed) down in the stack during Tx DLCX above */
+
+	if (mgcp->actual.keepalive.timeout_sec > 0)
+		osmo_timer_schedule(&mgcp->keepalive_rx_timer, mgcp->actual.keepalive.timeout_sec, 0);
+
 	return 0;
 error_close_fd:
-	close(wq->bfd.fd);
-	wq->bfd.fd = -1;
+	close(mgcp->wq.bfd.fd);
+	mgcp->wq.bfd.fd = -1;
 	return rc;
 }
 
@@ -924,6 +992,11 @@
 		return;
 	}
 
+	/* Disarm keepalive Tx/Rx timer until next connect() */
+	osmo_timer_del(&mgcp->keepalive_rx_timer);
+	osmo_timer_del(&mgcp->keepalive_tx_timer);
+	mgcp->conn_up = false;
+
 	wq = &mgcp->wq;
 	osmo_wqueue_clear(wq);
 	LOGPMGW(mgcp, LOGL_INFO, "MGCP association: %s -- closed!\n", osmo_sock_get_name2(wq->bfd.fd));
diff --git a/src/libosmo-mgcp-client/mgcp_client_vty.c b/src/libosmo-mgcp-client/mgcp_client_vty.c
index 336733b..a164637 100644
--- a/src/libosmo-mgcp-client/mgcp_client_vty.c
+++ b/src/libosmo-mgcp-client/mgcp_client_vty.c
@@ -28,6 +28,7 @@
 #include <osmocom/vty/command.h>
 #include <osmocom/vty/misc.h>
 #include <osmocom/core/utils.h>
+#include <osmocom/core/timer.h>
 
 #include <osmocom/mgcp_client/mgcp_client.h>
 #include <osmocom/mgcp_client/mgcp_client_internal.h>
@@ -48,7 +49,7 @@
 /* Pointer to the MGCP pool that is managed by mgcp_client_pool_vty_init() */
 static struct mgcp_client_pool *global_mgcp_client_pool = NULL;
 
-struct mgcp_client_conf *get_mgcp_client_config(struct vty *vty)
+static struct mgcp_client_conf *get_mgcp_client_config(struct vty *vty)
 {
 	if (global_mgcp_client_pool && vty->node == global_mgcp_client_pool->vty_node->node)
 		return vty->index;
@@ -61,6 +62,30 @@
 	return global_mgcp_client_conf;
 }
 
+static struct mgcp_client *get_mgcp_client(struct vty *vty)
+{
+	struct mgcp_client_conf *conf = get_mgcp_client_config(vty);
+	struct mgcp_client_pool_member *pool_member;
+
+	if (global_mgcp_client_pool && vty->node == global_mgcp_client_pool->vty_node->node) {
+		llist_for_each_entry(pool_member, &global_mgcp_client_pool->member_list, list) {
+			/* Find matching the conf pointer: */
+			if (&pool_member->conf != conf)
+				continue;
+			return pool_member->client;
+		}
+	}
+
+	/* Global single MGCP config, deprecated: */
+	vty_out(vty, "%% MGCP commands outside of 'mgw' nodes are deprecated. "
+		"You should consider reading the User Manual and migrating to 'mgw' node.%s",
+		VTY_NEWLINE);
+
+	/* There's no way to obtain the struct mgcp_client in old interface, but anyway it's deprecated. */
+	return NULL;
+}
+
+
 DEFUN(cfg_mgw_local_ip, cfg_mgw_local_ip_cmd,
       "local-ip " VTY_IPV46_CMD,
       "local bind to connect to MGW from\n"
@@ -280,6 +305,81 @@
       NO_STR MGW_STR "remove an endpoint name from the reset-endpoint list, e.g. 'rtpbridge/*'\n"
       "Endpoint name, e.g. 'rtpbridge/*' or 'ds/e1-0/s-3/su16-4'.\n")
 
+DEFUN(cfg_mgw_mgw_keepalive_req_interval,
+      cfg_mgw_mgw_keepalive_req_interval_cmd,
+      "keepalive request-interval <0-4294967295>",
+      "Monitor if the MGCP link against MGW is still usable\n"
+      "Send an MGCP command to the MGW at given interval if no other commands are sent\n"
+      "The interval at which send MGCP commands (s), 0 to disable\n")
+{
+	struct mgcp_client_conf *conf = get_mgcp_client_config(vty);
+	struct mgcp_client *mgcp = get_mgcp_client(vty);
+
+	conf->keepalive.req_interval_sec = atoi(argv[0]);
+
+	if (!mgcp)
+		return CMD_SUCCESS;
+
+	/* If client already exists, apply the change immediately if possible: */
+	mgcp->actual.keepalive.req_interval_sec = atoi(argv[0]);
+	if (mgcp->wq.bfd.fd != -1) { /* UDP MGCP socket connected */
+		if (mgcp->actual.keepalive.req_interval_sec > 0) /* Re-schedule: */
+			osmo_timer_schedule(&mgcp->keepalive_tx_timer, mgcp->actual.keepalive.req_interval_sec, 0);
+		else if (osmo_timer_pending(&mgcp->keepalive_tx_timer))
+			osmo_timer_del(&mgcp->keepalive_tx_timer);
+	} /* else: wait until connect() to do first scheduling */
+
+	return CMD_SUCCESS;
+}
+
+DEFUN(cfg_mgw_mgw_keepalive_req_endpoint,
+      cfg_mgw_mgw_keepalive_req_endpoint_cmd,
+      "keepalive request-endpoint NAME",
+      "Monitor if the MGCP link against MGW is still usable\n"
+      "Use a given endpoint name when sending an MGCP command to the MGW for keepalive purposes\n"
+      "The name of the endpoint to use\n")
+{
+	struct mgcp_client_conf *conf = get_mgcp_client_config(vty);
+	struct mgcp_client *mgcp = get_mgcp_client(vty);
+
+	OSMO_STRLCPY_ARRAY(conf->keepalive.req_endpoint_name, argv[0]);
+
+	if (!mgcp)
+		return CMD_SUCCESS;
+
+	/* If client already exists, apply the change immediately if possible: */
+	OSMO_STRLCPY_ARRAY(mgcp->actual.keepalive.req_endpoint_name, argv[0]);
+
+	return CMD_SUCCESS;
+}
+
+DEFUN(cfg_mgw_mgw_keepalive_timeout,
+      cfg_mgw_mgw_keepalive_timeout_cmd,
+      "keepalive timeout <0-4294967295>",
+      "Monitor if the MGCP link against MGW is still usable\n"
+      "Consider the link to the MGW to be down after time without receiving any message from it\n"
+      "The timeout (s), 0 to disable\n")
+{
+	struct mgcp_client_conf *conf = get_mgcp_client_config(vty);
+	struct mgcp_client *mgcp = get_mgcp_client(vty);
+
+	conf->keepalive.timeout_sec = atoi(argv[0]);
+
+	if (!mgcp)
+		return CMD_SUCCESS;
+
+	/* If client already exists, apply the change immediately if possible: */
+	mgcp->actual.keepalive.timeout_sec = atoi(argv[0]);
+	if (mgcp->wq.bfd.fd != -1) { /* UDP MGCP socket connected */
+		if (mgcp->actual.keepalive.timeout_sec > 0) /* Re-schedule: */
+			osmo_timer_schedule(&mgcp->keepalive_rx_timer, mgcp->actual.keepalive.timeout_sec, 0);
+		else if (osmo_timer_pending(&mgcp->keepalive_rx_timer))
+			osmo_timer_del(&mgcp->keepalive_rx_timer);
+	} /* else: wait until connect() to do first scheduling */
+
+	return CMD_SUCCESS;
+}
+
 static int config_write(struct vty *vty, const char *indent, struct mgcp_client_conf *conf)
 {
 	const char *addr;
@@ -317,6 +417,17 @@
 	llist_for_each_entry(reset_ep, &conf->reset_epnames, list)
 		vty_out(vty, "%s%sreset-endpoint %s%s", indent, mgw_prefix, reset_ep->name, VTY_NEWLINE);
 
+	if (conf->keepalive.req_interval_sec != 0)
+		vty_out(vty, "%s%skeepalive request-interval %u%s", indent, mgw_prefix,
+			conf->keepalive.req_interval_sec, VTY_NEWLINE);
+	if (strncmp(conf->keepalive.req_endpoint_name, MGCP_CLIENT_KEEPALIVE_DEFAULT_ENDP,
+		    sizeof(conf->keepalive.req_endpoint_name)) != 0)
+		vty_out(vty, "%s%skeepalive request-endpoint %s%s", indent,  mgw_prefix,
+			conf->keepalive.req_endpoint_name, VTY_NEWLINE);
+	if (conf->keepalive.timeout_sec != 0)
+		vty_out(vty, "%s%skeepalive timeout %u%s", indent,  mgw_prefix,
+			conf->keepalive.timeout_sec, VTY_NEWLINE);
+
 	return CMD_SUCCESS;
 }
 
@@ -347,6 +458,9 @@
 	install_lib_element(node, &cfg_mgw_mgw_endpoint_domain_name_cmd);
 	install_lib_element(node, &cfg_mgw_mgw_reset_ep_name_cmd);
 	install_lib_element(node, &cfg_mgw_mgw_no_reset_ep_name_cmd);
+	install_lib_element(node, &cfg_mgw_mgw_keepalive_req_interval_cmd);
+	install_lib_element(node, &cfg_mgw_mgw_keepalive_req_endpoint_cmd);
+	install_lib_element(node, &cfg_mgw_mgw_keepalive_timeout_cmd);
 
 	osmo_fsm_vty_add_cmds();
 }
@@ -553,8 +667,13 @@
 	}
 
 	llist_for_each_entry(pool_member, &global_mgcp_client_pool->member_list, list) {
+		const struct mgcp_client *cli = pool_member->client;
 		vty_out(vty, "%%  MGW %s%s", mgcp_client_pool_member_name(pool_member), VTY_NEWLINE);
-		vty_out(vty, "%%   mgcp-client:   %s%s", pool_member->client ? "connected" : "disconnected",
+		vty_out(vty, "%%   MGCP link:     %s,%s%s",
+			cli && cli->wq.bfd.fd != -1 ? "connected" : "disconnected",
+			cli && cli->conn_up ?
+				((cli->actual.keepalive.timeout_sec > 0) ? "UP" : "MAYBE") :
+				"DOWN",
 			VTY_NEWLINE);
 		vty_out(vty, "%%   service:       %s%s", pool_member->blocked ? "blocked" : "unblocked", VTY_NEWLINE);
 		vty_out(vty, "%%   ongoing calls: %u%s", pool_member->refcount, VTY_NEWLINE);
diff --git a/tests/mgcp_client/mgcp_client_test.err b/tests/mgcp_client/mgcp_client_test.err
index 0bf6d8f..22ad3cc 100644
--- a/tests/mgcp_client/mgcp_client_test.err
+++ b/tests/mgcp_client/mgcp_client_test.err
@@ -11,6 +11,7 @@
 - cancel succeeds
 DLMGCP MGW(mgw) Canceled transaction 1
 - late response gets discarded
+DLMGCP MGW(mgw) MGCP link to MGW now considered UP
 DLMGCP MGW(mgw) MGCP client: Rx 200 1 OK
 DLMGCP MGW(mgw) Cannot find matching MGCP transaction for trans_id 1
 - canceling again does nothing