From b7ecff22e77887626fd8e8608c4dd73bc7b7366f Mon Sep 17 00:00:00 2001
From: George Joseph <gjoseph@sangoma.com>
Date: Tue, 18 Jan 2022 06:14:31 -0700
Subject: [PATCH] Additional multipart improvements

Added the following APIs:
pjsip_multipart_find_part_by_header()
pjsip_multipart_find_part_by_header_str()
pjsip_multipart_find_part_by_cid_str()
pjsip_multipart_find_part_by_cid_uri()
---
 pjsip/include/pjsip/sip_multipart.h |  83 ++++++++++
 pjsip/src/pjsip/sip_multipart.c     | 223 +++++++++++++++++++++++++++
 pjsip/src/test/multipart_test.c     | 225 +++++++++++++++++++++++++++-
 3 files changed, 530 insertions(+), 1 deletion(-)

diff --git a/pjsip/include/pjsip/sip_multipart.h b/pjsip/include/pjsip/sip_multipart.h
index 1c05767c5..c6b82b0b4 100644
--- a/pjsip/include/pjsip/sip_multipart.h
+++ b/pjsip/include/pjsip/sip_multipart.h
@@ -153,6 +153,89 @@ pjsip_multipart_find_part( const pjsip_msg_body *mp,
 			   const pjsip_media_type *content_type,
 			   const pjsip_multipart_part *start);
 
+/**
+ * Find a body inside multipart bodies which has a header matching the
+ * supplied one. Most useful for finding a part with a specific Content-ID.
+ *
+ * @param pool		Memory pool to use for temp space.
+ * @param mp		The multipart body.
+ * @param search_hdr	Header to search for.
+ * @param start		If specified, the search will begin at
+ * 			start->next part. Otherwise it will begin at
+ * 			the first part in the multipart bodies.
+ *
+ * @return		The first part which has a header matching the
+ * 			specified one, or NULL if not found.
+ */
+PJ_DECL(pjsip_multipart_part*)
+pjsip_multipart_find_part_by_header(pj_pool_t *pool,
+				    const pjsip_msg_body *mp,
+				    void *search_hdr,
+				    const pjsip_multipart_part *start);
+
+/**
+ * Find a body inside multipart bodies which has a header matching the
+ * supplied name and value. Most useful for finding a part with a specific
+ * Content-ID.
+ *
+ * @param pool		Memory pool to use for temp space.
+ * @param mp		The multipart body.
+ * @param hdr_name	Header name to search for.
+ * @param hdr_value	Header value search for.
+ * @param start		If specified, the search will begin at
+ * 			start->next part. Otherwise it will begin at
+ * 			the first part in the multipart bodies.
+ *
+ * @return		The first part which has a header matching the
+ * 			specified one, or NULL if not found.
+ */
+PJ_DECL(pjsip_multipart_part*)
+pjsip_multipart_find_part_by_header_str(pj_pool_t *pool,
+				    const pjsip_msg_body *mp,
+				    const pj_str_t *hdr_name,
+				    const pj_str_t *hdr_value,
+				    const pjsip_multipart_part *start);
+
+
+
+/**
+ * Find a body inside multipart bodies which has a Content-ID value matching the
+ * supplied "cid" URI in pj_str form.  The "cid:" scheme will be assumed if the
+ * URL doesn't start with it.  Enclosing angle brackets will also be handled
+ * correctly if they exist.
+ *
+ * @see RFC2392 Content-ID and Message-ID Uniform Resource Locators
+ *
+ * @param pool	Memory pool to use for temp space.
+ * @param mp	The multipart body.
+ * @param cid	The "cid" URI to search for in pj_str form.
+ *
+ * @return		The first part which has a Content-ID header matching the
+ * 			specified "cid" URI. or NULL if not found.
+ */
+PJ_DECL(pjsip_multipart_part*)
+pjsip_multipart_find_part_by_cid_str(pj_pool_t *pool,
+				 const pjsip_msg_body *mp,
+				 pj_str_t *cid);
+
+/**
+ * Find a body inside multipart bodies which has a Content-ID value matching the
+ * supplied "cid" URI.
+ *
+ * @see RFC2392 Content-ID and Message-ID Uniform Resource Locators
+ *
+ * @param pool	Memory pool to use for temp space.
+ * @param mp	The multipart body.
+ * @param cid	The "cid" URI to search for.
+ *
+ * @return		The first part which had a Content-ID header matching the
+ * 			specified "cid" URI. or NULL if not found.
+ */
+PJ_DECL(pjsip_multipart_part*)
+pjsip_multipart_find_part_by_cid_uri(pj_pool_t *pool,
+				 const pjsip_msg_body *mp,
+				 pjsip_other_uri *cid_uri);
+
 /**
  * Parse multipart message.
  *
diff --git a/pjsip/src/pjsip/sip_multipart.c b/pjsip/src/pjsip/sip_multipart.c
index e7d722d2e..9d8be55b0 100644
--- a/pjsip/src/pjsip/sip_multipart.c
+++ b/pjsip/src/pjsip/sip_multipart.c
@@ -19,6 +19,7 @@
 #include <pjsip/sip_multipart.h>
 #include <pjsip/sip_parser.h>
 #include <pjlib-util/scanner.h>
+#include <pjlib-util/string.h>
 #include <pj/assert.h>
 #include <pj/ctype.h>
 #include <pj/errno.h>
@@ -416,6 +417,220 @@ pjsip_multipart_find_part( const pjsip_msg_body *mp,
     return NULL;
 }
 
+/*
+ * Find a body inside multipart bodies which has the header and value.
+ */
+PJ_DEF(pjsip_multipart_part*)
+pjsip_multipart_find_part_by_header_str(pj_pool_t *pool,
+				    const pjsip_msg_body *mp,
+				    const pj_str_t *hdr_name,
+				    const pj_str_t *hdr_value,
+				    const pjsip_multipart_part *start)
+{
+    struct multipart_data *m_data;
+    pjsip_multipart_part *part;
+    pjsip_hdr *found_hdr;
+    pj_str_t found_hdr_str;
+    pj_str_t found_hdr_value;
+    pj_size_t expected_hdr_slen;
+    pj_size_t buf_size;
+    int hdr_name_len;
+#define REASONABLE_PADDING 32
+#define SEPARATOR_LEN 2
+    /* Must specify mandatory params */
+    PJ_ASSERT_RETURN(mp && hdr_name && hdr_value, NULL);
+
+    /* mp must really point to an actual multipart msg body */
+    PJ_ASSERT_RETURN(mp->print_body==&multipart_print_body, NULL);
+
+    /*
+     * We'll need to "print" each header we find to test it but
+     * allocating a buffer of PJSIP_MAX_URL_SIZE is overkill.
+     * Instead, we'll allocate one large enough to hold the search
+     * header name, the ": " separator, the search hdr value, and
+     * the NULL terminator.  If we can't print the found header
+     * into that buffer then it can't be a match.
+     *
+     * Some header print functions such as generic_int require enough
+     * space to print the maximum possible header length so we'll
+     * add a reasonable amount to the print buffer size.
+     */
+    expected_hdr_slen = hdr_name->slen + SEPARATOR_LEN + hdr_value->slen;
+    buf_size = expected_hdr_slen + REASONABLE_PADDING;
+    found_hdr_str.ptr = pj_pool_alloc(pool, buf_size);
+    found_hdr_str.slen = 0;
+    hdr_name_len = hdr_name->slen + SEPARATOR_LEN;
+
+    m_data = (struct multipart_data*)mp->data;
+
+    if (start)
+	part = start->next;
+    else
+	part = m_data->part_head.next;
+
+    while (part != &m_data->part_head) {
+	found_hdr = NULL;
+	while ((found_hdr = pjsip_hdr_find_by_name(&part->hdr, hdr_name,
+	    (found_hdr ? found_hdr->next : NULL))) != NULL) {
+
+	    found_hdr_str.slen = pjsip_hdr_print_on((void*) found_hdr, found_hdr_str.ptr, buf_size);
+	    /*
+	     * If the buffer was too small (slen = -1) or the result wasn't
+	     * the same length as the search header, it can't be a match.
+	     */
+	    if (found_hdr_str.slen != expected_hdr_slen) {
+		continue;
+	    }
+	    /*
+	     * Set the value overlay to start at the found header value...
+	     */
+	    found_hdr_value.ptr = found_hdr_str.ptr + hdr_name_len;
+	    found_hdr_value.slen = found_hdr_str.slen - hdr_name_len;
+	    /* ...and compare it to the supplied header value. */
+	    if (pj_strcmp(hdr_value, &found_hdr_value) == 0) {
+		return part;
+	    }
+	}
+	part = part->next;
+    }
+    return NULL;
+#undef SEPARATOR_LEN
+#undef REASONABLE_PADDING
+}
+
+PJ_DEF(pjsip_multipart_part*)
+pjsip_multipart_find_part_by_header(pj_pool_t *pool,
+				    const pjsip_msg_body *mp,
+				    void *search_for,
+				    const pjsip_multipart_part *start)
+{
+    struct multipart_data *m_data;
+    pjsip_hdr *search_hdr = search_for;
+    pj_str_t search_buf;
+
+    /* Must specify mandatory params */
+    PJ_ASSERT_RETURN(mp && search_hdr, NULL);
+
+    /* mp must really point to an actual multipart msg body */
+    PJ_ASSERT_RETURN(mp->print_body==&multipart_print_body, NULL);
+
+    /*
+     * Unfortunately, there isn't enough information to determine
+     * the maximum printed size of search_hdr at this point so we
+     * have to allocate a reasonable max.
+     */
+    search_buf.ptr = pj_pool_alloc(pool, PJSIP_MAX_URL_SIZE);
+    search_buf.slen = pjsip_hdr_print_on(search_hdr, search_buf.ptr, PJSIP_MAX_URL_SIZE - 1);
+    if (search_buf.slen <= 0) {
+	return NULL;
+    }
+    /*
+     * Set the header value to start after the header name plus the ":", then
+     * strip leading and trailing whitespace.
+     */
+    search_buf.ptr += (search_hdr->name.slen + 1);
+    search_buf.slen -= (search_hdr->name.slen + 1);
+    pj_strtrim(&search_buf);
+
+    return pjsip_multipart_find_part_by_header_str(pool, mp, &search_hdr->name, &search_buf, start);
+}
+
+/*
+ * Convert a Content-ID URI to it's corresponding header value.
+ * RFC2392 says...
+ * A "cid" URL is converted to the corresponding Content-ID message
+ * header by removing the "cid:" prefix, converting the % encoded
+ * character(s) to their equivalent US-ASCII characters, and enclosing
+ * the remaining parts with an angle bracket pair, "<" and ">".
+ *
+ * This implementation will accept URIs with or without the "cid:"
+ * scheme and optional angle brackets.
+ */
+static pj_str_t cid_uri_to_hdr_value(pj_pool_t *pool, pj_str_t *cid_uri)
+{
+    pj_size_t cid_len = pj_strlen(cid_uri);
+    pj_size_t alloc_len = cid_len + 2 /* for the leading and trailing angle brackets */;
+    pj_str_t uri_overlay;
+    pj_str_t cid_hdr;
+    pj_str_t hdr_overlay;
+
+    pj_strassign(&uri_overlay, cid_uri);
+    /* If the URI is already enclosed in angle brackets, remove them. */
+    if (uri_overlay.ptr[0] == '<') {
+	uri_overlay.ptr++;
+	uri_overlay.slen -= 2;
+    }
+    /* If the URI starts with the "cid:" scheme, skip over it. */
+    if (pj_strncmp2(&uri_overlay, "cid:", 4) == 0) {
+	uri_overlay.ptr += 4;
+	uri_overlay.slen -= 4;
+    }
+    /* Start building */
+    cid_hdr.ptr = pj_pool_alloc(pool, alloc_len);
+    cid_hdr.ptr[0] = '<';
+    cid_hdr.slen = 1;
+    hdr_overlay.ptr = cid_hdr.ptr + 1;
+    hdr_overlay.slen = 0;
+    pj_strcpy_unescape(&hdr_overlay, &uri_overlay);
+    cid_hdr.slen += hdr_overlay.slen;
+    cid_hdr.ptr[cid_hdr.slen] = '>';
+    cid_hdr.slen++;
+
+    return cid_hdr;
+}
+
+PJ_DEF(pjsip_multipart_part*)
+pjsip_multipart_find_part_by_cid_str(pj_pool_t *pool,
+				 const pjsip_msg_body *mp,
+				 pj_str_t *cid)
+{
+    struct multipart_data *m_data;
+    pjsip_multipart_part *part;
+    pjsip_generic_string_hdr *found_hdr;
+    pj_str_t found_hdr_value;
+    static pj_str_t hdr_name = { "Content-ID", 10};
+    pj_str_t hdr_value;
+
+    PJ_ASSERT_RETURN(pool && mp && cid && (pj_strlen(cid) > 0), NULL);
+
+    hdr_value = cid_uri_to_hdr_value(pool, cid);
+    if (pj_strlen(&hdr_value) == 0) {
+	return NULL;
+    }
+
+    m_data = (struct multipart_data*)mp->data;
+    part = m_data->part_head.next;
+
+    while (part != &m_data->part_head) {
+	found_hdr = NULL;
+	while ((found_hdr = pjsip_hdr_find_by_name(&part->hdr, &hdr_name,
+	    (found_hdr ? found_hdr->next : NULL))) != NULL) {
+	    if (pj_strcmp(&hdr_value, &found_hdr->hvalue) == 0) {
+		return part;
+	    }
+	}
+	part = part->next;
+    }
+    return NULL;
+}
+
+PJ_DEF(pjsip_multipart_part*)
+pjsip_multipart_find_part_by_cid_uri(pj_pool_t *pool,
+				 const pjsip_msg_body *mp,
+				 pjsip_other_uri *cid_uri)
+{
+    PJ_ASSERT_RETURN(pool && mp && cid_uri, NULL);
+
+    if (pj_strcmp2(&cid_uri->scheme, "cid") != 0) {
+	return NULL;
+    }
+    /*
+     * We only need to pass the URI content so we
+     * can do that directly.
+     */
+    return pjsip_multipart_find_part_by_cid_str(pool, mp, &cid_uri->content);
+}
+
 /* Parse a multipart part. "pct" is parent content-type  */
 static pjsip_multipart_part *parse_multipart_part(pj_pool_t *pool,
 						  char *start,
@@ -584,6 +799,7 @@ PJ_DEF(pjsip_msg_body*) pjsip_multipart_parse(pj_pool_t *pool,
 		(int)boundary.slen, boundary.ptr));
     }
 
+
     /* Build the delimiter:
      *   delimiter = "--" boundary
      */
@@ -630,6 +846,8 @@ PJ_DEF(pjsip_msg_body*) pjsip_multipart_parse(pj_pool_t *pool,
 	if (*curptr=='\r') ++curptr;
 	if (*curptr!='\n') {
 	    /* Expecting a newline here */
+	    PJ_LOG(2, (THIS_FILE, "Failed to find newline"));
+
 	    return NULL;
 	}
 	++curptr;
@@ -645,6 +863,7 @@ PJ_DEF(pjsip_msg_body*) pjsip_multipart_parse(pj_pool_t *pool,
 	    curptr = pj_strstr(&subbody, &delim);
 	    if (!curptr) {
 		/* We're really expecting end delimiter to be found. */
+		PJ_LOG(2, (THIS_FILE, "Failed to find end-delimiter"));
 		return NULL;
 	    }
 	}
@@ -670,9 +889,13 @@ PJ_DEF(pjsip_msg_body*) pjsip_multipart_parse(pj_pool_t *pool,
 	part = parse_multipart_part(pool, start_body, end_body - start_body,
 				    ctype);
 	if (part) {
+	    TRACE_((THIS_FILE, "Adding part"));
 	    pjsip_multipart_add_part(pool, body, part);
+	} else {
+	    PJ_LOG(2, (THIS_FILE, "Failed to add part"));
 	}
     }
+    TRACE_((THIS_FILE, "pjsip_multipart_parse finished: %p", body));
 
     return body;
 }
diff --git a/pjsip/src/test/multipart_test.c b/pjsip/src/test/multipart_test.c
index 4f16e68bf..97267a290 100644
--- a/pjsip/src/test/multipart_test.c
+++ b/pjsip/src/test/multipart_test.c
@@ -28,6 +28,7 @@
 typedef pj_status_t (*verify_ptr)(pj_pool_t*,pjsip_msg_body*);
 
 static pj_status_t verify1(pj_pool_t *pool, pjsip_msg_body *body);
+static pj_status_t verify2(pj_pool_t *pool, pjsip_msg_body *body);
 
 static struct test_t
 {
@@ -68,7 +69,41 @@ static struct test_t
 		"This is epilogue, which should be ignored too",
 
 		&verify1
+	},
+	{
+		/* Content-type */
+		"multipart", "mixed", "12345",
+
+		/* Body: */
+		"This is the prolog, which should be ignored.\r\n"
+		"--12345\r\n"
+		"Content-Type: text/plain\r\n"
+		"Content-ID: <header1@example.org>\r\n"
+		"Content-ID: <\"header1\"@example.org>\r\n"
+		"Content-Length: 13\r\n"
+		"\r\n"
+		"has header1\r\n"
+		"--12345 \t\r\n"
+		"Content-Type: application/pidf+xml\r\n"
+		"Content-ID: <my header2@example.org>\r\n"
+		"Content-ID: <my\xffheader2@example.org>\r\n"
+		"Content-Length: 13\r\n"
+		"\r\n"
+		"has header2\r\n"
+		"--12345\r\n"
+		"Content-Type: text/plain\r\n"
+		"Content-ID: <my header3@example.org>\r\n"
+		"Content-ID: <header1@example.org>\r\n"
+		"Content-ID: <my header4@example.org>\r\n"
+		"Content-Length: 13\r\n"
+		"\r\n"
+		"has header4\r\n"
+		"--12345--\r\n"
+		"This is epilogue, which should be ignored too",
+
+		&verify2
 	}
+
 };
 
 static void init_media_type(pjsip_media_type *mt,
@@ -87,6 +122,192 @@ static void init_media_type(pjsip_media_type *mt,
     }
 }
 
+static int verify_hdr(pj_pool_t *pool, pjsip_msg_body *multipart_body,
+    void *hdr, char *part_body)
+{
+    pjsip_media_type mt;
+    pjsip_multipart_part *part;
+    pj_str_t the_body;
+
+
+    part = pjsip_multipart_find_part_by_header(pool, multipart_body, hdr, NULL);
+    if (!part) {
+	return -1;
+    }
+
+    the_body.ptr = (char*)part->body->data;
+    the_body.slen = part->body->len;
+
+    if (pj_strcmp2(&the_body, part_body) != 0) {
+	return -2;
+    }
+
+    return 0;
+}
+
+static int verify_cid_str(pj_pool_t *pool, pjsip_msg_body *multipart_body,
+    pj_str_t cid_url, char *part_body)
+{
+    pjsip_media_type mt;
+    pjsip_multipart_part *part;
+    pj_str_t the_body;
+
+    part = pjsip_multipart_find_part_by_cid_str(pool, multipart_body, &cid_url);
+    if (!part) {
+	return -3;
+    }
+
+    the_body.ptr = (char*)part->body->data;
+    the_body.slen = part->body->len;
+
+    if (pj_strcmp2(&the_body, part_body) != 0) {
+	return -4;
+    }
+
+    return 0;
+}
+
+static int verify_cid_uri(pj_pool_t *pool, pjsip_msg_body *multipart_body,
+    pjsip_other_uri *cid_uri, char *part_body)
+{
+    pjsip_media_type mt;
+    pjsip_multipart_part *part;
+    pj_str_t the_body;
+
+    part = pjsip_multipart_find_part_by_cid_uri(pool, multipart_body, cid_uri);
+    if (!part) {
+	return -5;
+    }
+
+    the_body.ptr = (char*)part->body->data;
+    the_body.slen = part->body->len;
+
+    if (pj_strcmp2(&the_body, part_body) != 0) {
+	return -6;
+    }
+
+    return 0;
+}
+
+static pj_status_t verify2(pj_pool_t *pool, pjsip_msg_body *body)
+{
+    int rc = 0;
+    int rcbase = 300;
+    pjsip_other_uri *cid_uri;
+    pjsip_ctype_hdr *ctype_hdr = pjsip_ctype_hdr_create(pool);
+
+    ctype_hdr->media.type = pj_str("application");
+    ctype_hdr->media.subtype = pj_str("pidf+xml");
+
+    rc = verify_hdr(pool, body, ctype_hdr, "has header2");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("cid:header1@example.org"), "has header1");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("%22header1%22@example.org"), "has header1");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    cid_uri = pjsip_uri_get_uri(pjsip_parse_uri(pool, "<cid:%22header1%22@example.org>",
+	strlen("<cid:%22header1%22@example.org>"), 0));
+    rcbase += 10;
+    rc = verify_cid_uri(pool, body, cid_uri, "has header1");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("<cid:my%20header2@example.org>"), "has header2");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("cid:my%ffheader2@example.org"), "has header2");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    cid_uri = pjsip_uri_get_uri(pjsip_parse_uri(pool, "<cid:my%ffheader2@example.org>",
+	strlen("<cid:my%ffheader2@example.org>"), 0));
+    rcbase += 10;
+    rc = verify_cid_uri(pool, body, cid_uri, "has header2");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("cid:my%20header3@example.org"), "has header4");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("<cid:my%20header4@example.org>"), "has header4");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    cid_uri = pjsip_uri_get_uri(pjsip_parse_uri(pool, "<cid:my%20header4@example.org>",
+	strlen("<cid:my%20header4@example.org>"), 0));
+    rcbase += 10;
+    rc = verify_cid_uri(pool, body, cid_uri, "has header4");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("<my%20header3@example.org>"), "has header4");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    /* These should all fail for malformed or missing URI */
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("cid:"), "has header4");
+    if (!rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str(""), "has header4");
+    if (!rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("<>"), "has header4");
+    if (!rc) {
+	return (rc - rcbase);
+    }
+
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("<cid>"), "has header4");
+    if (!rc) {
+	return (rc - rcbase);
+    }
+
+    /*
+     * This is going to pass but the ' ' in the uri is un-encoded which is invalid
+     * so we should never see it.
+     */
+    rcbase += 10;
+    rc = verify_cid_str(pool, body, pj_str("cid:my header3@example.org"), "has header4");
+    if (rc) {
+	return (rc - rcbase);
+    }
+
+    return 0;
+}
+
 static int verify_part(pjsip_multipart_part *part,
 		       char *h_content_type,
 		       char *h_content_subtype,
@@ -236,8 +457,10 @@ static int parse_test(void)
 
 	pj_strdup2_with_null(pool, &str, p_tests[i].msg);
 	body = pjsip_multipart_parse(pool, str.ptr, str.slen, &ctype, 0);
-	if (!body)
+	if (!body) {
+	    pj_pool_release(pool);
 	    return -100;
+	}
 
 	if (p_tests[i].verify) {
 	    rc = p_tests[i].verify(pool, body);
-- 
2.34.1