diff options
author | Michael Brown | 2014-02-27 14:32:53 +0100 |
---|---|---|
committer | Michael Brown | 2014-02-27 14:32:53 +0100 |
commit | 76675365271291beb9ddaeec10da14f4faa55ecc (patch) | |
tree | 0143200258d478e381b9d492bead2bdda91fe865 /src/core/uri.c | |
parent | [params] Use reference counters for form parameter lists (diff) | |
download | ipxe-76675365271291beb9ddaeec10da14f4faa55ecc.tar.gz ipxe-76675365271291beb9ddaeec10da14f4faa55ecc.tar.xz ipxe-76675365271291beb9ddaeec10da14f4faa55ecc.zip |
[uri] Refactor URI parsing and formatting
Add support for parsing of URIs containing literal IPv6 addresses
(e.g. "http://[fe80::69ff:fe50:5845%25net0]/boot.ipxe").
Duplicate URIs by directly copying the relevant fields, rather than by
formatting and reparsing a URI string. This relaxes the requirements
on the URI formatting code and allows it to focus on generating
human-readable URIs (e.g. by not escaping ':' characters within
literal IPv6 addresses). As a side-effect, this allows relative URIs
containing parameter lists (e.g. "../boot.php##params") to function
as expected.
Add validity check for FTP paths to ensure that only printable
characters are accepted (since FTP is a human-readable line-based
protocol with no support for character escaping).
Construct TFTP next-server+filename URIs directly, rather than parsing
a constructed "tftp://..." string,
Add self-tests for URI functions.
Signed-off-by: Michael Brown <mcb30@ipxe.org>
Diffstat (limited to 'src/core/uri.c')
-rw-r--r-- | src/core/uri.c | 510 |
1 files changed, 330 insertions, 180 deletions
diff --git a/src/core/uri.c b/src/core/uri.c index 499fb975..9ec21cee 100644 --- a/src/core/uri.c +++ b/src/core/uri.c @@ -35,33 +35,183 @@ FILE_LICENCE ( GPL2_OR_LATER ); #include <ipxe/uri.h> /** + * Decode URI field (in place) + * + * @v string String + * + * URI decoding can never increase the length of a string; we can + * therefore safely decode in place. + */ +static void uri_decode ( char *string ) { + char *dest = string; + char hexbuf[3]; + char *hexbuf_end; + char c; + char decoded; + unsigned int skip; + + /* Copy string, decoding escaped characters as necessary */ + do { + c = *(string++); + if ( c == '%' ) { + snprintf ( hexbuf, sizeof ( hexbuf ), "%s", string ); + decoded = strtoul ( hexbuf, &hexbuf_end, 16 ); + skip = ( hexbuf_end - hexbuf ); + string += skip; + if ( skip ) + c = decoded; + } + *(dest++) = c; + } while ( c ); +} + +/** + * Check if character should be escaped within a URI field + * + * @v c Character + * @v field URI field index + * @ret escaped Character should be escaped + */ +static int uri_character_escaped ( char c, unsigned int field ) { + + /* Non-printing characters and whitespace should always be + * escaped, since they cannot sensibly be displayed as part of + * a coherent URL string. (This test also catches control + * characters such as CR and LF, which could affect the + * operation of line-based protocols such as HTTP.) + * + * We should also escape characters which would alter the + * interpretation of the URL if not escaped, i.e. characters + * which have significance to the URL parser. We should not + * blindly escape all such characters, because this would lead + * to some very strange-looking URLs (e.g. if we were to + * always escape '/' as "%2F" even within the URI path). + * + * We do not need to be perfect. Our primary role is as a + * consumer of URIs rather than a producer; the main situation + * in which we produce a URI string is for display to a human + * user, who can probably tolerate some variance from the + * formal specification. The only situation in which we + * currently produce a URI string to be consumed by a computer + * is when constructing an HTTP request URI, which contains + * only the path and query fields. + * + * We can therefore sacrifice some correctness for the sake of + * code size. For example, colons within the URI host should + * be escaped unless they form part of an IPv6 literal + * address; doing this correctly would require the URI + * formatter to be aware of whether or not the URI host + * contained an IPv4 address, an IPv6 address, or a host name. + * We choose to simplify and never escape colons within the + * URI host field: in the event of a pathological hostname + * containing colons, this could potentially produce a URI + * string which could not be reparsed. + * + * After excluding non-printing characters, whitespace, and + * '%', the full set of characters with significance to the + * URL parser is "/#:@?". We choose for each URI field which + * of these require escaping in our use cases. + */ + static const char *escaped[URI_FIELDS] = { + /* Scheme: escape everything */ + [URI_SCHEME] = "/#:@?", + /* Opaque part: escape characters which would affect + * the reparsing of the URI, allowing everything else + * (e.g. ':', which will appear in iSCSI URIs). + */ + [URI_OPAQUE] = "/#", + /* User name: escape everything */ + [URI_USER] = "/#:@?", + /* Password: escape everything */ + [URI_PASSWORD] = "/#:@?", + /* Host name: escape everything except ':', which may + * appear as part of an IPv6 literal address. + */ + [URI_HOST] = "/#@?", + /* Port number: escape everything */ + [URI_PORT] = "/#:@?", + /* Path: escape everything except '/', which usually + * appears within paths. + */ + [URI_PATH] = "#:@?", + /* Query: escape everything except '/', which + * sometimes appears within queries. + */ + [URI_QUERY] = "#:@?", + /* Fragment: escape everything */ + [URI_FRAGMENT] = "/#:@?", + }; + + return ( /* Always escape non-printing characters and whitespace */ + ( ! isprint ( c ) ) || ( c == ' ' ) || + /* Always escape '%' */ + ( c == '%' ) || + /* Escape field-specific characters */ + strchr ( escaped[field], c ) ); +} + +/** + * Encode URI field + * + * @v uri URI + * @v field URI field index + * @v buf Buffer to contain encoded string + * @v len Length of buffer + * @ret len Length of encoded string (excluding NUL) + */ +size_t uri_encode ( const char *string, unsigned int field, + char *buf, ssize_t len ) { + ssize_t remaining = len; + size_t used; + char c; + + /* Ensure encoded string is NUL-terminated even if empty */ + if ( len > 0 ) + buf[0] = '\0'; + + /* Copy string, escaping as necessary */ + while ( ( c = *(string++) ) ) { + if ( uri_character_escaped ( c, field ) ) { + used = ssnprintf ( buf, remaining, "%%%02X", c ); + } else { + used = ssnprintf ( buf, remaining, "%c", c ); + } + buf += used; + remaining -= used; + } + + return ( len - remaining ); +} + +/** * Dump URI for debugging * * @v uri URI */ -static void dump_uri ( struct uri *uri ) { +static void uri_dump ( const struct uri *uri ) { + if ( ! uri ) return; if ( uri->scheme ) - DBG ( " scheme \"%s\"", uri->scheme ); + DBGC ( uri, " scheme \"%s\"", uri->scheme ); if ( uri->opaque ) - DBG ( " opaque \"%s\"", uri->opaque ); + DBGC ( uri, " opaque \"%s\"", uri->opaque ); if ( uri->user ) - DBG ( " user \"%s\"", uri->user ); + DBGC ( uri, " user \"%s\"", uri->user ); if ( uri->password ) - DBG ( " password \"%s\"", uri->password ); + DBGC ( uri, " password \"%s\"", uri->password ); if ( uri->host ) - DBG ( " host \"%s\"", uri->host ); + DBGC ( uri, " host \"%s\"", uri->host ); if ( uri->port ) - DBG ( " port \"%s\"", uri->port ); + DBGC ( uri, " port \"%s\"", uri->port ); if ( uri->path ) - DBG ( " path \"%s\"", uri->path ); + DBGC ( uri, " path \"%s\"", uri->path ); if ( uri->query ) - DBG ( " query \"%s\"", uri->query ); + DBGC ( uri, " query \"%s\"", uri->query ); if ( uri->fragment ) - DBG ( " fragment \"%s\"", uri->fragment ); + DBGC ( uri, " fragment \"%s\"", uri->fragment ); if ( uri->params ) - DBG ( " params \"%s\"", uri->params->name ); + DBGC ( uri, " params \"%s\"", uri->params->name ); } /** @@ -69,7 +219,7 @@ static void dump_uri ( struct uri *uri ) { * * @v refcnt Reference count */ -static void free_uri ( struct refcnt *refcnt ) { +static void uri_free ( struct refcnt *refcnt ) { struct uri *uri = container_of ( refcnt, struct uri, refcnt ); params_put ( uri->params ); @@ -93,16 +243,16 @@ struct uri * parse_uri ( const char *uri_string ) { char *tmp; char *path; char *authority; - int i; size_t raw_len; + unsigned int field; /* Allocate space for URI struct and a copy of the string */ raw_len = ( strlen ( uri_string ) + 1 /* NUL */ ); uri = zalloc ( sizeof ( *uri ) + raw_len ); if ( ! uri ) return NULL; - ref_init ( &uri->refcnt, free_uri ); - raw = ( ( ( char * ) uri ) + sizeof ( *uri ) ); + ref_init ( &uri->refcnt, uri_free ); + raw = ( ( ( void * ) uri ) + sizeof ( *uri ) ); /* Copy in the raw string */ memcpy ( raw, uri_string, raw_len ); @@ -125,12 +275,8 @@ struct uri * parse_uri ( const char *uri_string ) { uri->fragment = tmp; } - /* Identify absolute/relative URI. We ignore schemes that are - * apparently only a single character long, since otherwise we - * misinterpret a DOS-style path name ("C:\path\to\file") as a - * URI with scheme="C",opaque="\path\to\file". - */ - if ( ( tmp = strchr ( raw, ':' ) ) && ( tmp > ( raw + 1 ) ) ) { + /* Identify absolute/relative URI */ + if ( ( tmp = strchr ( raw, ':' ) ) ) { /* Absolute URI: identify hierarchical/opaque */ uri->scheme = raw; *(tmp++) = '\0'; @@ -159,6 +305,12 @@ struct uri * parse_uri ( const char *uri_string ) { uri->query = tmp; } + /* If we have no path remaining, then we're already finished + * processing. + */ + if ( ! path[0] ) + goto done; + /* Identify net/absolute/relative path */ if ( strncmp ( path, "//", 2 ) == 0 ) { /* Net path. If this is terminated by the first '/' @@ -205,23 +357,22 @@ struct uri * parse_uri ( const char *uri_string ) { } /* Split host into host[:port] */ - if ( ( tmp = strchr ( uri->host, ':' ) ) ) { + if ( ( uri->host[ strlen ( uri->host ) - 1 ] != ']' ) && + ( tmp = strrchr ( uri->host, ':' ) ) ) { *(tmp++) = '\0'; uri->port = tmp; } - /* Decode fields that should be decoded */ - for ( i = URI_FIRST_FIELD; i <= URI_LAST_FIELD; i++ ) { - const char *field = uri_get_field ( uri, i ); - if ( field && ( URI_ENCODED & ( 1 << i ) ) ) - uri_decode ( field, ( char * ) field, - strlen ( field ) + 1 /* NUL */ ); + /* Decode fields in-place */ + for ( field = 0 ; field < URI_FIELDS ; field++ ) { + if ( uri_field ( uri, field ) ) + uri_decode ( ( char * ) uri_field ( uri, field ) ); } done: - DBG ( "URI \"%s\" split into", uri_string ); - dump_uri ( uri ); - DBG ( "\n" ); + DBGC ( uri, "URI parsed \"%s\" to", uri_string ); + uri_dump ( uri ); + DBGC ( uri, "\n" ); return uri; } @@ -233,83 +384,138 @@ struct uri * parse_uri ( const char *uri_string ) { * @v default_port Default port to use if none specified in URI * @ret port Port */ -unsigned int uri_port ( struct uri *uri, unsigned int default_port ) { +unsigned int uri_port ( const struct uri *uri, unsigned int default_port ) { + if ( ( ! uri ) || ( ! uri->port ) ) return default_port; + return ( strtoul ( uri->port, NULL, 0 ) ); } /** - * Unparse URI + * Format URI * + * @v uri URI * @v buf Buffer to fill with URI string * @v size Size of buffer - * @v uri URI to write into buffer, or NULL - * @v fields Bitmask of fields to include in URI string, or URI_ALL * @ret len Length of URI string */ -int unparse_uri ( char *buf, size_t size, struct uri *uri, - unsigned int fields ) { - /* List of characters that typically go before certain fields */ - static char separators[] = { /* scheme */ 0, /* opaque */ ':', - /* user */ 0, /* password */ ':', - /* host */ '@', /* port */ ':', - /* path */ 0, /* query */ '?', - /* fragment */ '#' }; - int used = 0; - int i; - - DBG ( "URI unparsing" ); - dump_uri ( uri ); - DBG ( "\n" ); +size_t format_uri ( const struct uri *uri, char *buf, size_t len ) { + static const char prefixes[URI_FIELDS] = { + [URI_OPAQUE] = ':', + [URI_PASSWORD] = ':', + [URI_PORT] = ':', + [URI_PATH] = '/', + [URI_QUERY] = '?', + [URI_FRAGMENT] = '#', + }; + char prefix; + size_t used = 0; + unsigned int field; /* Ensure buffer is NUL-terminated */ - if ( size ) + if ( len ) buf[0] = '\0'; /* Special-case NULL URI */ if ( ! uri ) return 0; - /* Iterate through requested fields */ - for ( i = URI_FIRST_FIELD; i <= URI_LAST_FIELD; i++ ) { - const char *field = uri_get_field ( uri, i ); - char sep = separators[i]; - - /* Ensure `fields' only contains bits for fields that exist */ - if ( ! field ) - fields &= ~( 1 << i ); - - /* Store this field if we were asked to */ - if ( fields & ( 1 << i ) ) { - /* Print :// if we're non-opaque and had a scheme */ - if ( ( fields & URI_SCHEME_BIT ) && - ( i > URI_OPAQUE ) ) { - used += ssnprintf ( buf + used, size - used, - "://" ); - /* Only print :// once */ - fields &= ~URI_SCHEME_BIT; - } + /* Generate fields */ + for ( field = 0 ; field < URI_FIELDS ; field++ ) { + + /* Skip non-existent fields */ + if ( ! uri_field ( uri, field ) ) + continue; + + /* Prefix this field, if applicable */ + prefix = prefixes[field]; + if ( ( field == URI_HOST ) && ( uri->user != NULL ) ) + prefix = '@'; + if ( ( field == URI_PATH ) && ( uri->path[0] == '/' ) ) + prefix = '\0'; + if ( prefix ) { + used += ssnprintf ( ( buf + used ), ( len - used ), + "%c", prefix ); + } + + /* Encode this field */ + used += uri_encode ( uri_field ( uri, field ), field, + ( buf + used ), ( len - used ) ); - /* Only print separator if an earlier field exists */ - if ( sep && ( fields & ( ( 1 << i ) - 1 ) ) ) - used += ssnprintf ( buf + used, size - used, - "%c", sep ); - - /* Print contents of field, possibly encoded */ - if ( URI_ENCODED & ( 1 << i ) ) - used += uri_encode ( field, buf + used, - size - used, i ); - else - used += ssnprintf ( buf + used, size - used, - "%s", field ); + /* Suffix this field, if applicable */ + if ( ( field == URI_SCHEME ) && ( ! uri->opaque ) ) { + used += ssnprintf ( ( buf + used ), ( len - used ), + "://" ); } } + if ( len ) { + DBGC ( uri, "URI formatted" ); + uri_dump ( uri ); + DBGC ( uri, " to \"%s%s\"\n", buf, + ( ( used > len ) ? "<TRUNCATED>" : "" ) ); + } + return used; } /** + * Format URI + * + * @v uri URI + * @ret string URI string, or NULL on failure + * + * The caller is responsible for eventually freeing the allocated + * memory. + */ +char * format_uri_alloc ( const struct uri *uri ) { + size_t len; + char *string; + + len = ( format_uri ( uri, NULL, 0 ) + 1 /* NUL */ ); + string = malloc ( len ); + if ( string ) + format_uri ( uri, string, len ); + return string; +} + +/** + * Copy URI fields + * + * @v src Source URI + * @v dest Destination URI, or NULL to calculate length + * @ret len Length of raw URI + */ +static size_t uri_copy_fields ( const struct uri *src, struct uri *dest ) { + size_t len = sizeof ( *dest ); + char *out = ( ( void * ) dest + len ); + unsigned int field; + size_t field_len; + + /* Copy existent fields */ + for ( field = 0 ; field < URI_FIELDS ; field++ ) { + + /* Skip non-existent fields */ + if ( ! uri_field ( src, field ) ) + continue; + + /* Calculate field length */ + field_len = ( strlen ( uri_field ( src, field ) ) + + 1 /* NUL */ ); + len += field_len; + + /* Copy field, if applicable */ + if ( dest ) { + memcpy ( out, uri_field ( src, field ), field_len ); + uri_field ( dest, field ) = out; + out += field_len; + } + } + return len; +} + +/** * Duplicate URI * * @v uri URI @@ -317,12 +523,28 @@ int unparse_uri ( char *buf, size_t size, struct uri *uri, * * Creates a modifiable copy of a URI. */ -struct uri * uri_dup ( struct uri *uri ) { - size_t len = ( unparse_uri ( NULL, 0, uri, URI_ALL ) + 1 ); - char buf[len]; +struct uri * uri_dup ( const struct uri *uri ) { + struct uri *dup; + size_t len; + + /* Allocate new URI */ + len = uri_copy_fields ( uri, NULL ); + dup = zalloc ( len ); + if ( ! dup ) + return NULL; + ref_init ( &dup->refcnt, uri_free ); + + /* Copy fields */ + uri_copy_fields ( uri, dup ); + + /* Copy parameters */ + dup->params = params_get ( uri->params ); - unparse_uri ( buf, len, uri, URI_ALL ); - return parse_uri ( buf ); + DBGC ( uri, "URI duplicated" ); + uri_dump ( uri ); + DBGC ( uri, "\n" ); + + return dup; } /** @@ -398,7 +620,7 @@ char * resolve_path ( const char *base_path, * relative URI (e.g. "../initrds/initrd.gz") and produces a new URI * (e.g. "http://ipxe.org/initrds/initrd.gz"). */ -struct uri * resolve_uri ( struct uri *base_uri, +struct uri * resolve_uri ( const struct uri *base_uri, struct uri *relative_uri ) { struct uri tmp_uri; char *tmp_path = NULL; @@ -417,11 +639,16 @@ struct uri * resolve_uri ( struct uri *base_uri, tmp_uri.path = tmp_path; tmp_uri.query = relative_uri->query; tmp_uri.fragment = relative_uri->fragment; + tmp_uri.params = relative_uri->params; } else if ( relative_uri->query ) { tmp_uri.query = relative_uri->query; tmp_uri.fragment = relative_uri->fragment; + tmp_uri.params = relative_uri->params; } else if ( relative_uri->fragment ) { tmp_uri.fragment = relative_uri->fragment; + tmp_uri.params = relative_uri->params; + } else if ( relative_uri->params ) { + tmp_uri.params = relative_uri->params; } /* Create demangled URI */ @@ -431,100 +658,23 @@ struct uri * resolve_uri ( struct uri *base_uri, } /** - * Test for unreserved URI characters - * - * @v c Character to test - * @v field Field of URI in which character lies - * @ret is_unreserved Character is an unreserved character - */ -static int is_unreserved_uri_char ( int c, int field ) { - /* According to RFC3986, the unreserved character set is - * - * A-Z a-z 0-9 - _ . ~ - * - * but we also pass & ; = in queries, / in paths, - * and everything in opaques - */ - int ok = ( isupper ( c ) || islower ( c ) || isdigit ( c ) || - ( c == '-' ) || ( c == '_' ) || - ( c == '.' ) || ( c == '~' ) ); - - if ( field == URI_QUERY ) - ok = ok || ( c == ';' ) || ( c == '&' ) || ( c == '=' ); - - if ( field == URI_PATH ) - ok = ok || ( c == '/' ); - - if ( field == URI_OPAQUE ) - ok = 1; - - return ok; -} - -/** - * URI-encode string - * - * @v raw_string String to be URI-encoded - * @v buf Buffer to contain encoded string - * @v len Length of buffer - * @v field Field of URI in which string lies - * @ret len Length of encoded string (excluding NUL) - */ -size_t uri_encode ( const char *raw_string, char *buf, ssize_t len, - int field ) { - ssize_t remaining = len; - size_t used; - unsigned char c; - - if ( len > 0 ) - buf[0] = '\0'; - - while ( ( c = *(raw_string++) ) ) { - if ( is_unreserved_uri_char ( c, field ) ) { - used = ssnprintf ( buf, remaining, "%c", c ); - } else { - used = ssnprintf ( buf, remaining, "%%%02X", c ); - } - buf += used; - remaining -= used; - } - - return ( len - remaining ); -} - -/** - * Decode URI-encoded string + * Construct TFTP URI from next-server and filename * - * @v encoded_string URI-encoded string - * @v buf Buffer to contain decoded string - * @v len Length of buffer - * @ret len Length of decoded string (excluding NUL) + * @v next_server Next-server address + * @v filename Filename + * @ret uri URI, or NULL on failure * - * This function may be used in-place, with @a buf the same as - * @a encoded_string. + * TFTP filenames specified via the DHCP next-server field often + * contain characters such as ':' or '#' which would confuse the + * generic URI parser. We provide a mechanism for directly + * constructing a TFTP URI from the next-server and filename. */ -size_t uri_decode ( const char *encoded_string, char *buf, ssize_t len ) { - ssize_t remaining; - char hexbuf[3]; - char *hexbuf_end; - unsigned char c; - - for ( remaining = len; *encoded_string; remaining-- ) { - if ( *encoded_string == '%' ) { - encoded_string++; - snprintf ( hexbuf, sizeof ( hexbuf ), "%s", - encoded_string ); - c = strtoul ( hexbuf, &hexbuf_end, 16 ); - encoded_string += ( hexbuf_end - hexbuf ); - } else { - c = *(encoded_string++); - } - if ( remaining > 1 ) - *buf++ = c; - } - - if ( len ) - *buf = 0; - - return ( len - remaining ); +struct uri * tftp_uri ( struct in_addr next_server, const char *filename ) { + struct uri uri; + + memset ( &uri, 0, sizeof ( uri ) ); + uri.scheme = "tftp"; + uri.host = inet_ntoa ( next_server ); + uri.path = filename; + return uri_dup ( &uri ); } |