2020-03-30 22:45:41 +02:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2020 HAW Hamburg
|
|
|
|
*
|
|
|
|
* This file is subject to the terms and conditions of the GNU Lesser
|
|
|
|
* General Public License v2.1. See the file LICENSE in the top level
|
|
|
|
* directory for more details.
|
|
|
|
*/
|
|
|
|
/**
|
|
|
|
* @ingroup sys_uri_parser
|
|
|
|
* @{
|
|
|
|
*
|
|
|
|
* @file
|
|
|
|
* @brief A minimal, non-destructive URI parser.
|
|
|
|
* @see https://tools.ietf.org/html/rfc3986
|
|
|
|
*
|
|
|
|
* @author Cenk Gündoğan <cenk.guendogan@haw-hamburg.de>
|
|
|
|
*
|
|
|
|
* @}
|
|
|
|
*/
|
|
|
|
|
2020-10-21 15:58:33 +02:00
|
|
|
#include <assert.h>
|
2022-05-11 15:00:16 +02:00
|
|
|
#include <stdlib.h>
|
2020-10-21 15:58:33 +02:00
|
|
|
|
2020-03-30 22:45:41 +02:00
|
|
|
#include "uri_parser.h"
|
|
|
|
|
2022-05-11 15:00:16 +02:00
|
|
|
#define MAX_PORT_STR_LEN (5)
|
|
|
|
|
2020-10-22 11:35:22 +02:00
|
|
|
#define ENABLE_DEBUG 0
|
2020-03-30 22:45:41 +02:00
|
|
|
#include "debug.h"
|
|
|
|
|
|
|
|
/* Bounded strchr for buffers that need not be NUL-terminated: scans the
 * half-open range [start, stop) and returns the first position holding c,
 * or NULL when c does not occur (including when the range is empty). */
static const char *_strchrb(const char *start, const char *stop, char c)
{
    const char *cursor = start;

    while (cursor < stop) {
        if (*cursor == c) {
            return cursor;
        }
        cursor++;
    }

    /* ran off the end of the range without a match */
    return NULL;
}
|
|
|
|
|
2021-08-04 14:01:28 +02:00
|
|
|
/* Consume the scheme of an absolute URI.
 *
 * Stores the scheme (everything in front of the first ':') in
 * result->scheme / result->scheme_len and reports through has_authority
 * whether the ':' is directly followed by "//".
 *
 * Returns the position right behind ':' (no authority) or behind "://"
 * (authority present). Returns NULL when the scheme is empty or when the
 * range [uri, uri_end) contains no ':' at all. */
static const char *_consume_scheme(uri_parser_result_t *result, const char *uri,
                                   const char *uri_end, bool *has_authority)
{
    assert(uri);

    /* assume no authority section first */
    *has_authority = false;

    const char *p = _strchrb(uri, uri_end, ':');

    /* p == NULL: no ':' in range, so there is no scheme to consume and
     * any pointer arithmetic on p would be invalid.
     * p == uri:  the URI starts with ':', i.e. the scheme is empty. */
    if ((p == NULL) || (p == uri)) {
        return NULL;
    }

    result->scheme = uri;
    result->scheme_len = p - uri;

    /* check if authority part exists '://'; the length test makes sure
     * p[1] and p[2] are still inside the URI */
    if (((uri_end - p) > 2) && (p[1] == '/') && (p[2] == '/')) {
        *has_authority = true;
        /* skip '://' */
        return p + 3;
    }

    /* skip ':' */
    return p + 1;
}
|
|
|
|
|
2021-08-04 14:01:28 +02:00
|
|
|
/* Split an optional "userinfo@" prefix off the authority.
 *
 * Looks for '@' in [uri, authority_end). When one is found, the text in
 * front of it is recorded as result->userinfo and result->host /
 * result->host_len are moved behind the '@'. Without an '@' the result
 * is left untouched. */
void _consume_userinfo(uri_parser_result_t *result, const char *uri,
                       const char *authority_end)
{
    const char *at_sign = _strchrb(uri, authority_end, '@');

    /* no userinfo within authority */
    if (at_sign == NULL) {
        return;
    }

    result->userinfo = uri;
    result->userinfo_len = at_sign - uri;

    /* number of characters the host has to give up: userinfo plus '@' */
    const unsigned skip = result->userinfo_len + 1;

    if ((result->host + skip) <= authority_end) {
        /* shift host part beyond userinfo and '@' */
        result->host += skip;
        result->host_len -= skip;
    }
    else {
        /* shifting is not possible, nothing is left for the host */
        result->host_len = 0;
    }
}
|
|
|
|
|
2021-08-04 14:01:28 +02:00
|
|
|
/* Extract an optional ":port" suffix from the authority.
 *
 * The search for the last ':' starts at ipv6_end when it is non-NULL
 * (so colons inside an IPv6 literal are ignored), otherwise at
 * result->host, and never goes beyond authority_end.
 *
 * On success result->port, result->port_str and result->port_str_len are
 * filled in and result->host_len is shortened by the port and its ':'.
 *
 * Returns true when there is no port or a valid port was stored.
 * Returns false when the port is empty, longer than MAX_PORT_STR_LEN,
 * contains a non-digit, or exceeds UINT16_MAX.
 *
 * The numeric value is accumulated from the validated digits only, so no
 * byte at or behind authority_end is ever read; the URI buffer therefore
 * does not have to be NUL-terminated. */
bool _consume_port(uri_parser_result_t *result, const char *ipv6_end,
                   const char *authority_end)
{
    const char *port_begin = NULL;

    /* repeat until last ':' in authority section */
    /* if ipv6 address, check after ipv6 end marker */
    const char *p = (ipv6_end ? ipv6_end : result->host);

    while ((p != NULL) && (p < authority_end)) {
        port_begin = p;
        p = _strchrb(p + 1, authority_end, ':');
    }

    /* port_begin only points at ':' if a colon was found; otherwise it is
     * still the search start (or NULL) and there is no port to consume */
    if ((port_begin == NULL) || (port_begin[0] != ':')) {
        return true;
    }

    /* skip ':' */
    port_begin++;

    const size_t port_str_len = authority_end - port_begin;

    /* port should be at least one character long and must not exceed the
     * number of digits of the largest 16 bit port number */
    if ((port_str_len == 0) || (port_str_len > MAX_PORT_STR_LEN)) {
        return false;
    }

    /* validate and convert in one pass; at most MAX_PORT_STR_LEN decimal
     * digits cannot overflow the 32 bit accumulator */
    uint32_t port = 0;

    for (size_t i = 0; i < port_str_len; ++i) {
        const char digit = port_begin[i];

        if ((digit < '0') || (digit > '9')) {
            return false;
        }
        port = (port * 10U) + (uint32_t)(digit - '0');
    }

    /* Verify that the port is smaller or equal to UINT16_MAX. */
    if (port > UINT16_MAX) {
        return false;
    }

    result->port = (uint16_t)port;
    result->port_str = port_begin;
    result->port_str_len = port_str_len;

    /* cut host part before port and ':' */
    result->host_len -= result->port_str_len + 1;

    return true;
}
|
|
|
|
|
2021-08-04 14:01:28 +02:00
|
|
|
/* Consume the authority part ("[userinfo@]host[:port]") of a URI.
 *
 * The authority spans from uri up to the first '/' or, if there is none,
 * up to uri_end. Userinfo, host, an IPv6 literal in brackets with an
 * optional "%zoneid", and the port are stored in result.
 *
 * Returns the end of the authority (pointing at '/' or equal to uri_end).
 * Returns NULL when an IPv6 literal has no closing ']' inside the
 * authority, when a zone id is empty, or when the port is invalid. */
static const char *_consume_authority(uri_parser_result_t *result, const char *uri,
                                      const char *uri_end)
{
    assert(uri);

    /* search until first '/' */
    const char *authority_end = _strchrb(uri, uri_end, '/');

    if (!authority_end) {
        authority_end = uri_end;
    }
    result->host = uri;
    result->host_len = authority_end - uri;

    /* consume userinfo, if available */
    _consume_userinfo(result, uri, authority_end);

    /* host is empty */
    if (result->host_len == 0) {
        return authority_end;
    }

    const char *ipv6_end = NULL;

    /* validate IPv6 form */
    if (result->host[0] == '[') {
        /* the closing ']' has to be part of the authority; a literal
         * without any ']' is rejected as well, otherwise the lengths
         * below would be derived from a NULL pointer */
        ipv6_end = _strchrb(result->host, authority_end, ']');
        if (ipv6_end == NULL) {
            return NULL;
        }

        const char *zoneid_start = _strchrb(result->host, ipv6_end, '%');

        if (zoneid_start) {
            /* skip % */
            result->zoneid = zoneid_start + 1;
            result->zoneid_len = ipv6_end - result->zoneid;

            /* zoneid cannot be empty */
            if (result->zoneid_len == 0) {
                return NULL;
            }
        }

        /* remove '[', ']', and '%' zoneid from ipv6addr */
        result->ipv6addr = result->host + 1;
        result->ipv6addr_len = ipv6_end - result->ipv6addr;
        if (result->zoneid) {
            result->ipv6addr_len -= result->zoneid_len + 1;
        }
    }

    /* consume port, if available */
    if (!_consume_port(result, ipv6_end, authority_end)) {
        return NULL;
    }

    /* this includes the '/' */
    return authority_end;
}
|
|
|
|
|
2021-08-04 14:01:28 +02:00
|
|
|
/* Consume the path and an optional query from [uri, uri_end).
 *
 * Everything up to the first '?' becomes result->path; the text behind
 * the '?' becomes result->query. All lengths are derived from the value
 * stored in result->path_len.
 *
 * Returns the position behind the last consumed character, computed from
 * the pointers and lengths stored in result. */
static const char *_consume_path(uri_parser_result_t *result, const char *uri,
                                 const char *uri_end)
{
    assert(uri);

    /* start out with the whole remainder being path */
    result->path = uri;
    result->path_len = (uri_end - uri);

    /* check for query start '?' */
    const char *question_mark = _strchrb(uri, uri_end, '?');

    if (question_mark != NULL) {
        /* there is a query string, it begins behind the '?' */
        result->query = question_mark + 1;
        /* do not count '?' */
        result->query_len = result->path_len - (question_mark - uri) - 1;
        /* cut path part before query and '?' */
        result->path_len -= result->query_len + 1;

        return (result->query + result->query_len);
    }

    /* no query string found, the path is all there is */
    return (result->path + result->path_len);
}
|
|
|
|
|
2021-08-04 14:01:28 +02:00
|
|
|
/* Parse a relative reference (path plus optional query).
 *
 * Returns 0 when _consume_path() consumed the input exactly up to
 * uri_end, -1 when something is left over. */
static int _parse_relative(uri_parser_result_t *result, const char *uri,
                           const char *uri_end)
{
    const char *consumed_end = _consume_path(result, uri, uri_end);

    /* anything other than uri_end means there is something left
     * to consume ... */
    return (consumed_end == uri_end) ? 0 : -1;
}
|
|
|
|
|
2021-08-04 14:01:28 +02:00
|
|
|
/* Parse an absolute URI: scheme, optional authority, then path/query.
 *
 * Returns 0 on success and -1 when the scheme, the authority or the
 * trailing path/query cannot be parsed. */
static int _parse_absolute(uri_parser_result_t *result, const char *uri,
                           const char *uri_end)
{
    bool has_authority = false;
    const char *pos = _consume_scheme(result, uri, uri_end, &has_authority);

    if (!pos) {
        return -1;
    }

    /* nothing more to consume behind the scheme */
    if (pos >= uri_end) {
        return 0;
    }

    if (has_authority) {
        pos = _consume_authority(result, pos, uri_end);
        if (!pos) {
            return -1;
        }
    }

    /* is there more to parse after authority? If so it is the path,
     * starting with '/' */
    return (pos < uri_end) ? _parse_relative(result, pos, uri_end) : 0;
}
|
|
|
|
|
|
|
|
/* Check whether the first uri_len characters of uri form an absolute URI.
 *
 * A URI counts as absolute when it contains a ':' and everything in
 * front of that ':' is a scheme: a letter first, followed only by
 * letters, digits, '+', '-' or '.'.
 *
 * Returns true for absolute, false for relative. */
bool uri_parser_is_absolute(const char *uri, size_t uri_len)
{
    const char *colon = _strchrb(uri, uri + uri_len, ':');

    /* without ':' it can only be relative */
    if (colon == NULL) {
        return false;
    }

    /* first character should be ALPHA */
    const char first = uri[0];
    const bool first_is_alpha = ((first >= 'A') && (first <= 'Z')) ||
                                ((first >= 'a') && (first <= 'z'));

    if (!first_is_alpha) {
        /* relative */
        return false;
    }

    /* every character of the scheme has to come from the allowed set */
    for (const char *cursor = uri; cursor < colon; ++cursor) {
        const char ch = *cursor;
        const bool is_alpha = ((ch >= 'A') && (ch <= 'Z')) ||
                              ((ch >= 'a') && (ch <= 'z'));
        const bool is_digit = (ch >= '0') && (ch <= '9');
        const bool is_extra = (ch == '+') || (ch == '-') || (ch == '.');

        if (!(is_alpha || is_digit || is_extra)) {
            /* relative */
            return false;
        }
    }

    /* absolute */
    return true;
}
|
|
|
|
|
|
|
|
/* Convenience wrapper around uri_parser_is_absolute() for NUL-terminated
 * strings; the length is taken with strlen().
 *
 * Returns false for a NULL pointer instead of handing it to strlen(),
 * otherwise whatever uri_parser_is_absolute() returns. */
bool uri_parser_is_absolute_string(const char *uri)
{
    if (uri == NULL) {
        return false;
    }

    return uri_parser_is_absolute(uri, strlen(uri));
}
|
|
|
|
|
|
|
|
/* Parse the first uri_len characters of uri into result.
 *
 * result is cleared before parsing. Absolute URIs (as decided by
 * uri_parser_is_absolute()) go through _parse_absolute(), everything
 * else through _parse_relative().
 *
 * Returns 0 on success. Returns -1 when uri is NULL, empty (zero length
 * or starting with a NUL character) or cannot be parsed; after a parse
 * failure result is cleared again so no partial data is left behind. */
int uri_parser_process(uri_parser_result_t *result, const char *uri,
                       size_t uri_len)
{
    /* uri cannot be empty; uri_len is tested first so that a zero-length
     * buffer is never dereferenced */
    if ((NULL == uri) || (uri_len == 0) || (uri[0] == '\0')) {
        return -1;
    }

    memset(result, 0, sizeof(*result));

    const char *uri_end = uri + uri_len;
    int res;

    if (uri_parser_is_absolute(uri, uri_len)) {
        res = _parse_absolute(result, uri, uri_end);
    }
    else {
        res = _parse_relative(result, uri, uri_end);
    }

    if (res != 0) {
        /* do not hand out a half-filled result */
        memset(result, 0, sizeof(*result));
        return -1;
    }

    return 0;
}
|
|
|
|
|
|
|
|
/* Convenience wrapper around uri_parser_process() for NUL-terminated
 * strings; the length is taken with strlen().
 *
 * Returns -1 for a NULL uri (the same value uri_parser_process() uses
 * for a NULL uri) instead of handing it to strlen(), otherwise whatever
 * uri_parser_process() returns. */
int uri_parser_process_string(uri_parser_result_t *result, const char *uri)
{
    if (uri == NULL) {
        return -1;
    }

    return uri_parser_process(result, uri, strlen(uri));
}
|
2021-07-30 14:42:05 +02:00
|
|
|
|
|
|
|
int uri_parser_split_query(const uri_parser_result_t *uri,
|
|
|
|
uri_parser_query_param_t *params,
|
|
|
|
size_t params_len)
|
|
|
|
{
|
|
|
|
const char *query_end;
|
|
|
|
unsigned idx = 0;
|
|
|
|
|
|
|
|
assert(uri);
|
|
|
|
assert(params);
|
|
|
|
|
|
|
|
if ((uri->query == NULL) || (uri->query_len == 0) || (params_len == 0)) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
assert(params[0].name == 0);
|
|
|
|
assert(params[0].name_len == 0);
|
|
|
|
assert(params[0].value == 0);
|
|
|
|
assert(params[0].value_len == 0);
|
|
|
|
query_end = uri->query + uri->query_len;
|
|
|
|
params[0].name = uri->query;
|
|
|
|
for (const char *c = uri->query; c < query_end; c++) {
|
|
|
|
switch (*c) {
|
|
|
|
case '#':
|
|
|
|
case '&':
|
|
|
|
if (params[idx].value == NULL) {
|
|
|
|
/* we should have picked up a parameter value by now */
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
params[idx].value_len = c - params[idx].value;
|
|
|
|
if (*c == '#') {
|
|
|
|
/* we've reached the end of the query string, next comes an
|
|
|
|
* anchor, enforce end of loop
|
|
|
|
* XXX: can be removed when uri_parser has anchor support */
|
|
|
|
c = query_end;
|
|
|
|
}
|
|
|
|
else if ((idx + 1) < params_len) {
|
|
|
|
/* c is an ampersand (&), so mark the next char as the next
|
|
|
|
* parameter's name name */
|
2021-08-04 14:01:28 +02:00
|
|
|
params[++idx].name = c + 1U;
|
2021-07-30 14:42:05 +02:00
|
|
|
assert(params[idx].name_len == 0);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* c is an ampersand (&), but we exceeded param_len.
|
|
|
|
* Return -2 as per doc */
|
|
|
|
return -2;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case '=':
|
|
|
|
/* params[idx].value != NULL picks up duplicate = in query
|
|
|
|
* parameter */
|
|
|
|
if ((params[idx].name == NULL) || params[idx].value != NULL) {
|
|
|
|
/* we should have picked up a parameter name by now */
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
params[idx].name_len = c - params[idx].name;
|
|
|
|
/* pick next char as start of value */
|
2021-08-04 14:01:28 +02:00
|
|
|
params[idx].value = c + 1U;
|
2021-07-30 14:42:05 +02:00
|
|
|
/* make sure the precondition on params is met */
|
|
|
|
assert(params[idx].value_len == 0);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if ((uri->query != query_end) && (params[idx].value == NULL)) {
|
|
|
|
/* we should have picked up a parameter value by now */
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
/* set final value_len */
|
|
|
|
params[idx].value_len = query_end - params[idx].value;
|
|
|
|
return idx + 1;
|
|
|
|
}
|