mirror of
https://github.com/netsurf-browser/netsurf
synced 2025-02-07 01:55:00 +03:00
Contribution from Philip Boulain <prb@ecs.soton.ac.uk>:
This makes url_normalize take care of whitespace in a fairly useful way, consistent with other browsers:
- Leading and trailing whitespace is trimmed.
- Internal whitespace is URL-escaped.
For example, " http://www.google.co.uk/search?q=hello world " becomes "http://www.google.co.uk/search?q=hello%20world".
Explicitly encoded trailing whitespace, e.g. "...hello world%20", is left alone.
The upshot is that if you sloppily copy-paste a URL from IRC or whatnot into the address bar, NetSurf no longer silently ignores you if you caught some adjacent whitespace.
svn path=/trunk/netsurf/; revision=4198
This commit is contained in:
parent
497372c257
commit
3f6d2a9f0c
117
utils/url.c
117
utils/url.c
@ -127,41 +127,69 @@ url_func_result url_normalize(const char *url, char **result)
|
|||||||
{
|
{
|
||||||
char c;
|
char c;
|
||||||
int m;
|
int m;
|
||||||
int i;
|
size_t i;
|
||||||
size_t len;
|
size_t len;
|
||||||
|
size_t bufsize;
|
||||||
|
char* norm;
|
||||||
bool http = false;
|
bool http = false;
|
||||||
regmatch_t match[10];
|
regmatch_t match[10];
|
||||||
|
|
||||||
*result = NULL;
|
*result = NULL;
|
||||||
|
|
||||||
if ((m = regexec(&url_re, url, 10, match, 0))) {
|
/* skip past any leading whitespace (likely if URL was copy-pasted) */
|
||||||
|
while (isspace(*url))
|
||||||
|
url++;
|
||||||
|
|
||||||
|
/* allocate sufficiently large buffer for new URL */
|
||||||
|
len = strlen(url);
|
||||||
|
bufsize = len + sizeof("http://")-1 + sizeof("/")-1 + 1; /* 'http://' + '/' + '\0' */
|
||||||
|
/* work out how much extra to leave for internal whitespace */
|
||||||
|
for(i = 0; i < len; i++) {
|
||||||
|
if(isspace(url[i])) bufsize += 2; /* ' ' -> '%20' */
|
||||||
|
}
|
||||||
|
if ((norm = malloc(bufsize)) == NULL) {
|
||||||
|
LOG(("malloc failed"));
|
||||||
|
return URL_FUNC_NOMEM;
|
||||||
|
}
|
||||||
|
*result = norm;
|
||||||
|
strcpy(norm, url);
|
||||||
|
|
||||||
|
/* truncate trailing whitespace (significant should be uriencoded) */
|
||||||
|
for (i = len - 1; (i > 0) && isspace(norm[i]); i--) {
|
||||||
|
norm[i] = '\0';
|
||||||
|
len--;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* encode any remaining (internal) whitespace */
|
||||||
|
for (i = 0; i < len; i++) {
|
||||||
|
if(isspace(norm[i])) {
|
||||||
|
char space = norm[i];
|
||||||
|
memmove(norm + i + 2, norm + i, 1 + len - i);
|
||||||
|
len += 2;
|
||||||
|
norm[ i] = '%';
|
||||||
|
norm[++i] = digit2lowcase_hex(space >> 4);
|
||||||
|
norm[++i] = digit2lowcase_hex(space & 0xf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* finally verify that it's actually an URL we're working on
|
||||||
|
* (RFC regex too fussy to tolerate above WSP problems) */
|
||||||
|
if ((m = regexec(&url_re, norm, 10, match, 0))) {
|
||||||
LOG(("url '%s' failed to match regex", url));
|
LOG(("url '%s' failed to match regex", url));
|
||||||
return URL_FUNC_FAILED;
|
return URL_FUNC_FAILED;
|
||||||
}
|
}
|
||||||
|
|
||||||
len = strlen(url);
|
|
||||||
|
|
||||||
if (match[URL_RE_SCHEME].rm_so == -1) {
|
if (match[URL_RE_SCHEME].rm_so == -1) {
|
||||||
/* scheme missing: add http:// and reparse */
|
/* scheme missing: add http:// and reparse */
|
||||||
/* LOG(("scheme missing: using http"));*/
|
/* LOG(("scheme missing: using http"));*/
|
||||||
if ((*result = malloc(len + 13)) == NULL) {
|
memmove(norm + sizeof("http://")-1, norm, len + 1);
|
||||||
LOG(("malloc failed"));
|
memcpy(norm, "http://", sizeof("http://")-1); /* do NOT copy null */
|
||||||
return URL_FUNC_NOMEM;
|
len += 7;
|
||||||
}
|
if ((m = regexec(&url_re, norm, 10, match, 0))) {
|
||||||
strcpy(*result, "http://");
|
LOG(("url '%s' failed to match regex", norm));
|
||||||
strcpy(*result + sizeof("http://")-1, url);
|
free(norm);
|
||||||
if ((m = regexec(&url_re, *result, 10, match, 0))) {
|
|
||||||
LOG(("url '%s' failed to match regex", (*result)));
|
|
||||||
free(*result);
|
|
||||||
return URL_FUNC_FAILED;
|
return URL_FUNC_FAILED;
|
||||||
}
|
}
|
||||||
len += sizeof("http://")-1;
|
|
||||||
} else {
|
|
||||||
if ((*result = malloc(len + 6)) == NULL) {
|
|
||||||
LOG(("malloc failed"));
|
|
||||||
return URL_FUNC_NOMEM;
|
|
||||||
}
|
|
||||||
strcpy(*result, url);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*for (unsigned int i = 0; i != 10; i++) {
|
/*for (unsigned int i = 0; i != 10; i++) {
|
||||||
@ -177,22 +205,22 @@ url_func_result url_normalize(const char *url, char **result)
|
|||||||
if (match[URL_RE_SCHEME].rm_so != -1) {
|
if (match[URL_RE_SCHEME].rm_so != -1) {
|
||||||
for (i = match[URL_RE_SCHEME].rm_so;
|
for (i = match[URL_RE_SCHEME].rm_so;
|
||||||
i != match[URL_RE_SCHEME].rm_eo; i++)
|
i != match[URL_RE_SCHEME].rm_eo; i++)
|
||||||
(*result)[i] = tolower((*result)[i]);
|
norm[i] = tolower(norm[i]);
|
||||||
if (match[URL_RE_SCHEME].rm_eo == 4
|
if (match[URL_RE_SCHEME].rm_eo == 4
|
||||||
&& (*result)[0] == 'h'
|
&& norm[0] == 'h'
|
||||||
&& (*result)[1] == 't'
|
&& norm[1] == 't'
|
||||||
&& (*result)[2] == 't'
|
&& norm[2] == 't'
|
||||||
&& (*result)[3] == 'p')
|
&& norm[3] == 'p')
|
||||||
http = true;
|
http = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* make empty path into "/" */
|
/* make empty path into "/" */
|
||||||
if (match[URL_RE_PATH].rm_so != -1 &&
|
if (match[URL_RE_PATH].rm_so != -1 &&
|
||||||
match[URL_RE_PATH].rm_so == match[URL_RE_PATH].rm_eo) {
|
match[URL_RE_PATH].rm_so == match[URL_RE_PATH].rm_eo) {
|
||||||
memmove((*result) + match[URL_RE_PATH].rm_so + 1,
|
memmove(norm + match[URL_RE_PATH].rm_so + 1,
|
||||||
(*result) + match[URL_RE_PATH].rm_so,
|
norm + match[URL_RE_PATH].rm_so,
|
||||||
len - match[URL_RE_PATH].rm_so + 1);
|
len - match[URL_RE_PATH].rm_so + 1);
|
||||||
(*result)[match[URL_RE_PATH].rm_so] = '/';
|
norm[match[URL_RE_PATH].rm_so] = '/';
|
||||||
len++;
|
len++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -200,45 +228,45 @@ url_func_result url_normalize(const char *url, char **result)
|
|||||||
if (match[URL_RE_AUTHORITY].rm_so != -1) {
|
if (match[URL_RE_AUTHORITY].rm_so != -1) {
|
||||||
for (i = match[URL_RE_AUTHORITY].rm_so;
|
for (i = match[URL_RE_AUTHORITY].rm_so;
|
||||||
i != match[URL_RE_AUTHORITY].rm_eo; i++) {
|
i != match[URL_RE_AUTHORITY].rm_eo; i++) {
|
||||||
if ((*result)[i] == ':') {
|
if (norm[i] == ':' && (i + 3) < len) {
|
||||||
if (http && (*result)[i + 1] == '8' &&
|
if (http && norm[i + 1] == '8' &&
|
||||||
(*result)[i + 2] == '0' &&
|
norm[i + 2] == '0' &&
|
||||||
i + 3 ==
|
i + 3 ==
|
||||||
match[URL_RE_AUTHORITY].rm_eo) {
|
match[URL_RE_AUTHORITY].rm_eo) {
|
||||||
memmove((*result) + i,
|
memmove(norm + i,
|
||||||
(*result) + i + 3,
|
norm + i + 3,
|
||||||
len -
|
len -
|
||||||
match[URL_RE_AUTHORITY].
|
match[URL_RE_AUTHORITY].
|
||||||
rm_eo);
|
rm_eo);
|
||||||
len -= 3;
|
len -= 3;
|
||||||
(*result)[len] = '\0';
|
norm[len] = '\0';
|
||||||
} else if (i + 1 == match[4].rm_eo) {
|
} else if (i + 1 == match[4].rm_eo) {
|
||||||
memmove((*result) + i,
|
memmove(norm + i,
|
||||||
(*result) + i + 1,
|
norm + i + 1,
|
||||||
len -
|
len -
|
||||||
match[URL_RE_AUTHORITY].
|
match[URL_RE_AUTHORITY].
|
||||||
rm_eo);
|
rm_eo);
|
||||||
len--;
|
len--;
|
||||||
(*result)[len] = '\0';
|
norm[len] = '\0';
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
(*result)[i] = tolower((*result)[i]);
|
norm[i] = tolower(norm[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* unescape non-"reserved" escaped characters */
|
/* unescape non-"reserved" escaped characters */
|
||||||
for (i = 0; (unsigned)i != len; i++) {
|
for (i = 0; i + 2 < len; i++) {
|
||||||
if ((*result)[i] != '%')
|
if (norm[i] != '%')
|
||||||
continue;
|
continue;
|
||||||
c = tolower((*result)[i + 1]);
|
c = tolower(norm[i + 1]);
|
||||||
if ('0' <= c && c <= '9')
|
if ('0' <= c && c <= '9')
|
||||||
m = 16 * (c - '0');
|
m = 16 * (c - '0');
|
||||||
else if ('a' <= c && c <= 'f')
|
else if ('a' <= c && c <= 'f')
|
||||||
m = 16 * (c - 'a' + 10);
|
m = 16 * (c - 'a' + 10);
|
||||||
else
|
else
|
||||||
continue;
|
continue;
|
||||||
c = tolower((*result)[i + 2]);
|
c = tolower(norm[i + 2]);
|
||||||
if ('0' <= c && c <= '9')
|
if ('0' <= c && c <= '9')
|
||||||
m += c - '0';
|
m += c - '0';
|
||||||
else if ('a' <= c && c <= 'f')
|
else if ('a' <= c && c <= 'f')
|
||||||
@ -252,11 +280,12 @@ url_func_result url_normalize(const char *url, char **result)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
(*result)[i] = m;
|
norm[i] = m;
|
||||||
memmove((*result) + i + 1, (*result) + i + 3, len - i - 2);
|
memmove(norm + i + 1, norm + i + 3, len - i - 2);
|
||||||
len -= 2;
|
len -= 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* norm and *result point to same memory, so just return ok */
|
||||||
return URL_FUNC_OK;
|
return URL_FUNC_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,6 +25,7 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <regex.h>
|
#include <regex.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
#ifndef NOF_ELEMENTS
|
#ifndef NOF_ELEMENTS
|
||||||
#define NOF_ELEMENTS(array) (sizeof(array)/sizeof(*(array)))
|
#define NOF_ELEMENTS(array) (sizeof(array)/sizeof(*(array)))
|
||||||
@ -71,6 +72,17 @@ char *strcasestr(const char *haystack, const char *needle);
|
|||||||
#endif
|
#endif
|
||||||
unsigned int wallclock(void);
|
unsigned int wallclock(void);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a hex digit for the given numerical value.
|
||||||
|
*
|
||||||
|
* \return character in range 0-9a-f
|
||||||
|
*/
|
||||||
|
inline static char digit2lowcase_hex(unsigned char digit) {
|
||||||
|
assert(digit < 16);
|
||||||
|
return "0123456789abcdef"[digit];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Platform specific functions */
|
/* Platform specific functions */
|
||||||
void die(const char * const error);
|
void die(const char * const error);
|
||||||
void warn_user(const char *warning, const char *detail);
|
void warn_user(const char *warning, const char *detail);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user