netsurf/utils/url.c

/*
 * Copyright 2006 Richard Wilson <info@tinct.net>
 * Copyright 2005 James Bursa <bursa@users.sourceforge.net>
 * Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk>
 *
 * This file is part of NetSurf, http://www.netsurf-browser.org/
 *
 * NetSurf is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * NetSurf is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/** \file
 * \brief Implementation of URL parsing and joining operations.
 */

#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <curl/curl.h>

#include "utils/config.h"
#include "utils/log.h"
#include "utils/url.h"


/* exported interface documented in utils/url.h */
/*
 * Decode the percent-escapes in str and hand back a freshly allocated copy.
 *
 * The decoding itself is delegated to curl_unescape(); its buffer is then
 * duplicated with strdup() and released with curl_free(), so what is stored
 * in *result is a strdup() allocation rather than a curl-owned one.
 *
 * str     Input string, passed straight through to curl_unescape() with a
 *         length argument of 0 (per the libcurl documentation this asks
 *         curl to measure the string itself).
 * result  Written only on success; left untouched on failure.
 *
 * Returns NSERROR_OK on success, or NSERROR_NOMEM if either curl or
 * strdup() fails to produce a buffer.
 */
nserror url_unescape(const char *str, char **result)
{
	nserror status = NSERROR_NOMEM;
	char *decoded;
	char *copy = NULL;

	decoded = curl_unescape(str, 0);
	if (decoded != NULL) {
		/* the curl buffer is always released once it has been copied,
		 * whether or not the copy succeeded */
		copy = strdup(decoded);
		curl_free(decoded);
	}

	if (copy != NULL) {
		*result = copy;
		status = NSERROR_OK;
	}

	return status;
}


/* exported interface documented in utils/url.h */
/*
 * Percent-encode a string, optionally leaving a leading prefix untouched.
 *
 * The first toskip bytes of unescaped are copied to the output verbatim.
 * Every following byte is emitted as "%XX" (upper-case hex) when it is:
 *   - a control character or space (<= 0x20), DEL (0x7f) or non-ASCII
 *     (>= 0x80); these are always escaped, escexceptions cannot exempt
 *     them, or
 *   - one of the gen-delims / sub-delims / "other" characters listed in
 *     the reserved table below, unless it also appears in escexceptions.
 * All remaining bytes are copied through unchanged.
 *
 * unescaped      NUL-terminated input string.
 * toskip         Number of leading bytes to copy without escaping.
 * sptoplus       When true, a space is written as '+' instead of "%20".
 * escexceptions  Optional (may be NULL) NUL-terminated set of reserved
 *                characters that must not be escaped.
 * result         Receives a malloc()-family allocation holding the
 *                NUL-terminated output; set to NULL on any failure.
 *
 * Returns NSERROR_OK on success, NSERROR_NOT_FOUND if unescaped or result
 * is NULL or toskip exceeds the input length, and NSERROR_NOMEM if the
 * output buffer cannot be sized or allocated.
 */
nserror url_escape(const char *unescaped, size_t toskip,
		bool sptoplus, const char *escexceptions, char **result)
{
	static const char hexdigits[] = "0123456789ABCDEF";
	static const char reserved[] =
		":/?#[]@" /* gen-delims */
		"!$&'()*+,;=" /* sub-delims */
		"<>%\"{}|\\^`~" /* others */;
	size_t len;
	char *escaped, *d, *shrunk;
	const char *c;

	if (!unescaped || !result)
		return NSERROR_NOT_FOUND;

	*result = NULL;

	len = strlen(unescaped);
	if (len < toskip)
		return NSERROR_NOT_FOUND;
	len -= toskip;

	/* Worst case every escaped byte grows to three; refuse input whose
	 * buffer size (toskip + len * 3 + 1) would wrap around size_t. */
	if (len > ((size_t)-1 - toskip - 1) / 3)
		return NSERROR_NOMEM;

	/* One buffer holds both the verbatim prefix and the escaped text. */
	escaped = malloc(toskip + len * 3 + 1);
	if (!escaped)
		return NSERROR_NOMEM;

	memcpy(escaped, unescaped, toskip);
	d = escaped + toskip;

	for (c = unescaped + toskip; *c != '\0'; c++) {
		/* Work on the unsigned value so that the range tests and the
		 * nibble extraction do not depend on plain char's signedness. */
		const unsigned char byte = (unsigned char)*c;
		bool escape;

		/* Check if we should escape this byte.
		 * '~' is unreserved and should not be percent encoded, if
		 * you believe the spec; however, leaving it unescaped
		 * breaks a bunch of websites, so we escape it anyway. */
		if (byte <= 0x20 || byte >= 0x7f) {
			escape = true;
		} else if (strchr(reserved, byte) != NULL) {
			escape = (escexceptions == NULL ||
					strchr(escexceptions, byte) == NULL);
		} else {
			escape = false;
		}

		if (!escape) {
			/* unreserved characters [a-zA-Z0-9-._], plus any
			 * reserved character exempted by escexceptions */
			*d++ = *c;
		} else if (byte == 0x20 && sptoplus) {
			*d++ = '+';
		} else {
			*d++ = '%';
			*d++ = hexdigits[byte >> 4];
			*d++ = hexdigits[byte & 0xf];
		}
	}
	*d++ = '\0';

	/* Give back the unused tail of the worst-case buffer. If the shrink
	 * fails the original (larger) buffer is still valid, so use that. */
	shrunk = realloc(escaped, (size_t)(d - escaped));
	*result = (shrunk != NULL) ? shrunk : escaped;

	return NSERROR_OK;
}
[project @ 2004-03-02 18:02:17 by bursa] Add new url functions and modify to use them. svn path=/import/netsurf/; revision=578 2004-03-02 21:02:41 +03:00			`/*`
Massively optimise common URL functions. This should (a) drastically reduce the start-up time for users with a large history/hotlist, and (b) decrease the loading time for pages with a sizable number of references. svn path=/trunk/netsurf/; revision=2656 2006-06-27 21:59:32 +04:00			`* Copyright 2006 Richard Wilson <info@tinct.net>`
[project @ 2005-04-30 14:31:48 by bursa] Fix bug in url_up_re which caused /xy/../ not to be recognised when xy was 2 characters. Add defines for match part numbers. Simplify part 6(g) of url_join(). svn path=/import/netsurf/; revision=1704 2005-04-30 18:31:48 +04:00			`* Copyright 2005 James Bursa <bursa@users.sourceforge.net>`
			`* Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk>`
Update all source code file headers to reflect GPL version 2 only and contain appropriate licence text svn path=/trunk/netsurf/; revision=3486 2007-08-08 20:16:03 +04:00			`*`
			`* This file is part of NetSurf, http://www.netsurf-browser.org/`
			`*`
			`* NetSurf is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; version 2 of the License.`
			`*`
			`* NetSurf is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
[project @ 2004-03-02 18:02:17 by bursa] Add new url functions and modify to use them. svn path=/import/netsurf/; revision=578 2004-03-02 21:02:41 +03:00			`*/`

			`/** \file`
refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`* \brief Implementation of URL parsing and joining operations.`
[project @ 2004-03-02 18:02:17 by bursa] Add new url functions and modify to use them. svn path=/import/netsurf/; revision=578 2004-03-02 21:02:41 +03:00			`*/`

			`#include <ctype.h>`
			`#include <string.h>`
Split utils header into string functions and everything else split out the string handling API from the rest of the utils header and fix up all the fallout. 2016-04-22 00:36:21 +03:00			`#include <stdlib.h>`
remove uncessary includes 2014-05-02 15:26:25 +04:00			`#include <curl/curl.h>`
add compatability for inet_pton and inet_aton svn path=/trunk/netsurf/; revision=11627 2011-02-07 17:41:44 +03:00
BeOS does not support IPv6, and never will. svn path=/trunk/netsurf/; revision=11014 2010-12-05 22:16:49 +03:00			`#include "utils/config.h"`
Remove the netsurf/ from the include paths and rationalise use of <> vs "" in includes NetSurf includes are now done with ""s and other system includes with <>s as C intended. The scandeps tool has been updated to only look for ""ed includes, and to verify that the files exist in the tree before adding them to the dependency lines. The depend rule has therefore been augmented to make sure the autogenerated files are built before it is run. This is untested under self-hosted RISC OS builds. All else tested and works. svn path=/trunk/netsurf/; revision=3307 2007-05-31 02:39:54 +04:00			`#include "utils/log.h"`
remove uncessary includes 2014-05-02 15:26:25 +04:00			`#include "utils/url.h"`
[project @ 2004-03-02 18:02:17 by bursa] Add new url functions and modify to use them. svn path=/import/netsurf/; revision=578 2004-03-02 21:02:41 +03:00
Massively optimise common URL functions. This should (a) drastically reduce the start-up time for users with a large history/hotlist, and (b) decrease the loading time for pages with a sizable number of references. svn path=/trunk/netsurf/; revision=2656 2006-06-27 21:59:32 +04:00
refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`/* exported interface documented in utils/url.h */`
			`nserror url_unescape(const char str, char *result)`
Merge branches/vince/netsurf-file-fetcher to trunk r=jmb svn path=/trunk/netsurf/; revision=10750 2010-09-10 01:45:59 +04:00			`{`
			`char *curlstr;`
			`char *retstr;`

			`curlstr = curl_unescape(str, 0);`
			`if (curlstr == NULL) {`
refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`return NSERROR_NOMEM;`
Merge branches/vince/netsurf-file-fetcher to trunk r=jmb svn path=/trunk/netsurf/; revision=10750 2010-09-10 01:45:59 +04:00			`}`

			`retstr = strdup(curlstr);`
			`curl_free(curlstr);`

			`if (retstr == NULL) {`
refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`return NSERROR_NOMEM;`
Merge branches/vince/netsurf-file-fetcher to trunk r=jmb svn path=/trunk/netsurf/; revision=10750 2010-09-10 01:45:59 +04:00			`}`

			`*result = retstr;`
refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`return NSERROR_OK;`
Merge branches/vince/netsurf-file-fetcher to trunk r=jmb svn path=/trunk/netsurf/; revision=10750 2010-09-10 01:45:59 +04:00			`}`
[project @ 2005-07-23 20:43:37 by bursa] Rewrite and simplify url_nice() to improve suggested filenames. Add option to keep extensions (no UI currently). svn path=/import/netsurf/; revision=1814 2005-07-24 00:43:37 +04:00

refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`/* exported interface documented in utils/url.h */`
			`nserror url_escape(const char *unescaped, size_t toskip,`
- riscos/gui.c(path_to_url): escape the characters which need to be escaped when converting the host path to file: URL. - utils/{url.c,url.h}(url_escape): * added parameter 'toskip' to specify number of input characters which need to be skipped in the escape process. This avoids extra malloc buffer juggling. * added parameter 'escexceptions' to specify the characters which need to be excluded from the escape process. Solves SF tracker ID 1910169. Note that when discname in path contains '/' characters (case: "file:///Sunfish#192.168.0.50::/home/joty.$/jo.html") or there is no discname specified at all (case "file:///HostFS:$/jo.htm"), you need an UnixLib fix as in http://www.riscos.info/websvn/listing.php?repname=gccsdk&path=%2Ftrunk%2Fgcc4%2F&rev=3395&sc=1 svn path=/trunk/netsurf/; revision=4069 2008-04-02 04:43:51 +04:00			`bool sptoplus, const char escexceptions, char *result)`
[project @ 2005-06-26 22:18:37 by jmb] Improve clarity of use of utf8_to_enc. Remove use of curl_escape - url_escape does similar things, just better. svn path=/import/netsurf/; revision=1766 2005-06-27 02:18:37 +04:00			`{`
- riscos/gui.c(path_to_url): escape the characters which need to be escaped when converting the host path to file: URL. - utils/{url.c,url.h}(url_escape): * added parameter 'toskip' to specify number of input characters which need to be skipped in the escape process. This avoids extra malloc buffer juggling. * added parameter 'escexceptions' to specify the characters which need to be excluded from the escape process. Solves SF tracker ID 1910169. Note that when discname in path contains '/' characters (case: "file:///Sunfish#192.168.0.50::/home/joty.$/jo.html") or there is no discname specified at all (case "file:///HostFS:$/jo.htm"), you need an UnixLib fix as in http://www.riscos.info/websvn/listing.php?repname=gccsdk&path=%2Ftrunk%2Fgcc4%2F&rev=3395&sc=1 svn path=/trunk/netsurf/; revision=4069 2008-04-02 04:43:51 +04:00			`size_t len;`
			`char escaped, d, *tmpres;`
[project @ 2005-06-26 22:18:37 by jmb] Improve clarity of use of utf8_to_enc. Remove use of curl_escape - url_escape does similar things, just better. svn path=/import/netsurf/; revision=1766 2005-06-27 02:18:37 +04:00			`const char *c;`

			`if (!unescaped \|\| !result)`
refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`return NSERROR_NOT_FOUND;`
[project @ 2005-06-26 22:18:37 by jmb] Improve clarity of use of utf8_to_enc. Remove use of curl_escape - url_escape does similar things, just better. svn path=/import/netsurf/; revision=1766 2005-06-27 02:18:37 +04:00
			`*result = NULL;`

			`len = strlen(unescaped);`
- riscos/gui.c(path_to_url): escape the characters which need to be escaped when converting the host path to file: URL. - utils/{url.c,url.h}(url_escape): * added parameter 'toskip' to specify number of input characters which need to be skipped in the escape process. This avoids extra malloc buffer juggling. * added parameter 'escexceptions' to specify the characters which need to be excluded from the escape process. Solves SF tracker ID 1910169. Note that when discname in path contains '/' characters (case: "file:///Sunfish#192.168.0.50::/home/joty.$/jo.html") or there is no discname specified at all (case "file:///HostFS:$/jo.htm"), you need an UnixLib fix as in http://www.riscos.info/websvn/listing.php?repname=gccsdk&path=%2Ftrunk%2Fgcc4%2F&rev=3395&sc=1 svn path=/trunk/netsurf/; revision=4069 2008-04-02 04:43:51 +04:00			`if (len < toskip)`
refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`return NSERROR_NOT_FOUND;`
- riscos/gui.c(path_to_url): escape the characters which need to be escaped when converting the host path to file: URL. - utils/{url.c,url.h}(url_escape): * added parameter 'toskip' to specify number of input characters which need to be skipped in the escape process. This avoids extra malloc buffer juggling. * added parameter 'escexceptions' to specify the characters which need to be excluded from the escape process. Solves SF tracker ID 1910169. Note that when discname in path contains '/' characters (case: "file:///Sunfish#192.168.0.50::/home/joty.$/jo.html") or there is no discname specified at all (case "file:///HostFS:$/jo.htm"), you need an UnixLib fix as in http://www.riscos.info/websvn/listing.php?repname=gccsdk&path=%2Ftrunk%2Fgcc4%2F&rev=3395&sc=1 svn path=/trunk/netsurf/; revision=4069 2008-04-02 04:43:51 +04:00			`len -= toskip;`
[project @ 2005-06-26 22:18:37 by jmb] Improve clarity of use of utf8_to_enc. Remove use of curl_escape - url_escape does similar things, just better. svn path=/import/netsurf/; revision=1766 2005-06-27 02:18:37 +04:00
			`escaped = malloc(len * 3 + 1);`
			`if (!escaped)`
refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`return NSERROR_NOMEM;`
[project @ 2004-03-27 23:18:52 by bursa] Implement url_nice() and make save boxes use it to choose the default filename. svn path=/import/netsurf/; revision=682 2004-03-28 03:18:52 +04:00
- riscos/gui.c(path_to_url): escape the characters which need to be escaped when converting the host path to file: URL. - utils/{url.c,url.h}(url_escape): * added parameter 'toskip' to specify number of input characters which need to be skipped in the escape process. This avoids extra malloc buffer juggling. * added parameter 'escexceptions' to specify the characters which need to be excluded from the escape process. Solves SF tracker ID 1910169. Note that when discname in path contains '/' characters (case: "file:///Sunfish#192.168.0.50::/home/joty.$/jo.html") or there is no discname specified at all (case "file:///HostFS:$/jo.htm"), you need an UnixLib fix as in http://www.riscos.info/websvn/listing.php?repname=gccsdk&path=%2Ftrunk%2Fgcc4%2F&rev=3395&sc=1 svn path=/trunk/netsurf/; revision=4069 2008-04-02 04:43:51 +04:00			`for (c = unescaped + toskip, d = escaped; *c; c++) {`
Bring percent-encoding closer to what other browsers do. svn path=/trunk/netsurf/; revision=3179 2007-02-12 01:28:00 +03:00			`/* Check if we should escape this byte.`
			`* '~' is unreserved and should not be percent encoded, if`
			`* you believe the spec; however, leaving it unescaped`
			`* breaks a bunch of websites, so we escape it anyway. */`
- riscos/gui.c(path_to_url): escape the characters which need to be escaped when converting the host path to file: URL. - utils/{url.c,url.h}(url_escape): * added parameter 'toskip' to specify number of input characters which need to be skipped in the escape process. This avoids extra malloc buffer juggling. * added parameter 'escexceptions' to specify the characters which need to be excluded from the escape process. Solves SF tracker ID 1910169. Note that when discname in path contains '/' characters (case: "file:///Sunfish#192.168.0.50::/home/joty.$/jo.html") or there is no discname specified at all (case "file:///HostFS:$/jo.htm"), you need an UnixLib fix as in http://www.riscos.info/websvn/listing.php?repname=gccsdk&path=%2Ftrunk%2Fgcc4%2F&rev=3395&sc=1 svn path=/trunk/netsurf/; revision=4069 2008-04-02 04:43:51 +04:00			`if (!isascii(*c)`
			`\|\| (strchr(":/?#[]@" /* gen-delims */`
			`"!$&'()+,;=" / sub-delims */`
			"<>%\"{}\|\\^`~" /* others /, c)
			`&& (!escexceptions \|\| !strchr(escexceptions, *c)))`
			`\|\| c <= 0x20 \|\| c == 0x7f) {`
Bring percent-encoding closer to what other browsers do. svn path=/trunk/netsurf/; revision=3179 2007-02-12 01:28:00 +03:00			`if (*c == 0x20 && sptoplus) {`
Correcly encode spaces for url-encoded form submission. svn path=/trunk/netsurf/; revision=2960 2006-09-19 14:08:33 +04:00			`*d++ = '+';`
Bring percent-encoding closer to what other browsers do. svn path=/trunk/netsurf/; revision=3179 2007-02-12 01:28:00 +03:00			`} else {`
Correcly encode spaces for url-encoded form submission. svn path=/trunk/netsurf/; revision=2960 2006-09-19 14:08:33 +04:00			`*d++ = '%';`
			`d++ = "0123456789ABCDEF"[((c >> 4) & 0xf)];`
			`d++ = "0123456789ABCDEF"[(c & 0xf)];`
			`}`
Bring percent-encoding closer to what other browsers do. svn path=/trunk/netsurf/; revision=3179 2007-02-12 01:28:00 +03:00			`} else {`
			`/* unreserved characters: [a-zA-Z0-9-._] */`
[project @ 2005-06-26 22:18:37 by jmb] Improve clarity of use of utf8_to_enc. Remove use of curl_escape - url_escape does similar things, just better. svn path=/import/netsurf/; revision=1766 2005-06-27 02:18:37 +04:00			`d++ = c;`
			`}`
			`}`
[project @ 2005-06-27 01:57:54 by adrianl] Fix termination of output from url_escape svn path=/import/netsurf/; revision=1770 2005-06-27 05:57:54 +04:00			`*d++ = '\0';`

- riscos/gui.c(path_to_url): escape the characters which need to be escaped when converting the host path to file: URL. - utils/{url.c,url.h}(url_escape): * added parameter 'toskip' to specify number of input characters which need to be skipped in the escape process. This avoids extra malloc buffer juggling. * added parameter 'escexceptions' to specify the characters which need to be excluded from the escape process. Solves SF tracker ID 1910169. Note that when discname in path contains '/' characters (case: "file:///Sunfish#192.168.0.50::/home/joty.$/jo.html") or there is no discname specified at all (case "file:///HostFS:$/jo.htm"), you need an UnixLib fix as in http://www.riscos.info/websvn/listing.php?repname=gccsdk&path=%2Ftrunk%2Fgcc4%2F&rev=3395&sc=1 svn path=/trunk/netsurf/; revision=4069 2008-04-02 04:43:51 +04:00			`tmpres = malloc(d - escaped + toskip);`
			`if (!tmpres) {`
[project @ 2005-06-26 22:18:37 by jmb] Improve clarity of use of utf8_to_enc. Remove use of curl_escape - url_escape does similar things, just better. svn path=/import/netsurf/; revision=1766 2005-06-27 02:18:37 +04:00			`free(escaped);`
refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`return NSERROR_NOMEM;`
[project @ 2005-06-26 22:18:37 by jmb] Improve clarity of use of utf8_to_enc. Remove use of curl_escape - url_escape does similar things, just better. svn path=/import/netsurf/; revision=1766 2005-06-27 02:18:37 +04:00			`}`
[project @ 2005-06-27 01:57:54 by adrianl] Fix termination of output from url_escape svn path=/import/netsurf/; revision=1770 2005-06-27 05:57:54 +04:00
- riscos/gui.c(path_to_url): escape the characters which need to be escaped when converting the host path to file: URL. - utils/{url.c,url.h}(url_escape): * added parameter 'toskip' to specify number of input characters which need to be skipped in the escape process. This avoids extra malloc buffer juggling. * added parameter 'escexceptions' to specify the characters which need to be excluded from the escape process. Solves SF tracker ID 1910169. Note that when discname in path contains '/' characters (case: "file:///Sunfish#192.168.0.50::/home/joty.$/jo.html") or there is no discname specified at all (case "file:///HostFS:$/jo.htm"), you need an UnixLib fix as in http://www.riscos.info/websvn/listing.php?repname=gccsdk&path=%2Ftrunk%2Fgcc4%2F&rev=3395&sc=1 svn path=/trunk/netsurf/; revision=4069 2008-04-02 04:43:51 +04:00			`memcpy(tmpres, unescaped, toskip);`
			`memcpy(tmpres + toskip, escaped, d - escaped);`
			`*result = tmpres;`
[project @ 2005-06-26 22:18:37 by jmb] Improve clarity of use of utf8_to_enc. Remove use of curl_escape - url_escape does similar things, just better. svn path=/import/netsurf/; revision=1766 2005-06-27 02:18:37 +04:00
			`free(escaped);`

refactor url utility functions to use standard nserror codes and have appropriate documentation. 2014-05-08 03:16:50 +04:00			`return NSERROR_OK;`
[project @ 2005-06-26 22:18:37 by jmb] Improve clarity of use of utf8_to_enc. Remove use of curl_escape - url_escape does similar things, just better. svn path=/import/netsurf/; revision=1766 2005-06-27 02:18:37 +04:00			`}`