/*
 * This file is part of NetSurf, http://netsurf.sourceforge.net/
 * Licensed under the GNU General Public License,
 *                http://www.opensource.org/licenses/gpl-license
 * Copyright 2006 John M Bell
 */

/** \file
 * Unified URL information database (implementation)
 *
 * URLs are stored in a tree-based structure as follows:
 *
 * The host component is extracted from each URL and, if a FQDN, split on
 * every '.'. The tree is constructed by inserting each FQDN segment in
 * reverse order. Duplicate nodes are merged.
 *
 * If the host part of an URL is an IP address, then this is added to the
 * tree verbatim (as if it were a TLD).
 *
 * This provides something looking like:
 *
 *                            root (a sentinel)
 *                              |
 *      -------------------------------------------------
 *      |       |       |       |       |       |       |
 *     com     edu     gov  127.0.0.1  net     org     uk      TLDs
 *      |       |       |               |       |       |
 *    google   ...     ...             ...     ...     co      2LDs
 *      |                                               |
 *     www                                             bbc     Hosts/Subdomains
 *                                                      |
 *                                                     www     ...
 *
 * Each of the nodes in this tree is a struct host_part. This stores the
 * FQDN segment (or IP address) with which the node is concerned. Each node
 * may contain further information about paths on a host (struct path_data)
 * or SSL certificate processing on a host-wide basis
 * (host_part::permit_invalid_certs).
 *
 * Path data is concerned with storing various metadata about the path in
 * question. This includes global history data, HTTP authentication details
 * and any associated HTTP cookies. This is stored as a tree of path segments
 * hanging off the relevant host_part node.
 *
 * Therefore, to find the last visited time of the URL
 * http://www.example.com/path/to/resource.html, the FQDN tree would be
 * traversed in the order root -> "com" -> "example" -> "www". The "www"
 * node would have attached to it a tree of struct path_data:
 *
 *                          (sentinel)
 *                              |
 *                             path
 *                              |
 *                             to
 *                              |
 *                         resource.html
 *
 * This represents the absolute path "/path/to/resource.html". The leaf node
 * "resource.html" contains the last visited time of the resource.
* * The mechanism described above is, however, not particularly conducive to * fast searching of the database for a given URL (or URLs beginning with a * given prefix). Therefore, an anciliary data structure is used to enable * fast searching. This structure simply reflects the contents of the * database, with entries being added/removed at the same time as for the * core database. In order to ensure that degenerate cases are kept to a * minimum, we use an AAtree. This is an approximation of a Red-Black tree * with similar performance characteristics, but with a significantly * simpler implementation. Entries in this tree comprise pointers to the * leaf nodes of the host tree described above. */ #include #include #include #include #include #include #include #include #include "netsurf/image/bitmap.h" #include "netsurf/content/urldb.h" #include "netsurf/desktop/cookies.h" #include "netsurf/desktop/options.h" #ifdef riscos /** \todo lose this */ #include "netsurf/riscos/bitmap.h" #endif #include "netsurf/utils/log.h" #include "netsurf/utils/url.h" #include "netsurf/utils/utils.h" struct cookie_internal_data { char *name; /**< Cookie name */ char *value; /**< Cookie value */ char *comment; /**< Cookie comment */ bool domain_from_set; /**< Domain came from Set-Cookie: header */ char *domain; /**< Domain */ bool path_from_set; /**< Path came from Set-Cookie: header */ char *path; /**< Path */ time_t expires; /**< Expiry timestamp, or 1 for session */ time_t last_used; /**< Last used time */ bool secure; /**< Only send for HTTPS requests */ cookie_version version; /**< Specification compliance */ bool no_destroy; /**< Never destroy this cookie, * unless it's expired */ struct cookie_internal_data *prev; /**< Previous in list */ struct cookie_internal_data *next; /**< Next in list */ }; struct auth_data { char *realm; /**< Protection realm */ char *auth; /**< Authentication details in form * username:password */ }; struct url_internal_data { char *title; /**< Resource 
title */ unsigned int visits; /**< Visit count */ time_t last_visit; /**< Last visit time */ content_type type; /**< Type of resource */ }; struct path_data { char *url; /**< Full URL */ char *scheme; /**< URL scheme for data */ unsigned int port; /**< Port number for data */ char *segment; /**< Path segment for this node */ unsigned int frag_cnt; /**< Number of entries in ::fragment */ char **fragment; /**< Array of fragments */ bool persistent; /**< This entry should persist */ struct bitmap *thumb; /**< Thumbnail image of resource */ struct url_internal_data urld; /**< URL data for resource */ struct auth_data auth; /**< Authentication data for resource */ struct cookie_internal_data *cookies; /**< Cookies associated with resource */ struct path_data *next; /**< Next sibling */ struct path_data *prev; /**< Previous sibling */ struct path_data *parent; /**< Parent path segment */ struct path_data *children; /**< Child path segments */ struct path_data *last; /**< Last child */ }; struct host_part { /**< Known paths on this host. 
This _must_ be first so that * struct host_part *h = (struct host_part *)mypath; works */ struct path_data paths; bool permit_invalid_certs; /**< Allow access to SSL protected * resources on this host without * verifying certificate authenticity */ char *part; /**< Part of host string */ struct host_part *next; /**< Next sibling */ struct host_part *prev; /**< Previous sibling */ struct host_part *parent; /**< Parent host part */ struct host_part *children; /**< Child host parts */ }; struct search_node { const struct host_part *data; /**< Host tree entry */ unsigned int level; /**< Node level */ struct search_node *left; /**< Left subtree */ struct search_node *right; /**< Right subtree */ }; /* Saving */ static void urldb_save_search_tree(struct search_node *root, FILE *fp); static void urldb_count_urls(const struct path_data *root, time_t expiry, unsigned int *count); static void urldb_write_paths(const struct path_data *parent, const char *host, FILE *fp, char **path, int *path_alloc, int *path_used, time_t expiry); /* Iteration */ static bool urldb_iterate_partial_host(struct search_node *root, const char *prefix, bool (*callback)(const char *url, const struct url_data *data)); static bool urldb_iterate_partial_path(const struct path_data *parent, const char *prefix, bool (*callback)(const char *url, const struct url_data *data)); static bool urldb_iterate_entries_host(struct search_node *parent, bool (*url_callback)(const char *url, const struct url_data *data), bool (*cookie_callback)(const struct cookie_data *data)); static bool urldb_iterate_entries_path(const struct path_data *parent, bool (*url_callback)(const char *url, const struct url_data *data), bool (*cookie_callback)(const struct cookie_data *data)); /* Insertion */ static struct host_part *urldb_add_host_node(const char *part, struct host_part *parent); static struct host_part *urldb_add_host(const char *host); static struct path_data *urldb_add_path_node(const char *scheme, unsigned int port, 
const char *segment, const char *fragment, struct path_data *parent); static struct path_data *urldb_add_path(const char *scheme, unsigned int port, const struct host_part *host, const char *path, const char *fragment, const char *url_no_frag); static int urldb_add_path_fragment_cmp(const void *a, const void *b); static struct path_data *urldb_add_path_fragment(struct path_data *segment, const char *fragment); /* Lookup */ static struct path_data *urldb_find_url(const char *url); static struct path_data *urldb_match_path(const struct path_data *parent, const char *path, const char *scheme, unsigned short port); /* Dump */ static void urldb_dump_hosts(struct host_part *parent); static void urldb_dump_paths(struct path_data *parent); static void urldb_dump_search(struct search_node *parent, int depth); /* Search tree */ static struct search_node *urldb_search_insert(struct search_node *root, const struct host_part *data); static struct search_node *urldb_search_insert_internal( struct search_node *root, struct search_node *n); static struct search_node *urldb_search_remove(struct search_node *root, const struct host_part *data); static const struct host_part *urldb_search_find(struct search_node *root, const char *host); static struct search_node *urldb_search_skew(struct search_node *root); static struct search_node *urldb_search_split(struct search_node *root); static int urldb_search_match_host(const struct host_part *a, const struct host_part *b); static int urldb_search_match_string(const struct host_part *a, const char *b); static int urldb_search_match_prefix(const struct host_part *a, const char *b); /* Cookies */ static struct cookie_internal_data *urldb_parse_cookie(const char *url, const char **cookie); static bool urldb_parse_avpair(struct cookie_internal_data *c, char *n, char *v); static bool urldb_insert_cookie(struct cookie_internal_data *c, const char *scheme, const char *url); static void urldb_free_cookie(struct cookie_internal_data *c); static 
bool urldb_concat_cookie(struct cookie_internal_data *c, int *used, int *alloc, char **buf); static void urldb_save_cookie_hosts(FILE *fp, struct host_part *parent); static void urldb_save_cookie_paths(FILE *fp, struct path_data *parent); /** Root database handle */ static struct host_part db_root; /** Search trees - one per letter + 1 for IPs */ #define NUM_SEARCH_TREES 27 #define ST_IP 0 #define ST_DN 1 static struct search_node empty = { 0, 0, &empty, &empty }; static struct search_node *search_trees[NUM_SEARCH_TREES] = { &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty }; #define COOKIE_FILE_VERSION 100 #define URL_FILE_VERSION 106 /** * Import an URL database from file, replacing any existing database * * \param filename Name of file containing data */ void urldb_load(const char *filename) { #define MAXIMUM_URL_LENGTH 4096 char s[MAXIMUM_URL_LENGTH]; char host[256]; struct host_part *h; int urls; int i; int version; int length; FILE *fp; assert(filename); LOG(("Loading URL file")); fp = fopen(filename, "r"); if (!fp) { LOG(("Failed to open file '%s' for reading", filename)); return; } if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) return; version = atoi(s); if (version < 105) { LOG(("Unsupported URL file version.")); return; } if (version > URL_FILE_VERSION) { LOG(("Unknown URL file version.")); return; } while (fgets(host, sizeof host, fp)) { /* get the hostname */ length = strlen(host) - 1; host[length] = '\0'; /* skip data that has ended up with a host of '' */ if (length == 0) { if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; urls = atoi(s); for (i = 0; i < ((version == 105 ? 
6 : 8) * urls); i++) if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; continue; } if (version == 105) { /* file:/ -> localhost */ if (strcasecmp(host, "file:/") == 0) snprintf(host, sizeof host, "localhost"); else { /* strip any port number */ char *colon = strrchr(host, ':'); if (colon) *colon = '\0'; } } /* read number of URLs */ if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; urls = atoi(s); /* no URLs => try next host */ if (urls == 0) { LOG(("No URLs for '%s'", host)); continue; } h = urldb_add_host(host); if (!h) die("Memory exhausted whilst loading URL file"); /* load the non-corrupt data */ for (i = 0; i < urls; i++) { struct path_data *p = NULL; if (version == 105) { if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; length = strlen(s) - 1; s[length] = '\0'; if (strncasecmp(s, "file:", 5) == 0) { /* local file, so fudge insertion */ char url[7 + 4096]; snprintf(url, sizeof url, "file://%s", s + 5); p = urldb_add_path("file", 0, h, s + 5, NULL, url); if (!p) { LOG(("Failed inserting '%s'", url)); die("Memory exhausted " "whilst loading " "URL file"); } } else { if (!urldb_add_url(s)) { LOG(("Failed inserting '%s'", s)); } p = urldb_find_url(s); } } else { char scheme[64], ports[10]; char url[64 + 3 + 256 + 6 + 4096 + 1]; unsigned int port; bool is_file = false; if (!fgets(scheme, sizeof scheme, fp)) break; length = strlen(scheme) - 1; scheme[length] = '\0'; if (!fgets(ports, sizeof ports, fp)) break; length = strlen(ports) - 1; ports[length] = '\0'; port = atoi(ports); if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; length = strlen(s) - 1; s[length] = '\0'; if (!strcasecmp(host, "localhost") && !strcasecmp(scheme, "file")) is_file = true; snprintf(url, sizeof url, "%s://%s%s%s%s", scheme, /* file URLs have no host */ (is_file ? "" : host), (port ? ":" : ""), (port ? 
ports : ""), s); p = urldb_add_path(scheme, port, h, s, NULL, url); if (!p) { LOG(("Failed inserting '%s'", url)); die("Memory exhausted whilst loading " "URL file"); } } if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; if (p) p->urld.visits = (unsigned int)atoi(s); if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; if (p) p->urld.last_visit = (time_t)atoi(s); if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; if (p) p->urld.type = (content_type)atoi(s); if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; #ifdef riscos if (p && strlen(s) == 12) { /* ensure filename is 'XX.XX.XX.XX' */ if ((s[2] == '.') && (s[5] == '.') && (s[8] == '.')) { s[2] = '/'; s[5] = '/'; s[8] = '/'; s[11] = '\0'; p->thumb = bitmap_create_file(s); } else if ((s[2] == '/') && (s[5] == '/') && (s[8] == '/')) { s[11] = '\0'; p->thumb = bitmap_create_file(s); } } #endif if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) break; length = strlen(s) - 1; if (p && length > 0) { s[length] = '\0'; p->urld.title = malloc(length + 1); if (p->urld.title) memcpy(p->urld.title, s, length + 1); } } } fclose(fp); LOG(("Successfully loaded URL file")); #undef MAXIMUM_URL_LENGTH } /** * Export the current database to file * * \param filename Name of file to export to */ void urldb_save(const char *filename) { FILE *fp; int i; assert(filename); fp = fopen(filename, "w"); if (!fp) { LOG(("Failed to open file '%s' for writing", filename)); return; } /* file format version number */ fprintf(fp, "%d\n", URL_FILE_VERSION); for (i = 0; i != NUM_SEARCH_TREES; i++) { urldb_save_search_tree(search_trees[i], fp); } fclose(fp); } /** * Save a search (sub)tree * * \param root Root of (sub)tree to save * \param fp File to write to */ void urldb_save_search_tree(struct search_node *parent, FILE *fp) { char host[256]; const struct host_part *h; unsigned int path_count = 0; char *path, *p, *end; int path_alloc = 64, path_used = 2; time_t expiry = time(NULL) - (60 * 60 * 24) * option_expire_url; if (parent == &empty) return; 
urldb_save_search_tree(parent->left, fp); path = malloc(path_alloc); if (!path) return; path[0] = '/'; path[1] = '\0'; for (h = parent->data, p = host, end = host + sizeof host; h && h != &db_root && p < end; h = h->parent) { int written = snprintf(p, end - p, "%s%s", h->part, (h->parent && h->parent->parent) ? "." : ""); if (written < 0) { free(path); return; } p += written; } urldb_count_urls(&parent->data->paths, expiry, &path_count); if (path_count > 0) { fprintf(fp, "%s\n%i\n", host, path_count); urldb_write_paths(&parent->data->paths, host, fp, &path, &path_alloc, &path_used, expiry); } free(path); urldb_save_search_tree(parent->right, fp); } /** * Count number of URLs associated with a host * * \param root Root of path data tree * \param expiry Expiry time for URLs * \param count Pointer to count */ void urldb_count_urls(const struct path_data *root, time_t expiry, unsigned int *count) { const struct path_data *p; if (!root->children) { if (root->persistent || ((root->urld.last_visit > expiry) && (root->urld.visits > 0))) (*count)++; } for (p = root->children; p; p = p->next) urldb_count_urls(p, expiry, count); } /** * Write paths associated with a host * * \param parent Root of (sub)tree to write * \param host Current host name * \param fp File to write to * \param path Current path string * \param path_alloc Allocated size of path * \param path_used Used size of path * \param expiry Expiry time of URLs */ void urldb_write_paths(const struct path_data *parent, const char *host, FILE *fp, char **path, int *path_alloc, int *path_used, time_t expiry) { const struct path_data *p; int i; int pused = *path_used; if (!parent->children) { /* leaf node */ if (!(parent->persistent || ((parent->urld.last_visit > expiry) && (parent->urld.visits > 0)))) /* expired */ return; fprintf(fp, "%s\n", parent->scheme); if (parent->port) fprintf(fp,"%d\n", parent->port); else fprintf(fp, "\n"); fprintf(fp, "%s\n", *path); /** \todo handle fragments? 
*/ fprintf(fp, "%i\n%i\n%i\n", parent->urld.visits, (int)parent->urld.last_visit, (int)parent->urld.type); #ifdef riscos if (parent->thumb) fprintf(fp, "%s\n", parent->thumb->filename); else fprintf(fp, "\n"); #else fprintf(fp, "\n"); #endif if (parent->urld.title) { char *s = parent->urld.title; for (i = 0; s[i] != '\0'; i++) if (s[i] < 32) s[i] = ' '; for (--i; ((i > 0) && (s[i] == ' ')); i--) s[i] = '\0'; fprintf(fp, "%s\n", parent->urld.title); } else fprintf(fp, "\n"); } for (p = parent->children; p; p = p->next) { int len = *path_used + strlen(p->segment) + 1; if (*path_alloc < len) { char *temp = realloc(*path, (len > 64) ? len : *path_alloc + 64); if (!temp) return; *path = temp; *path_alloc = (len > 64) ? len : *path_alloc + 64; } strcat(*path, p->segment); if (p->children) { strcat(*path, "/"); } else { len -= 1; } *path_used = len; urldb_write_paths(p, host, fp, path, path_alloc, path_used, expiry); /* restore path to its state on entry to this function */ *path_used = pused; (*path)[pused - 1] = '\0'; } } /** * Set the cross-session persistence of the entry for an URL * * \param url Absolute URL to persist * \param persist True to persist, false otherwise */ void urldb_set_url_persistence(const char *url, bool persist) { struct path_data *p; assert(url); p = urldb_find_url(url); if (!p) return; p->persistent = persist; } /** * Insert an URL into the database * * \param url Absolute URL to insert * \return true on success, false otherwise */ bool urldb_add_url(const char *url) { struct host_part *h; struct path_data *p; char *fragment = NULL, *host, *plq, *scheme, *colon, *urlt; unsigned short port; url_func_result ret; assert(url); urlt = strdup(url); if (!urlt) return false; host = strchr(urlt, '#'); if (host) { *host = '\0'; fragment = strdup(host+1); if (!fragment) { free(urlt); return false; } } /* extract host */ ret = url_host(url, &host); if (ret != URL_FUNC_OK) { free(fragment); free(urlt); return false; } /* extract path, leafname, query */ ret 
= url_plq(url, &plq); if (ret != URL_FUNC_OK) { free(host); free(fragment); free(urlt); return false; } /* extract scheme */ ret = url_scheme(url, &scheme); if (ret != URL_FUNC_OK) { free(plq); free(host); free(fragment); free(urlt); return false; } colon = strrchr(host, ':'); if (!colon) { port = 0; } else { *colon = '\0'; port = atoi(colon + 1); } /* Get host entry */ if (strcasecmp(scheme, "file") == 0) h = urldb_add_host("localhost"); else h = urldb_add_host(host); if (!h) { free(scheme); free(plq); free(host); free(fragment); free(urlt); return false; } /* Get path entry */ p = urldb_add_path(scheme, port, h, plq, fragment, urlt); if (!p) { return false; } free(scheme); free(plq); free(host); free(fragment); free(urlt); return true; } /** * Set an URL's title string, replacing any existing one * * \param url The URL to look for * \param title The title string to use (copied) */ void urldb_set_url_title(const char *url, const char *title) { struct path_data *p; char *temp; assert(url && title); p = urldb_find_url(url); if (!p) return; temp = strdup(title); if (!temp) return; free(p->urld.title); p->urld.title = temp; } /** * Set an URL's content type * * \param url The URL to look for * \param type The type to set */ void urldb_set_url_content_type(const char *url, content_type type) { struct path_data *p; assert(url); p = urldb_find_url(url); if (!p) return; p->urld.type = type; } /** * Update an URL's visit data * * \param url The URL to update */ void urldb_update_url_visit_data(const char *url) { struct path_data *p; assert(url); p = urldb_find_url(url); if (!p) return; p->urld.last_visit = time(NULL); p->urld.visits++; } /** * Reset an URL's visit statistics * * \param url The URL to reset */ void urldb_reset_url_visit_data(const char *url) { struct path_data *p; assert(url); p = urldb_find_url(url); if (!p) return; p->urld.last_visit = (time_t)0; p->urld.visits = 0; } /** * Find data for an URL. 
* * \param url Absolute URL to look for * \return Pointer to result struct, or NULL */ const struct url_data *urldb_get_url_data(const char *url) { struct path_data *p; assert(url); p = urldb_find_url(url); if (!p) return NULL; return (struct url_data *)&p->urld; } /** * Extract an URL from the db * * \param url URL to extract * \return Pointer to database's copy of URL or NULL if not found */ const char *urldb_get_url(const char *url) { struct path_data *p; assert(url); p = urldb_find_url(url); if (!p) return NULL; return p->url; } /** * Look up authentication details in database * * \param url Absolute URL to search for * \return Pointer to authentication details, or NULL if not found */ const char *urldb_get_auth_details(const char *url) { struct path_data *p, *q = NULL; assert(url); /* add to the db, so our lookup will work */ urldb_add_url(url); p = urldb_find_url(url); if (!p) return NULL; /* Check for any auth details attached to this node */ if (p && p->auth.realm && p->auth.auth) return p->auth.auth; /* Now consider ancestors */ for (; p; p = p->parent) { /* The parent path entry is stored hung off the * parent entry with an empty (not NULL) segment string. * We look for this here. */ for (q = p->children; q; q = q->next) { if (strlen(q->segment) == 0) break; } if (q && q->auth.realm && q->auth.auth) break; } if (!q) return NULL; return q->auth.auth; } /** * Retrieve certificate verification permissions from database * * \param url Absolute URL to search for * \return true to permit connections to hosts with invalid certificates, * false otherwise. 
*/ bool urldb_get_cert_permissions(const char *url) { struct path_data *p; struct host_part *h; assert(url); p = urldb_find_url(url); if (!p) return false; for (; p && p->parent; p = p->parent) /* do nothing */; h = (struct host_part *)p; return h->permit_invalid_certs; } /** * Set authentication data for an URL * * \param url The URL to consider * \param realm The authentication realm * \param auth The authentication details (in form username:password) */ void urldb_set_auth_details(const char *url, const char *realm, const char *auth) { struct path_data *p; char *urlt, *t1, *t2; assert(url && realm && auth); urlt = strdup(url); if (!urlt) return; /* strip leafname from URL */ t1 = strrchr(urlt, '/'); if (t1) { *(t1 + 1) = '\0'; } /* add url, in case it's missing */ urldb_add_url(urlt); p = urldb_find_url(urlt); free(urlt); if (!p) return; /** \todo search subtree for same realm/auth details * and remove them (as the lookup routine searches up the tree) */ t1 = strdup(realm); t2 = strdup(auth); if (!t1 || !t2) { free(t1); free(t2); return; } free(p->auth.realm); free(p->auth.auth); p->auth.realm = t1; p->auth.auth = t2; } /** * Set certificate verification permissions * * \param url URL to consider * \param permit Set to true to allow invalid certificates */ void urldb_set_cert_permissions(const char *url, bool permit) { struct path_data *p; struct host_part *h; assert(url); /* add url, in case it's missing */ urldb_add_url(url); p = urldb_find_url(url); if (!p) return; for (; p && p->parent; p = p->parent) /* do nothing */; h = (struct host_part *)p; h->permit_invalid_certs = permit; } /** * Set thumbnail for url, replacing any existing thumbnail * * \param url Absolute URL to consider * \param bitmap Opaque pointer to thumbnail data, or NULL to invalidate */ void urldb_set_thumbnail(const char *url, struct bitmap *bitmap) { struct path_data *p; assert(url); p = urldb_find_url(url); if (!p) return; if (p->thumb) bitmap_destroy(p->thumb); p->thumb = bitmap; } /** 
* Retrieve thumbnail data for given URL * * \param url Absolute URL to search for * \return Pointer to thumbnail data, or NULL if not found. */ const struct bitmap *urldb_get_thumbnail(const char *url) { struct path_data *p; assert(url); p = urldb_find_url(url); if (!p) return NULL; return p->thumb; } /** * Iterate over entries in the database which match the given prefix * * \param prefix Prefix to match * \param callback Callback function */ void urldb_iterate_partial(const char *prefix, bool (*callback)(const char *url, const struct url_data *data)) { char host[256]; char buf[260]; /* max domain + "www." */ const char *slash, *scheme_sep; struct search_node *tree; const struct host_part *h; assert(prefix && callback); /* strip scheme */ scheme_sep = strstr(prefix, "://"); if (scheme_sep) prefix = scheme_sep + 3; slash = strchr(prefix, '/'); if (*prefix >= '0' && *prefix <= '9') tree = search_trees[ST_IP]; else if (isalpha(*prefix)) tree = search_trees[ST_DN + tolower(*prefix) - 'a']; else return; if (slash) { /* if there's a slash in the input, then we can * assume that we're looking for a path */ char *domain = host; snprintf(host, sizeof host, "%.*s", slash - prefix, prefix); h = urldb_search_find(tree, host); if (!h) { int len = slash - prefix; if ((len == 1 && tolower(host[0]) != 'w') || (len == 2 && (tolower(host[0]) != 'w' || tolower(host[1]) != 'w')) || (len >= 3 && strncasecmp(host, "www", 3))) { snprintf(buf, sizeof buf, "www.%s", host); h = urldb_search_find( search_trees[ST_DN + 'w' - 'a'], buf); if (!h) return; domain = buf; } else return; } if (h->paths.children) { /* Have paths, iterate them */ urldb_iterate_partial_path(&h->paths, slash + 1, callback); } } else { int len = strlen(prefix); /* looking for hosts */ if (!urldb_iterate_partial_host(tree, prefix, callback)) return; if ((len == 1 && tolower(prefix[0]) != 'w') || (len == 2 && (tolower(prefix[0]) != 'w' || tolower(prefix[1]) != 'w')) || (len >= 3 && strncasecmp(prefix, "www", 3))) { /* now 
look for www.prefix */ snprintf(buf, sizeof buf, "www.%s", prefix); if(!urldb_iterate_partial_host( search_trees[ST_DN + 'w' - 'a'], buf, callback)) return; } } } /** * Partial host iterator (internal) * * \param root Root of (sub)tree to traverse * \param prefix Prefix to match * \param callback Callback function * \return true to continue, false otherwise */ bool urldb_iterate_partial_host(struct search_node *root, const char *prefix, bool (*callback)(const char *url, const struct url_data *data)) { int c; assert(root && prefix && callback); if (root == &empty) return true; c = urldb_search_match_prefix(root->data, prefix); if (c > 0) /* No match => look in left subtree */ return urldb_iterate_partial_host(root->left, prefix, callback); else if (c < 0) /* No match => look in right subtree */ return urldb_iterate_partial_host(root->right, prefix, callback); else { /* Match => iterate over l/r subtrees & process this node */ if (!urldb_iterate_partial_host(root->left, prefix, callback)) return false; if (root->data->paths.children) { /* and extract all paths attached to this host */ if (!urldb_iterate_entries_path(&root->data->paths, callback, NULL)) { return false; } } if (!urldb_iterate_partial_host(root->right, prefix, callback)) return false; } return true; } /** * Partial path iterator (internal) * * \param parent Root of (sub)tree to traverse * \param prefix Prefix to match * \param callback Callback function * \return true to continue, false otherwise */ bool urldb_iterate_partial_path(const struct path_data *parent, const char *prefix, bool (*callback)(const char *url, const struct url_data *data)) { const struct path_data *p; const char *slash, *end = prefix + strlen(prefix); int c; slash = strchr(prefix, '/'); if (!slash) slash = end; if (slash == prefix && *prefix == '/') /* Ignore "//" */ return true; for (p = parent->children; p; p = p->next) { if ((c = strncasecmp(p->segment, prefix, slash - prefix)) < 0) /* didn't match, but may be more */ continue; 
else if (c > 0) /* no more possible matches */ break; /* prefix matches so far */ if (slash == end) { /* we've run out of prefix, so all * paths below this one match */ if (!urldb_iterate_entries_path(p, callback, NULL)) return false; } else { /* more prefix to go => recurse */ if (!urldb_iterate_partial_path(p, slash + 1, callback)) return false; } } return true; } /** * Iterate over all entries in database * * \param callback Function to callback for each entry */ void urldb_iterate_entries(bool (*callback)(const char *url, const struct url_data *data)) { int i; assert(callback); for (i = 0; i < NUM_SEARCH_TREES; i++) { if (!urldb_iterate_entries_host(search_trees[i], callback, NULL)) break; } } /** * Iterate over all cookies in database * * \param callback Function to callback for each entry */ void urldb_iterate_cookies(bool (*callback)(const struct cookie_data *data)) { int i; assert(callback); for (i = 0; i < NUM_SEARCH_TREES; i++) { if (!urldb_iterate_entries_host(search_trees[i], NULL, callback)) break; } } /** * Host data iterator (internal) * * \param parent Root of subtree to iterate over * \param url_callback Callback function * \param cookie_callback Callback function * \return true to continue, false otherwise */ bool urldb_iterate_entries_host(struct search_node *parent, bool (*url_callback)(const char *url, const struct url_data *data), bool (*cookie_callback)(const struct cookie_data *data)) { if (parent == &empty) return true; if (!urldb_iterate_entries_host(parent->left, url_callback, cookie_callback)) return false; if ((parent->data->paths.children) || ((cookie_callback) && (parent->data->paths.cookies))) { /* We have paths (or domain cookies), so iterate them */ if (!urldb_iterate_entries_path(&parent->data->paths, url_callback, cookie_callback)) { return false; } } if (!urldb_iterate_entries_host(parent->right, url_callback, cookie_callback)) return false; return true; } /** * Path data iterator (internal) * * \param parent Root of subtree to 
iterate over * \param url_callback Callback function * \param cookie_callback Callback function * \return true to continue, false otherwise */ bool urldb_iterate_entries_path(const struct path_data *parent, bool (*url_callback)(const char *url, const struct url_data *data), bool (*cookie_callback)(const struct cookie_data *data)) { const struct path_data *p; if (!parent->children) { /* leaf node */ /* All leaf nodes in the path tree should have an URL or cookies * attached to them. If this is not the case, it indicates * that there's a bug in the file loader/URL insertion code. * Therefore, assert this here. */ assert(url_callback || cookie_callback); /** \todo handle fragments? */ if (url_callback) { assert(parent->url); if (!url_callback(parent->url, (const struct url_data *) &parent->urld)) return false; } else { if (parent->cookies && !cookie_callback( (const struct cookie_data *) parent->cookies)) return false; } } for (p = parent->children; p; p = p->next) { if (!urldb_iterate_entries_path(p, url_callback, cookie_callback)) return false; } return true; } /** * Add a host node to the tree * * \param part Host segment to add (or whole IP address) (copied) * \param parent Parent node to add to * \return Pointer to added node, or NULL on memory exhaustion */ struct host_part *urldb_add_host_node(const char *part, struct host_part *parent) { struct host_part *d; assert(part && parent); d = calloc(1, sizeof(struct host_part)); if (!d) return NULL; d->part = strdup(part); if (!d->part) { free(d); return NULL; } d->next = parent->children; if (parent->children) parent->children->prev = d; d->parent = parent; parent->children = d; return d; } /** * Add a host to the database, creating any intermediate entries * * \param host Hostname to add * \return Pointer to leaf node, or NULL on memory exhaustion */ struct host_part *urldb_add_host(const char *host) { struct host_part *d = (struct host_part *) &db_root, *e; struct search_node *s; char buf[256]; /* 256 bytes is 
sufficient - domain names are * limited to 255 chars. */ char *part; assert(host); if (*(host) >= '0' && *(host) <= '9') { /* Host is an IP, so simply add as TLD */ /* Check for existing entry */ for (e = d->children; e; e = e->next) if (strcasecmp(host, e->part) == 0) /* found => return it */ return e; d = urldb_add_host_node(host, d); s = urldb_search_insert(search_trees[ST_IP], d); if (!s) { /* failed */ d = NULL; } else { search_trees[ST_IP] = s; } return d; } /* Copy host string, so we can corrupt it */ strncpy(buf, host, sizeof buf); buf[sizeof buf - 1] = '\0'; /* Process FQDN segments backwards */ do { part = strrchr(buf, '.'); if (!part) { /* last segment */ /* Check for existing entry */ for (e = d->children; e; e = e->next) if (strcasecmp(buf, e->part) == 0) break; if (e) { d = e; } else { d = urldb_add_host_node(buf, d); } /* And insert into search tree */ if (d) { if (isalpha(*buf)) { struct search_node **r; r = &search_trees[ tolower(*buf) - 'a' + ST_DN]; s = urldb_search_insert(*r, d); if (!s) { /* failed */ d = NULL; } else { *r = s; } } else { d = NULL; } } break; } /* Check for existing entry */ for (e = d->children; e; e = e->next) if (strcasecmp(part + 1, e->part) == 0) break; d = e ? 
e : urldb_add_host_node(part + 1, d); if (!d) break; *part = '\0'; } while (1); return d; } /** * Add a path node to the tree * * \param scheme URL scheme associated with path (copied) * \param port Port number on host associated with path * \param segment Path segment to add (copied) * \param fragment URL fragment (copied), or NULL * \param parent Parent node to add to * \return Pointer to added node, or NULL on memory exhaustion */ struct path_data *urldb_add_path_node(const char *scheme, unsigned int port, const char *segment, const char *fragment, struct path_data *parent) { struct path_data *d, *e; assert(scheme && segment && parent); d = calloc(1, sizeof(struct path_data)); if (!d) return NULL; d->scheme = strdup(scheme); if (!d->scheme) { free(d); return NULL; } d->port = port; d->segment = strdup(segment); if (!d->segment) { free(d->scheme); free(d); return NULL; } if (fragment) { if (!urldb_add_path_fragment(d, fragment)) { free(d->segment); free(d->scheme); free(d); return NULL; } } for (e = parent->children; e; e = e->next) if (strcmp(e->segment, d->segment) > 0) break; if (e) { d->prev = e->prev; d->next = e; if (e->prev) e->prev->next = d; else parent->children = d; e->prev = d; } else if (!parent->children) { d->prev = d->next = NULL; parent->children = parent->last = d; } else { d->next = NULL; d->prev = parent->last; parent->last->next = d; parent->last = d; } d->parent = parent; return d; } /** * Add a path to the database, creating any intermediate entries * * \param scheme URL scheme associated with path * \param port Port number on host associated with path * \param host Host tree node to attach to * \param path Absolute path to add * \param fragment URL fragment, or NULL * \param url_no_frag URL, without fragment * \return Pointer to leaf node, or NULL on memory exhaustion */ struct path_data *urldb_add_path(const char *scheme, unsigned int port, const struct host_part *host, const char *path, const char *fragment, const char *url_no_frag) { 
struct path_data *d, *e; char *buf; char *segment, *slash; assert(scheme && host && path && url_no_frag); d = (struct path_data *) &host->paths; /* Copy path string, so we can corrupt it */ buf = malloc(strlen(path) + 1); if (!buf) return NULL; /* + 1 to strip leading '/' */ strcpy(buf, path + 1); segment = buf; /* Process path segments */ do { slash = strchr(segment, '/'); if (!slash) { /* last segment */ /* look for existing entry */ for (e = d->children; e; e = e->next) if (strcmp(segment, e->segment) == 0 && strcasecmp(scheme, e->scheme) == 0 && e->port == port) break; d = e ? urldb_add_path_fragment(e, fragment) : urldb_add_path_node(scheme, port, segment, fragment, d); break; } *slash = '\0'; /* look for existing entry */ for (e = d->children; e; e = e->next) if (strcmp(segment, e->segment) == 0 && strcasecmp(scheme, e->scheme) == 0 && e->port == port) break; d = e ? e : urldb_add_path_node(scheme, port, segment, NULL, d); if (!d) break; segment = slash + 1; } while (1); free(buf); if (d && !d->url) { /* Insert URL */ d->url = strdup(url_no_frag); if (!d->url) return NULL; } return d; } /** * Fragment comparator callback for qsort */ int urldb_add_path_fragment_cmp(const void *a, const void *b) { return strcasecmp(*((const char **) a), *((const char **) b)); } /** * Add a fragment to a path segment * * \param segment Path segment to add to * \param fragment Fragment to add (copied), or NULL * \return segment or NULL on memory exhaustion */ struct path_data *urldb_add_path_fragment(struct path_data *segment, const char *fragment) { char **temp; assert(segment); /* If no fragment, this function is a NOP * This may seem strange, but it makes the rest * of the code cleaner */ if (!fragment) return segment; temp = realloc(segment->fragment, (segment->frag_cnt + 1) * sizeof(char *)); if (!temp) return NULL; segment->fragment = temp; segment->fragment[segment->frag_cnt] = strdup(fragment); if (!segment->fragment[segment->frag_cnt]) { /* Don't free temp - it's now 
our buffer */ return NULL; } segment->frag_cnt++; /* We want fragments in alphabetical order, so sort them * It may prove better to insert in alphabetical order instead */ qsort(segment->fragment, segment->frag_cnt, sizeof (char *), urldb_add_path_fragment_cmp); return segment; } /** * Find an URL in the database * * \param url Absolute URL to find * \return Pointer to path data, or NULL if not found */ struct path_data *urldb_find_url(const char *url) { const struct host_part *h; struct path_data *p; struct search_node *tree; char *host, *plq, *scheme, *colon; const char *domain; unsigned short port; url_func_result ret; assert(url); /* extract host */ ret = url_host(url, &host); if (ret != URL_FUNC_OK) return NULL; /* extract path, leafname, query */ ret = url_plq(url, &plq); if (ret != URL_FUNC_OK) { free(host); return NULL; } /* extract scheme */ ret = url_scheme(url, &scheme); if (ret != URL_FUNC_OK) { free(plq); free(host); return NULL; } colon = strrchr(host, ':'); if (!colon) { port = 0; } else { *colon = '\0'; port = atoi(colon + 1); } /* file urls have no host, so manufacture one */ if (strcasecmp(scheme, "file") == 0) domain = "localhost"; else domain = host; if (*domain >= '0' && *domain <= '9') tree = search_trees[ST_IP]; else if (isalpha(*domain)) tree = search_trees[ST_DN + tolower(*domain) - 'a']; else { free(plq); free(host); free(scheme); return NULL; } h = urldb_search_find(tree, domain); if (!h) { free(plq); free(host); free(scheme); return NULL; } p = urldb_match_path(&h->paths, plq, scheme, port); free(plq); free(host); free(scheme); return p; } /** * Match a path string * * \param parent Path (sub)tree to look in * \param path The path to search for * \param scheme The URL scheme associated with the path * \param port The port associated with the path * \return Pointer to path data or NULL if not found. 
*/ struct path_data *urldb_match_path(const struct path_data *parent, const char *path, const char *scheme, unsigned short port) { struct path_data *p; const char *slash; if (*path == '\0') return (struct path_data *)parent; slash = strchr(path + 1, '/'); if (!slash) slash = path + strlen(path); for (p = parent->children; p; p = p->next) { if (strncmp(p->segment, path + 1, slash - path - 1) == 0 && strcmp(p->scheme, scheme) == 0 && p->port == port) break; } if (p) { return urldb_match_path(p, slash, scheme, port); } return NULL; } /** * Dump URL database to stderr */ void urldb_dump(void) { int i; urldb_dump_hosts(&db_root); for (i = 0; i != NUM_SEARCH_TREES; i++) urldb_dump_search(search_trees[i], 0); } /** * Dump URL database hosts to stderr * * \param parent Parent node of tree to dump */ void urldb_dump_hosts(struct host_part *parent) { struct host_part *h; if (parent->part) { LOG(("%s", parent->part)); LOG(("\t%s invalid SSL certs", parent->permit_invalid_certs ? "Permits" : "Denies")); } /* Dump path data */ urldb_dump_paths(&parent->paths); /* and recurse */ for (h = parent->children; h; h = h->next) urldb_dump_hosts(h); } /** * Dump URL database paths to stderr * * \param parent Parent node of tree to dump */ void urldb_dump_paths(struct path_data *parent) { struct path_data *p; unsigned int i; if (parent->segment) { LOG(("\t%s : %u", parent->scheme, parent->port)); LOG(("\t\t'%s'", parent->segment)); for (i = 0; i != parent->frag_cnt; i++) LOG(("\t\t\t#%s", parent->fragment[i])); } /* and recurse */ for (p = parent->children; p; p = p->next) urldb_dump_paths(p); } /** * Dump search tree * * \param parent Parent node of tree to dump * \param depth Tree depth */ void urldb_dump_search(struct search_node *parent, int depth) { const struct host_part *h; int i; if (parent == &empty) return; urldb_dump_search(parent->left, depth + 1); for (i = 0; i != depth; i++) fputc(' ', stderr); for (h = parent->data; h; h = h->parent) { fprintf(stderr, "%s", h->part); if 
(h->parent && h->parent->parent) fputc('.', stderr); } fputc('\n', stderr); urldb_dump_search(parent->right, depth + 1); } /** * Insert a node into the search tree * * \param root Root of tree to insert into * \param data User data to insert * \return Pointer to updated root, or NULL if failed */ struct search_node *urldb_search_insert(struct search_node *root, const struct host_part *data) { struct search_node *n; assert(root && data); n = malloc(sizeof(struct search_node)); if (!n) return NULL; n->level = 1; n->data = data; n->left = n->right = ∅ root = urldb_search_insert_internal(root, n); return root; } /** * Insert node into search tree * * \param root Root of (sub)tree to insert into * \param n Node to insert * \return Pointer to updated root */ struct search_node *urldb_search_insert_internal(struct search_node *root, struct search_node *n) { assert(root && n); if (root == &empty) { root = n; } else { int c = urldb_search_match_host(root->data, n->data); if (c > 0) { root->left = urldb_search_insert_internal( root->left, n); } else if (c < 0) { root->right = urldb_search_insert_internal( root->right, n); } else { /* exact match */ free(n); return root; } root = urldb_search_skew(root); root = urldb_search_split(root); } return root; } /** * Delete a node from a search tree * * \param root Tree to remove from * \param data Data to delete * \return Updated root of tree */ struct search_node *urldb_search_remove(struct search_node *root, const struct host_part *data) { static struct search_node *last, *deleted; assert(root && data); if (root != &empty) { int c = urldb_search_match_host(root->data, data); last = root; if (c > 0) { root->left = urldb_search_remove(root->left, data); } else { deleted = root; root->right = urldb_search_remove(root->right, data); } } if (root == last) { if (deleted != &empty && urldb_search_match_host(deleted->data, data) == 0) { deleted->data = last->data; deleted = ∅ root = root->right; } } else { if (root->left->level < 
root->level - 1 || root->right->level < root->level - 1) { if (root->right->level > --root->level) root->right->level = root->level; root = urldb_search_skew(root); root->right = urldb_search_skew(root->right); root->right->right = urldb_search_skew(root->right->right); root = urldb_search_split(root); root->right = urldb_search_split(root->right); } } return root; } /** * Find a node in a search tree * * \param root Tree to look in * \param host Host to find * \return Pointer to host tree node, or NULL if not found */ const struct host_part *urldb_search_find(struct search_node *root, const char *host) { int c; assert(root && host); if (root == &empty) { return NULL; } c = urldb_search_match_string(root->data, host); if (c > 0) return urldb_search_find(root->left, host); else if (c < 0) return urldb_search_find(root->right, host); else return root->data; } /** * Compare a pair of host_parts * * \param a * \param b * \return 0 if match, non-zero, otherwise */ int urldb_search_match_host(const struct host_part *a, const struct host_part *b) { int ret; assert(a && b); /* traverse up tree to root, comparing parts as we go. 
*/ for (; a && a != &db_root && b && b != &db_root; a = a->parent, b = b->parent) if ((ret = strcasecmp(a->part, b->part)) != 0) /* They differ => return the difference here */ return ret; /* If we get here then either: * a) The path lengths differ * or b) The hosts are identical */ if (a && a != &db_root && (!b || b == &db_root)) /* len(a) > len(b) */ return 1; else if ((!a || a == &db_root) && b && b != &db_root) /* len(a) < len(b) */ return -1; /* identical */ return 0; } /** * Compare host_part with a string * * \param a * \param b * \return 0 if match, non-zero, otherwise */ int urldb_search_match_string(const struct host_part *a, const char *b) { const char *end, *dot; int plen, ret; assert(a && a != &db_root && b); if (*b >= '0' && *b <= '9') { /* IP address */ return strcasecmp(a->part, b); } end = b + strlen(b) + 1; while (b < end && a && a != &db_root) { dot = strchr(b, '.'); if (!dot) { /* last segment */ dot = end - 1; } /* Compare strings (length limited) */ if ((ret = strncasecmp(a->part, b, dot - b)) != 0) /* didn't match => return difference */ return ret; /* The strings matched, now check that the lengths do, too */ plen = strlen(a->part); if (plen > dot - b) /* len(a) > len(b) */ return 1; else if (plen < dot - b) /* len(a) < len(b) */ return -1; b = dot + 1; a = a->parent; } /* If we get here then either: * a) The path lengths differ * or b) The hosts are identical */ if (a && a != &db_root && b >= end) /* len(a) > len(b) */ return 1; else if ((!a || a == &db_root) && b < end) /* len(a) < len(b) */ return -1; /* Identical */ return 0; } /** * Compare host_part with prefix * * \param a * \param b * \return 0 if match, non-zero, otherwise */ int urldb_search_match_prefix(const struct host_part *a, const char *b) { const char *end, *dot; int plen, ret; assert(a && a != &db_root && b); if (*b >= '0' && *b <= '9') { /* IP address */ return strncasecmp(a->part, b, strlen(b)); } end = b + strlen(b) + 1; while (b < end && a && a != &db_root) { dot = 
strchr(b, '.'); if (!dot) { /* last segment */ dot = end - 1; } /* Compare strings (length limited) */ if ((ret = strncasecmp(a->part, b, dot - b)) != 0) /* didn't match => return difference */ return ret; /* The strings matched */ if (dot < end - 1) { /* Consider segment lengths only in the case * where the prefix contains segments */ plen = strlen(a->part); if (plen > dot - b) /* len(a) > len(b) */ return 1; else if (plen < dot - b) /* len(a) < len(b) */ return -1; } b = dot + 1; a = a->parent; } /* If we get here then either: * a) The path lengths differ * or b) The hosts are identical */ if (a && a != &db_root && b >= end) /* len(a) > len(b) => prefix matches */ return 0; else if ((!a || a == &db_root) && b < end) /* len(a) < len(b) => prefix does not match */ return -1; /* Identical */ return 0; } /** * Rotate a subtree right * * \param root Root of subtree to rotate * \return new root of subtree */ struct search_node *urldb_search_skew(struct search_node *root) { struct search_node *temp; assert(root); if (root->left->level == root->level) { temp = root->left; root->left = temp->right; temp->right = root; root = temp; } return root; } /** * Rotate a node left, increasing the parent's level * * \param root Root of subtree to rotate * \return New root of subtree */ struct search_node *urldb_search_split(struct search_node *root) { struct search_node *temp; assert(root); if (root->right->right->level == root->level) { temp = root->right; root->right = temp->left; temp->left = root; root = temp; root->level++; } return root; } /** * Retrieve cookies for an URL * * \param url URL being fetched * \param referer Referring resource, or NULL * \return Cookies string for libcurl (on heap), or NULL on error/no cookies * * \todo Handle unvalidated fetches */ char *urldb_get_cookie(const char *url, const char *referer) { const struct path_data *p, *q; const struct host_part *h; struct cookie_internal_data *c; int count = 0, version = COOKIE_RFC2965; int ret_alloc = 4096, 
ret_used = 1; char *path; char *ret; char *scheme; time_t now; url_func_result res; assert(url); // LOG(("%s : %s", url, referer)); // if (referer) // /* No unvalidated fetches for now */ // return NULL; urldb_add_url(url); p = urldb_find_url(url); if (!p) return NULL; scheme = p->scheme; ret = malloc(ret_alloc); if (!ret) return NULL; ret[0] = '\0'; res = url_path(url, &path); if (res != URL_FUNC_OK) { free(ret); return NULL; } now = time(NULL); if (*(p->segment) != '\0') { /* Match exact path, unless directory, when prefix matching * will handle this case for us. */ for (q = p->parent->children; q; q = q->next) { if (strcmp(q->segment, p->segment)) continue; /* Consider all cookies associated with * this exact path */ for (c = q->cookies; c; c = c->next) { if (c->expires != 1 && c->expires < now) /* cookie has expired => ignore */ continue; if (c->secure && strcasecmp( q->scheme, "https")) /* secure cookie for insecure host. * ignore */ continue; if (!urldb_concat_cookie(c, &ret_used, &ret_alloc, &ret)) { free(path); free(ret); return NULL; } if (c->version < (unsigned int)version) version = c->version; c->last_used = now; count++; } } } // LOG(("%s", ret)); /* Now consider cookies whose paths prefix-match ours */ for (p = p->parent; p; p = p->parent) { /* Find directory's path entry(ies) */ /* There are potentially multiple due to differing schemes */ for (q = p->children; q; q = q->next) { if (*(q->segment) != '\0') continue; for (c = q->cookies; c; c = c->next) { // LOG(("%p: %s=%s", c, c->name, c->value)); if (c->expires != 1 && c->expires < now) /* cookie has expired => ignore */ continue; if (c->secure && strcasecmp( q->scheme, "https")) /* Secure cookie for insecure server * => ignore */ continue; if (!urldb_concat_cookie(c, &ret_used, &ret_alloc, &ret)) { free(path); free(ret); return NULL; } if (c->version < (unsigned int) version) version = c->version; c->last_used = now; count++; } } if (!p->parent) { /* No parent, so bail here. 
This can't go in the * loop exit condition as we want to process the * top-level node, too */ break; } } // LOG(("%s", ret)); /* Finally consider domain cookies for hosts which domain match ours */ for (h = (const struct host_part *)p; h && h != &db_root; h = h->parent) { for (c = h->paths.cookies; c; c = c->next) { if (c->expires != 1 && c->expires < now) /* cookie has expired => ignore */ continue; /* Ensure cookie path is a prefix of the resource */ if (strncmp(c->path, path, strlen(c->path)) != 0) /* paths don't match => ignore */ continue; if (c->secure && strcasecmp(scheme, "https")) /* secure cookie for insecure host. ignore */ continue; if (!urldb_concat_cookie(c, &ret_used, &ret_alloc, &ret)) { free(path); free(ret); return NULL; } if (c->version < (unsigned int)version) version = c->version; c->last_used = now; count++; } } // LOG(("%s", ret)); if (count == 0) { /* No cookies found */ free(path); free(ret); return NULL; } /* and build output string */ { char *temp; if (version > 0) temp = malloc(12 + ret_used); else temp = malloc(ret_used); if (!temp) { free(path); free(ret); return NULL; } if (version > 0) sprintf(temp, "$Version=%d%s", version, ret); else { /* Old-style cookies => no version & skip "; " */ sprintf(temp, "%s", ret + 2); } free(path); free(ret); ret = temp; } return ret; } /** * Parse Set-Cookie header and insert cookie(s) into database * * \param header Header to parse, with Set-Cookie: stripped * \param url URL being fetched * \return true on success, false otherwise */ bool urldb_set_cookie(const char *header, const char *url) { const char *cur = header, *end; char *path, *host, *scheme, *urlt; url_func_result res; assert(url && header); // LOG(("'%s' : '%s'", url, header)); /* strip fragment */ urlt = strdup(url); scheme = strchr(urlt, '#'); if (scheme) *scheme = '\0'; res = url_scheme(url, &scheme); if (res != URL_FUNC_OK) { free(urlt); return false; } res = url_path(url, &path); if (res != URL_FUNC_OK) { free(scheme); free(urlt); 
return false; } res = url_host(url, &host); if (res != URL_FUNC_OK) { free(path); free(scheme); free(urlt); return false; } end = cur + strlen(cur) - 2 /* Trailing CRLF */; do { struct cookie_internal_data *c; char *dot; c = urldb_parse_cookie(url, &cur); if (!c) { /* failed => stop parsing */ goto error; } /* validate cookie */ /* 4.2.2:i Cookie must have NAME and VALUE */ if (!c->name || !c->value) { urldb_free_cookie(c); goto error; } /* 4.3.2:i Cookie path must be a prefix of URL path */ if (strncmp(c->path, path, strlen(c->path)) != 0 || strlen(c->path) > strlen(path)) { urldb_free_cookie(c); goto error; } /* 4.3.2:ii Cookie domain must contain embedded dots */ dot = strchr(c->domain + 1, '.'); if (!dot || *(dot + 1) == '\0') { /* no embedded dots */ urldb_free_cookie(c); goto error; } /* Domain match fetch host with cookie domain */ if (strcasecmp(host, c->domain) != 0) { int hlen, dlen; char *domain = c->domain; /* 4.3.2:iii */ if (host[0] >= '0' && host[0] <= '9') { /* IP address, so no partial match */ urldb_free_cookie(c); goto error; } hlen = strlen(host); dlen = strlen(c->domain); if (hlen <= dlen && hlen != dlen - 1) { /* Partial match not possible */ urldb_free_cookie(c); goto error; } if (hlen == dlen - 1) { /* Relax matching to allow * host a.com to match .a.com */ domain++; dlen--; } if (strcasecmp(host + (hlen - dlen), domain)) { urldb_free_cookie(c); goto error; } /* 4.3.2:iv Ensure H contains no dots */ for (int i = 0; i < (hlen - dlen); i++) if (host[i] == '.') { urldb_free_cookie(c); goto error; } } /* Now insert into database */ if (!urldb_insert_cookie(c, scheme, urlt)) goto error; cookies_update((struct cookie_data *)c); } while (cur < end); free(host); free(path); free(scheme); free(urlt); return true; error: free(host); free(path); free(scheme); free(urlt); return false; } /** * Parse a cookie * * \param url URL being fetched * \param cookie Pointer to cookie string (updated on exit) * \return Pointer to cookie structure (on heap, caller 
frees) or NULL */ struct cookie_internal_data *urldb_parse_cookie(const char *url, const char **cookie) { struct cookie_internal_data *c; const char *cur; char name[1024], value[4096]; char *n = name, *v = value; bool had_equals = false; bool quoted = false; url_func_result res; assert(url && cookie && *cookie); c = calloc(1, sizeof(struct cookie_internal_data)); if (!c) return NULL; c->expires = -1; name[0] = '\0'; value[0] = '\0'; for (cur = *cookie; *cur && *cur != '\r' && *cur != '\n'; cur++) { if (had_equals && (*cur == '"' || *cur == '\'')) { /* Only values may be quoted */ quoted = !quoted; continue; } if (!quoted && !had_equals && *cur == '=') { /* First equals => attr-value separator */ had_equals = true; continue; } if (!quoted && *cur == ';') { /* Semicolon => end of current avpair */ /* NUL-terminate tokens */ *n = '\0'; *v = '\0'; if (!urldb_parse_avpair(c, name, value)) { /* Memory exhausted */ urldb_free_cookie(c); return NULL; } /* And reset to start */ n = name; v = value; had_equals = false; continue; } /* And now handle commas. These are a pain as they may mean * any of the following: * * + End of cookie * + Day separator in Expires avpair * + (Invalid) comma in unquoted value * * Therefore, in order to handle all 3 cases (2 and 3 are * identical, the difference being that 2 is in the spec and * 3 isn't), we need to determine where the comma actually * lies. We use the following heuristic: * * Given a comma at the current input position, find the * immediately following semicolon (or end of input if none * found). Then, consider the input characters between * these two positions. If any of these characters is an * '=', we must assume that the comma signified the end of * the current cookie. * * This holds as the first avpair of any cookie must be * NAME=VALUE, so the '=' is guaranteed to appear in the * case where the comma marks the end of a cookie. 
* * This will fail, however, in the case where '=' appears in * the value of the current avpair after the comma or the * subsequent cookie does not start with NAME=VALUE. Neither * of these is particularly likely and if they do occur, the * website is more broken than we can be bothered to handle. */ if (!quoted && *cur == ',') { /* Find semi-colon, if any */ const char *p; const char *semi = strchr(cur + 1, ';'); if (!semi) semi = cur + strlen(cur) - 2 /* CRLF */; /* Look for equals sign between comma and semi */ for (p = cur + 1; p < semi; p++) if (*p == '=') break; if (p == semi) { /* none found => comma internal to value */ /* do nothing */ } else { /* found one => comma marks end of cookie */ cur++; break; } } /* Accumulate into buffers, always leaving space for a NUL */ if (!had_equals) { if (n < name + 1023) *n++ = *cur; } else { if (v < value + 4095) *v++ = *cur; } } /* Parse final avpair */ *n = '\0'; *v = '\0'; if (!urldb_parse_avpair(c, name, value)) { /* Memory exhausted */ urldb_free_cookie(c); return NULL; } /* Now fix-up default values */ if (!c->domain) { res = url_host(url, &c->domain); if (res != URL_FUNC_OK) { urldb_free_cookie(c); return NULL; } } if (!c->path) { res = url_path(url, &c->path); if (res != URL_FUNC_OK) { urldb_free_cookie(c); return NULL; } } if (c->expires == -1) c->expires = 1; /* Write back current position */ *cookie = cur; return c; } /** * Parse a cookie avpair * * \param c Cookie struct to populate * \param n Name component * \param v Value component * \return true on success, false on memory exhaustion */ bool urldb_parse_avpair(struct cookie_internal_data *c, char *n, char *v) { int vlen; assert(c && n && v); /* Strip whitespace from start of name */ for (; *n; n++) { if (*n != ' ' && *n != '\t') break; } /* Strip whitespace from end of name */ for (vlen = strlen(n); vlen; vlen--) { if (n[vlen] == ' ' || n[vlen] == '\t') n[vlen] = '\0'; else break; } /* Strip whitespace from start of value */ for (; *v; v++) { if (*v != ' 
' && *v != '\t') break; } /* Strip whitespace from end of value */ for (vlen = strlen(v); vlen; vlen--) { if (v[vlen] == ' ' || v[vlen] == '\t') v[vlen] = '\0'; else break; } if (!c->comment && strcasecmp(n, "Comment") == 0) { c->comment = strdup(v); if (!c->comment) return false; } else if (!c->domain && strcasecmp(n, "Domain") == 0) { if (v[0] == '.') { /* Domain must start with a dot */ c->domain_from_set = true; c->domain = strdup(v); if (!c->domain) return false; } } else if (strcasecmp(n, "Max-Age") == 0) { int temp = atoi(v); if (temp == 0) /* Special case - 0 means delete */ c->expires = 0; else c->expires = time(NULL) + temp; } else if (!c->path && strcasecmp(n, "Path") == 0) { c->path_from_set = true; c->path = strdup(v); if (!c->path) return false; } else if (strcasecmp(n, "Version") == 0) { c->version = atoi(v); } else if (strcasecmp(n, "Expires") == 0) { char *datenoday; time_t expires; /* Strip dayname from date (these are hugely * variable and liable to break the parser. * They also serve no useful purpose) */ for (datenoday = v; *datenoday && !isdigit(*datenoday); datenoday++) ; /* do nothing */ expires = curl_getdate(datenoday, NULL); if (expires == -1) { /* assume we have an unrepresentable * date => force it to the maximum * possible value of a 32bit time_t * (this may break in 2038. 
We'll * deal with that once we come to * it) */ expires = (time_t)0x7fffffff; } c->expires = expires; } else if (strcasecmp(n, "Secure") == 0) { c->secure = true; } else if (!c->name) { c->name = strdup(n); c->value = strdup(v); if (!c->name || !c->value) return false; } return true; } /** * Insert a cookie into the database * * \param c The cookie to insert * \param scheme URL scheme associated with cookie path * \param url URL (sans fragment) associated with cookie * \return true on success, false on memory exhaustion (c will be freed) */ bool urldb_insert_cookie(struct cookie_internal_data *c, const char *scheme, const char *url) { struct cookie_internal_data *d; const struct host_part *h; struct path_data *p; assert(c && scheme && url); if (c->domain[0] == '.') { h = urldb_search_find( search_trees[tolower(c->domain[1]) - 'a' + ST_DN], c->domain + 1); if (!h) { h = urldb_add_host(c->domain + 1); if (!h) { urldb_free_cookie(c); return false; } } p = &h->paths; } else { if (c->domain[0] >= '0' && c->domain[0] <= '9') h = urldb_search_find(search_trees[ST_IP], c->domain); else h = urldb_search_find(search_trees[ tolower(c->domain[0]) - 'a' + ST_DN], c->domain); if (!h) { h = urldb_add_host(c->domain); if (!h) { urldb_free_cookie(c); return false; } } /* find path */ p = urldb_add_path(scheme, 0, h, c->path, NULL, url); if (!p) { urldb_free_cookie(c); return false; } } /* add cookie */ for (d = p->cookies; d; d = d->next) { if (!strcmp(d->domain, c->domain) && !strcmp(d->path, c->path) && !strcmp(d->name, c->name)) break; } if (d) { if (c->expires == 0) { /* remove cookie */ if (d->next) d->next->prev = d->prev; if (d->prev) d->prev->next = d->next; else p->cookies = d->next; urldb_free_cookie(d); urldb_free_cookie(c); } else { /* replace d with c */ c->prev = d->prev; c->next = d->next; if (c->next) c->next->prev = c; if (c->prev) c->prev->next = c; else p->cookies = c; urldb_free_cookie(d); // LOG(("%p: %s=%s", c, c->name, c->value)); } } else { c->prev = NULL; 
c->next = p->cookies; if (p->cookies) p->cookies->prev = c; p->cookies = c; // LOG(("%p: %s=%s", c, c->name, c->value)); } return true; } /** * Free a cookie * * \param c The cookie to free */ void urldb_free_cookie(struct cookie_internal_data *c) { assert(c); free(c->comment); free(c->domain); free(c->path); free(c->name); free(c->value); free(c); } /** * Concatenate a cookie into the provided buffer * * \param c Cookie to concatenate * \param used Pointer to amount of buffer used (updated) * \param alloc Pointer to allocated size of buffer (updated) * \param buf Pointer to Pointer to buffer (updated) * \return true on success, false on memory exhaustion */ bool urldb_concat_cookie(struct cookie_internal_data *c, int *used, int *alloc, char **buf) { int clen; assert(c && used && alloc && buf && *buf); clen = 2 + strlen(c->name) + 1 + strlen(c->value) + (c->path_from_set ? 8 + strlen(c->path) : 0) + (c->domain_from_set ? 10 + strlen(c->domain) : 0); if (*used + clen >= *alloc) { char *temp = realloc(*buf, *alloc + 4096); if (!temp) { return false; } *buf = temp; *alloc += 4096; } /** \todo Quote value strings iff version > 0 */ sprintf(*buf + *used - 1, "; %s=%s%s%s%s%s", c->name, c->value, (c->path_from_set ? "; $Path=" : "" ), (c->path_from_set ? c->path : "" ), // (c->path_from_set ? "\"" : ""), (c->domain_from_set ? "; $Domain=" : ""), (c->domain_from_set ? c->domain : "") // ,(c->domain_from_set ? 
"\"" : "") ); *used += clen; return true; } /** * Load a cookie file into the database * * \param filename File to load */ void urldb_load_cookies(const char *filename) { FILE *fp; char s[16*1024]; int file_version = 0; assert(filename); fp = fopen(filename, "r"); if (!fp) return; #define FIND_T { \ for (; *p && *p != '\t'; p++) \ ; /* do nothing */ \ if (p >= end) { \ LOG(("Overran input")); \ continue; \ } \ *p++ = '\0'; \ } #define SKIP_T { \ for (; *p && *p == '\t'; p++) \ ; /* do nothing */ \ if (p >= end) { \ LOG(("Overran input")); \ continue; \ } \ } while (fgets(s, sizeof s, fp)) { char *p = s, *end = 0, *domain, *path, *name, *value, *scheme, *url, *comment; int version, domain_specified, path_specified, secure, no_destroy; time_t expires, last_used; if(s[0] == 0 || s[0] == '#') /* Skip blank lines or comments */ continue; s[strlen(s) - 1] = '\0'; /* lose terminating newline */ end = s + strlen(s); /* Look for file version first * (all input is ignored until this is read) */ if (strncasecmp(s, "Version:", 8) == 0) { FIND_T; SKIP_T; file_version = atoi(p); if (file_version != COOKIE_FILE_VERSION) { LOG(("Unknown Cookie file version")); break; } continue; } else if (file_version == 0) { /* Haven't yet seen version; skip this input */ continue; } /* One cookie/line */ /* Parse input */ FIND_T; version = atoi(s); SKIP_T; domain = p; FIND_T; SKIP_T; domain_specified = atoi(p); FIND_T; SKIP_T; path = p; FIND_T; SKIP_T; path_specified = atoi(p); FIND_T; SKIP_T; secure = atoi(p); FIND_T; SKIP_T; expires = (time_t)atoi(p); FIND_T; SKIP_T; last_used = (time_t)atoi(p); FIND_T; SKIP_T; no_destroy = atoi(p); FIND_T; SKIP_T; name = p; FIND_T; SKIP_T; value = p; FIND_T; SKIP_T; scheme = p; FIND_T; SKIP_T; url = p; FIND_T; /* Comment may have no content, so don't * use macros as they'll break */ for (; *p && *p == '\t'; p++) ; /* do nothing */ comment = p; assert(p <= end); /* Now create cookie */ struct cookie_internal_data *c = malloc(sizeof(struct 
cookie_internal_data)); if (!c) break; c->name = strdup(name); c->value = strdup(value); c->comment = strdup(comment); c->domain_from_set = domain_specified; c->domain = strdup(domain); c->path_from_set = path_specified; c->path = strdup(path); c->expires = expires; c->last_used = last_used; c->secure = secure; c->version = version; c->no_destroy = no_destroy; if (!(c->name && c->value && c->comment && c->domain && c->path)) { urldb_free_cookie(c); break; } /* And insert it into database */ if (!urldb_insert_cookie(c, scheme, url)) { /* Cookie freed for us */ break; } } #undef SKIP_WS #undef FIND_WS fclose(fp); } /** * Save persistent cookies to file * * \param filename Path to save to */ void urldb_save_cookies(const char *filename) { FILE *fp; assert(filename); fp = fopen(filename, "w"); if (!fp) return; fprintf(fp, "# >%s\n", filename); fprintf(fp, "# NetSurf cookies file.\n" "#\n" "# Lines starting with a '#' are comments, " "blank lines are ignored.\n" "#\n" "# All lines prior to \"Version:\t%d\" are discarded.\n" "#\n" "# Version\tDomain\tDomain from Set-Cookie\tPath\t" "Path from Set-Cookie\tSecure\tExpires\tLast used\t" "No destroy\tName\tValue\tScheme\tURL\tComment\n", COOKIE_FILE_VERSION); fprintf(fp, "Version:\t%d\n", COOKIE_FILE_VERSION); urldb_save_cookie_hosts(fp, &db_root); fclose(fp); } /** * Save a host subtree's cookies * * \param fp File pointer to write to * \param parent Parent host */ void urldb_save_cookie_hosts(FILE *fp, struct host_part *parent) { assert(fp && parent); urldb_save_cookie_paths(fp, &parent->paths); for (struct host_part *h = parent->children; h; h = h->next) urldb_save_cookie_hosts(fp, h); } /** * Save a path subtree's cookies * * \param fp File pointer to write to * \param parent Parent path */ void urldb_save_cookie_paths(FILE *fp, struct path_data *parent) { time_t now = time(NULL); assert(fp && parent); if (parent->cookies) { for (struct cookie_internal_data *c = parent->cookies; c; c = c->next) { if (c->expires < now) /* 
Skip expired cookies */ continue; fprintf(fp, "%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t" "%s\t%s\t%s\t%s\t%s\n", c->version, c->domain, c->domain_from_set, c->path, c->path_from_set, c->secure, (int)c->expires, (int)c->last_used, c->no_destroy, c->name, c->value, parent->scheme ? parent->scheme : "unused", parent->url ? parent->url : "unused", c->comment ? c->comment : ""); } } for (struct path_data *p = parent->children; p; p = p->next) urldb_save_cookie_paths(fp, p); } #ifdef TEST_URLDB int option_expire_url = 0; void die(const char *error) { printf("die: %s\n", error); exit(1); } void warn_user(const char *warning, const char *detail) { printf("WARNING: %s %s\n", warning, detail); } int main(void) { struct host_part *h; struct path_data *p; int i; url_init(); h = urldb_add_host("127.0.0.1"); if (!h) { LOG(("failed adding host")); return 1; } /* Get host entry */ h = urldb_add_host("netsurf.strcprstskrzkrk.co.uk"); if (!h) { LOG(("failed adding host")); return 1; } /* Get path entry */ p = urldb_add_path("http", 80, h, "/path/to/resource.htm?a=b", "zz", "http://netsurf.strcprstskrzkrk.co.uk/path/to/resource.htm?a=b"); if (!p) { LOG(("failed adding path")); return 1; } p = urldb_add_path("http", 80, h, "/path/to/resource.htm?a=b", "aa", "http://netsurf.strcprstskrzkrk.co.uk/path/to/resource.htm?a=b"); if (!p) { LOG(("failed adding path")); return 1; } p = urldb_add_path("http", 80, h, "/path/to/resource.htm?a=b", "yy", "http://netsurf.strcprstskrzkrk.co.uk/path/to/resource.htm?a=b"); if (!p) { LOG(("failed adding path")); return 1; } urldb_set_cookie("mmblah=foo; path=/; expires=Thur, 31-Dec-2099 00:00:00 GMT\r\n", "http://www.minimarcos.org.uk/cgi-bin/forum/Blah.pl?,v=login,p=2"); urldb_set_cookie("BlahPW=bar; path=/; expires=Thur, 31-Dec-2099 00:00:00 GMT\r\n", "http://www.minimarcos.org.uk/cgi-bin/forum/Blah.pl?,v=login,p=2"); urldb_set_cookie("details=foo|bar|Sun, 03-Jun-2007;expires=Mon, 24-Jul-2006 09:53:45 GMT", "http://ccdb.cropcircleresearch.com/"); 
urldb_set_cookie("PREF=ID=a:TM=b:LM=c:S=d; path=/; domain=.google.com", "http://www.google.com/"); urldb_set_cookie("test=foo, bar, baz; path=/, quux=blah; path=/", "http://www.bbc.co.uk/"); urldb_set_cookie("a=b; path=/; domain=.a.com", "http://a.com/"); urldb_dump(); return 0; } #endif