netsurf/utils/genpubsuffix.pl

#
# Public suffix C code generator
#
# Copyright 2015 Vincent Sanders <vince@kyllikki.og>
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all
# copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.


# This program converts the public suffix list data [1] into a C
#  program with static data representation and accessor function.
#
# The actual data list [2] should be placed in a file effective_tld_names.dat
#
# The C program is written to stdout, the typical 160K input file
#  generates 500K of program and compiles down to a 100K object file
#
# There is a single exported function
#
# const char *getpublicsuffix(const char *hostname)
#
# This returns the public suffix of the passed hostname or NULL if
#  there was an error processing the hostname. The returned pointer is
#  within the passed hostname so if the returned pointer is the same as
#  hostname the whole hostname is a public suffix otherwise the passed
#  hostname has a private part.
#
# The resulting C file is merely a conversion of the input data (the
#  added C code is from this source and licensed under the same terms)
#  and imposes no additional copyright above that of the source data
#  file.
#
# Note: The pnode structure is built assuming there will never be more
#  label nodes than can fit in an unsigned 16 bit value (65535) but as
#  there are currently around 7500 nodes there is space for another
#  58,000 before this becomes an issue.
#
# [1] https://publicsuffix.org/
# [2] https://publicsuffix.org/list/effective_tld_names.dat


# debian package for ordered hashes: libtie-ixhash-perl

use strict;
use warnings;
use utf8;
use Tie::IxHash;


# Insert one rule into the label tree, one label per call.
#
# Labels are taken from the END of @$parts_ref (the list is consumed),
# so the right-most label of a rule is attached nearest the root.
#
#  $tldtree_ref  hash of child nodes to attach the label to
#  $nodeidx_ref  reference to the running count of nodes created
#  $strtab_ref   string table mapping each label to its byte offset
#  $stridx_ref   reference to the next free byte offset in the table
#  $parts_ref    remaining labels of the rule being inserted
sub treesubdom
{
    my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref) = @_;

    my $label = pop @{$parts_ref};

    # child hash for this label; only linked in if the label is new here
    tie my %children, 'Tie::IxHash';

    # a leading "!" marks an exception rule: strip it from the label
    # and record the exception as a "!" child, which is a node itself
    my $is_exception = ($label =~ s/\A!//) ? 1 : 0;
    if ($is_exception) {
        $children{'!'} = {};
        ${$nodeidx_ref}++;
    }

    # first sighting of this label: give it the next string table slot
    unless (exists $strtab_ref->{$label}) {
        $strtab_ref->{$label} = ${$stridx_ref};

        # the offset advances by the encoded byte length of the label
        # plus one more byte (the original calls it the terminator)
        my $octets = do { use bytes; length($label) };
        ${$stridx_ref} += $octets + 1;
    }

    # attach the new child hash unless this level already has the label
    unless (exists $tldtree_ref->{$label}) {
        $tldtree_ref->{$label} = \%children;
        ${$nodeidx_ref}++;
    }

    # an exception ends the descent, otherwise continue with what is left
    if (!$is_exception && @{$parts_ref}) {
        treesubdom($tldtree_ref->{$label}, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref);
    }
}

# Format a label as C byte initialisers for the string table.
#
# Returns a string holding the byte count of the label followed by each
# of its bytes, every value written as "0xNN, " (so the result always
# ends with a comma and a space).
#
#  $str  the label to convert
sub phexstr
{
    # treat the label as raw bytes so multi-byte characters yield their
    # encoded byte sequence rather than code points
    use bytes;

    my ($str) = @_;

    my @bytes = unpack('C*', $str);

    # length prefix first, then the bytes; building the result in one
    # expression avoids concatenating onto an undefined scalar, which
    # warned on every call
    return join('', map { sprintf("0x%02x, ", $_) } scalar(@bytes), @bytes);
}

# Render the pnode initialisers for every child of a node, followed by
# the initialisers of all their descendants.
#
# The children of one parent occupy consecutive output slots; the slots
# for each child's own children are allocated as that child is visited.
#
#  $parent_ref  hash of label => child hash for the node being expanded
#  $strtab_ref  string table mapping each label to its offset
#  $opidx_ref   reference to the next free output index, advanced past
#               every entry generated here
#
# Returns the generated C initialiser text.
sub calc_pnode
{
    my ($parent_ref, $strtab_ref, $opidx_ref) = @_;

    my @labels = keys %{$parent_ref};

    # reserve a contiguous run of slots for this sibling group
    my $first = ${$opidx_ref};
    ${$opidx_ref} += scalar @labels;
    my $last = ${$opidx_ref} - 1;

    my $siblings = ($first == $last)
        ? "\n    /* entry " . $first . " */\n    "
        : "\n    /* entries " . $first . " to " . $last . " */\n    ";
    my $descendants = "";

    # position within the current output row; -1 until the first entry
    my $column = -1;

    for my $label (@labels) {
        my $kids_ref = $parent_ref->{$label};

        # keep rows short: the very first entry gets no separator, a
        # full row is followed by a newline, anything else by a space
        if ($column < 0) {
            $column = 1;
        } elsif ($column == 3) {
            $siblings .= "\n    ";
            $column = 0;
        } else {
            $siblings .= " ";
            $column++;
        }

        my $kid_count = scalar keys %{$kids_ref};
        my $kid_index = 0;
        if ($kid_count != 0) {
            # the index must be captured before recursing because the
            # recursion moves the output index on
            $kid_index = ${$opidx_ref};
            $descendants .= calc_pnode($kids_ref, $strtab_ref, $opidx_ref);
        }

        $siblings .= "{ " . $strtab_ref->{$label} . ", " . $kid_count . ", " . $kid_index . " },";
    }

    return $siblings . $descendants;
}

# main
#
# Read the suffix list and build the label tree and string table that
# the output stages further down print as C data.
binmode(STDOUT, ":utf8");

my $filename = "effective_tld_names.dat";

open(my $fh, '<:encoding(UTF-8)', $filename)
    or die "Could not open file '$filename' $!";

tie my %tldtree, 'Tie::IxHash'; # node tree
my $nodeidx = 1; # count of nodes allowing for the root node
tie my %strtab, 'Tie::IxHash'; # string table
my $stridx = 0;

# put the wildcard match at 0 in the string table
$strtab{'*'} = $stridx;
$stridx += 2;

# put the invert match at 2 in the string table
$strtab{'!'} = $stridx;
$stridx += 2;

# read each line from prefix data and inject into hash tree
while (my $line = <$fh>) {
    chomp $line;

    # any line mentioning "//" is treated as a comment and skipped
    next if $line =~ m{//};

    # A rule is only the text up to the first whitespace; without this
    # a carriage return or trailing blanks would become part of a label.
    my ($rule) = $line =~ /\A(\S*)/;
    next if $rule eq "";

    my @parts = split(/\./, $rule);

    # a line made only of separators yields no labels at all
    next unless @parts;

    # recursive call to build tree from root
    treesubdom(\%tldtree, \$nodeidx, \%strtab, \$stridx, \@parts);
}

# a read error would otherwise silently produce a truncated table
close($fh)
    or die "Could not close file '$filename' $!";

# C program header
#
# <strings.h> is included because the lookup code emitted at the end of
# this script calls strncasecmp(), which POSIX declares there rather
# than in <string.h>.
#
# The heredoc is quoted so nothing in the C text is ever interpolated.
print <<'EOF';
/*
 * Generated with the genpubsuffix tool from effective_tld_names.dat
 */

#include <stdint.h>
#include <string.h>
#include <strings.h>

EOF

# emit the string table
#
# A flat array of characters in which every string is preceded by its
# length; the node table printed afterwards refers to entries by their
# offset into this array. Labels cannot be longer than 63 characters
# so a single byte is ample for the length.

print "static const char stab[" . $stridx . "] = {\n";
for my $label (keys %strtab) {
    # one row per string: the bytes, then the label and offset as a comment
    print "    " . phexstr($label) . "/* " . $label . " " . $strtab{$label} . " */\n";
}
print "};\n\n";

# offsets of the two strings placed in the table before any label
print "enum stab_entities {\n"
    . "    STAB_WILDCARD = 0,\n"
    . "    STAB_EXCEPTION = 2\n"
    . "};\n\n";


# output static node array
#
# The constructed array of nodes has all siblings sequentially and an
# index/count to its children. This yields a very compact data
# structure that is easily traversable.
#
# Additional flags for * (match all) and ! (exception) are omitted as
# they can be inferred by having a node with a label of 0 (*) or 2 (!)
# as the string table has those values explicitly created.

# child_count and child_index are emitted as uint16_t, so refuse to
# generate a table whose indices would be silently truncated.
if ($nodeidx > 65536) {
    die "Too many nodes ($nodeidx) for the 16 bit pnode index fields\n";
}

print "struct pnode {\n";
print "    uint32_t label; /* index of domain element in string table */\n";
print "    uint16_t child_count; /* number of children of this node */\n";
print "    uint16_t child_index; /* index of first child node */\n";
print "};\n\n";

my $opidx = 1; # output index of node

print "static const struct pnode pnodes[" . $nodeidx . "] = {\n";

# root node: its children start at the first output index
my $root_children = scalar keys %tldtree;
print "    /* root entry */\n    { 0," . $root_children . ", " . $opidx . " },";

# all subsequent nodes
print calc_pnode(\%tldtree, \%strtab, \$opidx);
print "\n};\n\n";

# lookup code
#
# Emits the two C functions that walk the generated tables:
#
#  matchlabel()       scans the children of one node for a label,
#                     returning the matching node index or -1
#  getpublicsuffix()  the exported entry point; works through the
#                     hostname label by label starting from the end
#
# The scan for the first label starts on the character before elem_end.
# Starting on elem_end itself went wrong for a hostname with a trailing
# separator: the scan stopped on that separator at once, the label was
# passed on with a length of -1 and only the final label was returned.
#
# The heredoc is quoted so nothing in the C text is ever interpolated.
print <<'EOF';

#define DOMSEP '.'

static int matchlabel(int parent, const char *start, int len)
{
	int clast = pnodes[parent].child_index + pnodes[parent].child_count;
	int cidx; /*child node index */
	int ridx = -1; /* index of match or -1 */

	if (pnodes[parent].child_count != 0) {
		/* there are child nodes present to scan */

		for (cidx = pnodes[parent].child_index; cidx < clast; cidx++) {
			if (pnodes[cidx].label == STAB_WILDCARD) {
				/* wildcard match */
				ridx = cidx;
			} else {
				if ((stab[pnodes[cidx].label] == len) &&
				    (strncasecmp(&stab[pnodes[cidx].label + 1],
						 start,
						 len) == 0)) {

					if ((pnodes[cidx].child_count == 1) &&
					    (pnodes[pnodes[cidx].child_index].label == STAB_EXCEPTION)) {
						/* exception to previous */
						ridx = -1;
					} else {
						ridx = cidx;
					}
					break;
				}
			}
		}
	}
	return ridx;
}

/*
 * Exported public API
 */
const char *getpublicsuffix(const char *hostname)
{
	int treeidx = 0; /* index to current tree node */
	const char *elem_start;
	const char *elem_end;
	int lab_count = 0;

	/* deal with obviously bad hostname */
	if ((hostname == NULL) ||
	    (hostname[0]) == 0 ||
	    (hostname[0] == DOMSEP)) {
		return NULL;
	}

	/* hostnames are ass backwards and we need to consider elemets
	 * from the end first.
	 */
	 elem_end = hostname + strlen(hostname);
	 /* fqdn have a separator on the end */
	 if (elem_end[-1] == DOMSEP) {
		 elem_end--;
	 }
	 elem_start = elem_end - 1;

	 /* extract the element and check for a match in our tree */
	 for(;;) {
		 /* find the start of the element */
		 while ((elem_start > hostname) && (*elem_start != DOMSEP)) {
			 elem_start--;
		 }
		 if (*elem_start == DOMSEP) {
			 elem_start++;
		 }

		 lab_count++;

		 /* search child nodes for label */
		 treeidx = matchlabel(treeidx, elem_start, elem_end - elem_start);
		 if (treeidx == -1) {
			 break;
		 }

		 if (elem_start == hostname) {
			 /* not valid */
			 return NULL;
		 }

		 elem_end = elem_start - 1;
		 elem_start = elem_end - 1;
	 }

	 /* The public suffix algorithm says: "the domain must match
	  * the public suffix plus one additional label." This
	  * requires there to be at least two labels so we need to
	  * check
	  */
	 if (lab_count == 1) {
		 if (elem_start == hostname) {
			 elem_start = NULL;
		 } else {
			 /* strip the non matching part */
			 elem_start -= 2;
			 while (elem_start > hostname && *elem_start != DOMSEP) {
				 elem_start--;
			 }
			 if (*elem_start == DOMSEP)
				 elem_start++;
		 }
	 }


	 return elem_start;
}


EOF