netsurf/utils/genpubsuffix.pl
2015-05-06 23:45:48 +01:00

379 lines
9.7 KiB
Perl

#
# Public suffix C code generator
#
# Copyright 2015 Vincent Sanders <vince@kyllikki.og>
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all
# copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
# This program converts the public suffix list data [1] into a C
# program with static data representation and acessor function.
#
# The actual data list [2] should be placed in a file effective_tld_names.dat
#
# The C program is written to stdout, the typical 160K input file
# generates 500K of program and compiles down to a 100K object file
#
# There is a single exported function
#
# const char *getpublicsuffix(const char *hostname)
#
# This returns the public suffix of the passed hostname or NULL if
# there was an error processing the hostname. The returned pointer is
# within the passed hostname so if the returned pointer is the same as
# hostname the whole hostname is a public suffix otherwise the passed
# hostname has a private part.
#
# The resulting C file is mearly a conversion of the input data (the
# added c code is from this source and licenced under the same terms)
# and imposes no additional copyright above that of the source data
# file.
#
# Note: The pnode structure is built assuming there will never be more
# label nodes than can fit in an unsigned 16 bit value (65535) but as
# there are currently around 7500 nodes there is space for another
# 58,000 before this becomes an issue.
#
# [1] https://publicsuffix.org/
# [2] https://publicsuffix.org/list/effective_tld_names.dat
# debian package for ordered hashes: libtie-ixhash-perl
use strict;
use warnings;
use utf8;
use Tie::IxHash;
sub treesubdom
{
my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref) = @_;
my $domelem = pop @{$parts_ref};
my $isexception = 0;
tie my %node, 'Tie::IxHash'; # this nodes hash
# deal with explicit domain exceptions
$isexception = ($domelem =~ s/\A!//);
if ($isexception != 0) {
$node{"!"} = {};
$$nodeidx_ref += 1;
}
# Update string table
if (! exists $strtab_ref->{$domelem}) {
# add to string table
$strtab_ref->{$domelem} = $$stridx_ref;
{
use bytes;
# update the character count index
$$stridx_ref += length($domelem);
$$stridx_ref += 1; # terminator
}
}
# link new node list into tree
if (! exists $tldtree_ref->{$domelem}) {
$tldtree_ref->{$domelem} = \%node;
$$nodeidx_ref += 1;
}
# recurse down if there are more parts to the domain
if (($isexception == 0) && (scalar(@{$parts_ref}) > 0)) {
treesubdom($tldtree_ref->{$domelem}, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref);
}
}
sub phexstr
{
use bytes;
my ($str) = @_;
my $ret;
my @bytes = unpack('C*', $str);
$ret = $ret . sprintf("0x%02x, ", scalar(@bytes));
foreach (@bytes) {
$ret = $ret . sprintf("0x%02x, ", $_);
}
return $ret;
}
# generate all the children of a parent node and recurse into each of
# those updating optidx to point to the next free node
sub calc_pnode
{
my ($parent_ref, $strtab_ref, $opidx_ref) = @_;
my $our_dat;
my $child_dat = "";
my $startidx = $$opidx_ref;
my $lineidx = -1;
# update the output index to after this node
$$opidx_ref += scalar keys %$parent_ref;
# entry block
if ($startidx == ($$opidx_ref - 1)) {
$our_dat = "\n /* entry " . $startidx . " */\n ";
} else {
$our_dat = "\n /* entries " . $startidx . " to " . ($$opidx_ref - 1) . " */\n ";
}
# iterate over each child element domain/ref pair
while ( my ($cdom, $cref) = each(%$parent_ref) ) {
# make array look pretty by limiting entries per line
if ($lineidx == 3) {
$our_dat .= "\n ";
$lineidx = 0;
} elsif ($lineidx == -1) {
$lineidx = 1;
} else {
$our_dat .= " ";
$lineidx += 1;
}
$our_dat .= "{ ";
$our_dat .= $strtab_ref->{$cdom} . ", ";
my $child_count = scalar keys (%$cref);
$our_dat .= $child_count . ", ";
if ($child_count != 0) {
$our_dat .= $$opidx_ref;
$child_dat .= calc_pnode($cref, $strtab_ref, $opidx_ref);
} else {
$our_dat .= 0;
}
$our_dat .= " },";
}
return $our_dat . $child_dat;
}
# main
binmode(STDOUT, ":utf8");
my $filename = "effective_tld_names.dat";
open(my $fh, '<:encoding(UTF-8)', $filename)
or die "Could not open file '$filename' $!";
tie my %tldtree, 'Tie::IxHash'; # node tree
my $nodeidx = 1; # count of nodes allowing for the root node
tie my %strtab, 'Tie::IxHash'; # string table
my $stridx = 0;
# put the wildcard match at 0 in the string table
$strtab{'*'} = $stridx;
$stridx += 2;
# put the invert match at 2 in the string table
$strtab{'!'} = $stridx;
$stridx += 2;
# read each line from prefix data and inject into hash tree
while (my $line = <$fh>) {
chomp $line;
if (($line ne "") && ($line !~ /\/\/.*$/)) {
# print "$line\n";
my @parts=split("\\.", $line);
# recusrsive call to build tree from root
treesubdom(\%tldtree, \$nodeidx, \%strtab, \$stridx, \@parts);
}
}
# C program header
print <<EOF;
/*
* Generated with the genpubsuffix tool from effective_tld_names.dat
*/
#include <stdint.h>
#include <string.h>
EOF
# output string table
#
# array of characters each string is prefixed with its length and the
# node table below directly indexes emtries. As labels cannot be more
# than 63 characters a byte length is more than sufficient.
print "static const char stab[" . $stridx . "] = {\n";
while ( my ($key, $value) = each(%strtab) ) {
print " " . phexstr($key) . "/* " . $key . " " . $value . " */\n";
}
print "};\n\n";
print "enum stab_entities {\n";
print " STAB_WILDCARD = 0,\n";
print " STAB_EXCEPTION = 2\n";
print "};\n\n";
# output static node array
#
# The constructed array of nodes has all siblings sequentialy and an
# index/count to its children. This yeilds a very compact data
# structure easily traversable.
#
# Additional flags for * (match all) and ! (exception) are omitted as
# they can be infered by having a node with a label of 0 (*) or 2 (!)
# as the string table has those values explicitly created.
print "struct pnode {\n";
print " uint32_t label; /* index of domain element in string table */\n";
print " uint16_t child_count; /* number of children of this node */\n";
print " uint16_t child_index; /* index of first child node */\n";
print "};\n\n";
my $opidx = 1; # output index of node
print "static const struct pnode pnodes[" . $nodeidx . "] = {\n";
# root node
print " /* root entry */\n { 0," . scalar keys(%tldtree) . ", " . $opidx . " },";
# all subsequent nodes
print calc_pnode(\%tldtree, \%strtab, \$opidx);
print "\n};\n\n";
# lookup code
print <<EOF;
#define DOMSEP '.'
static int matchlabel(int parent, const char *start, int len)
{
int clast = pnodes[parent].child_index + pnodes[parent].child_count;
int cidx; /*child node index */
int ridx = -1; /* index of match or -1 */
if (pnodes[parent].child_count != 0) {
/* there are child nodes present to scan */
for (cidx = pnodes[parent].child_index; cidx < clast; cidx++) {
if (pnodes[cidx].label == STAB_WILDCARD) {
/* wildcard match */
ridx = cidx;
} else {
if ((stab[pnodes[cidx].label] == len) &&
(strncasecmp(&stab[pnodes[cidx].label + 1],
start,
len) == 0)) {
if ((pnodes[cidx].child_count == 1) &&
(pnodes[pnodes[cidx].child_index].label == STAB_EXCEPTION)) {
/* exception to previous */
ridx = -1;
} else {
ridx = cidx;
}
break;
}
}
}
}
return ridx;
}
/*
* Exported public API
*/
const char *getpublicsuffix(const char *hostname)
{
int treeidx = 0; /* index to current tree node */
const char *elem_start;
const char *elem_end;
int lab_count = 0;
/* deal with obviously bad hostname */
if ((hostname == NULL) ||
(hostname[0]) == 0 ||
(hostname[0] == DOMSEP)) {
return NULL;
}
/* hostnames are ass backwards and we need to consider elemets
* from the end first.
*/
elem_end = hostname + strlen(hostname);
/* fqdn have a separator on the end */
if (elem_end[-1] == DOMSEP) {
elem_end--;
}
elem_start = elem_end;
/* extract the element and check for a match in our tree */
for(;;) {
/* find the start of the element */
while ((elem_start > hostname) && (*elem_start != DOMSEP)) {
elem_start--;
}
if (*elem_start == DOMSEP) {
elem_start++;
}
lab_count++;
/* search child nodes for label */
treeidx = matchlabel(treeidx, elem_start, elem_end - elem_start);
if (treeidx == -1) {
break;
}
if (elem_start == hostname) {
/* not valid */
return NULL;
}
elem_end = elem_start - 1;
elem_start = elem_end - 1;
}
/* The public suffix algorithm says: "the domain must match
* the public suffix plus one additional label." This
* requires there to be at least two labels so we need to
* check
*/
if (lab_count == 1) {
if (elem_start == hostname) {
elem_start = NULL;
} else {
/* strip the non matching part */
elem_start -= 2;
while (elem_start > hostname && *elem_start != DOMSEP) {
elem_start--;
}
if (*elem_start == DOMSEP)
elem_start++;
}
}
return elem_start;
}
EOF