Catch up to 20 years of HTML and URL changes.

2023-01-28 13:12:16 +00:00 · 2023-01-28 13:12:16 +00:00 · e0e5768dcf
parent 2f8c60d111
commit e0e5768dcf
2 changed files with 72 additions and 31 deletions
--- a/share/misc/nanpa.awk
+++ b/share/misc/nanpa.awk
@@ -1,11 +1,49 @@
-# $NetBSD: nanpa.awk,v 1.2 2003/03/13 02:55:01 jhawk Exp $
+# $NetBSD: nanpa.awk,v 1.3 2023/01/28 13:12:16 jmcneill Exp $
 #
 # todo:
-#	parse "http://docs.nanpa.com/cgi-bin/npa_reports/nanpa?
-#	    function=list_npa_introduced" to produce parenthetical
-#	    notes about what area codes are overlayed by others
-#	    (or split from).
+#	parse "https://nationalnanpa.com/nanp1/npa_report.csv"
+#	    instead of scraping HTML.
 #
+function trim(s)
+{
+	gsub(/^[ \t]+|[ \t]+$/, "", s);
+	return s;
+}
+function mapinit(postdb)
+{
+	while ((getline < postdb) > 0) {
+		sub(/#.*/, "");
+		if (length($0)==0) continue;
+		NF=split($0, f);
+		location[f[1]] = f[2];
+		flocation[tolower(f[2])] = f[2];
+		country[f[1]] = f[4];
+		fcountry[tolower(f[2])] = f[4];
+	}
+}
+function countrymap(s)
+{
+	if (s == "CA") return "Canada";
+	if (s == "US") return "USA";
+	return s;
+}
+function locationmap(s,	t)
+{
+	if (s in location) {
+		t = location[s];
+		if (s in country) {
+			t = t " (" countrymap(country[s]) ")";
+		}
+	} else if (tolower(s) in flocation) {
+		t = flocation[tolower(s)];
+		if (tolower(s) in fcountry) {
+			t = t " (" countrymap(fcountry[tolower(s)]) ")";
+		}
+	} else {
+		t = s;
+	}
+	return t;
+}
 function parse(file, ispipe, isplanning,	i, planinit, t)
 {
 	planinit = 0;
@@ -13,30 +51,30 @@ function parse(file, ispipe, isplanning,	i, planinit, t)
 		sub(/#.*/, "");
 		if (length($0)==0) continue;
 		if (isplanning) {
-			split($0, f);
-			if (!planinit && f[2]=="NEW NPA") {
+			NF=split($0, f);
+			if (!planinit && f[2]=="New NPA") {
 				planinit=1;
 				for (i=1; i<=NF; i++)
-					fnames[$i]=i-1;
-			} else if (planinit && length(f[fnames["NEW NPA"]])>1) {
-				t = f[fnames["LOCATION"]] FS;
-				if (f[fnames["OVERLAY?"]]=="Yes")
-				  t = t "Overlay of " f[fnames["OLD NPA"]];
-				else if (f[fnames["OLD NPA"]])
-				  t = t "Split of " f[fnames["OLD NPA"]];
-				if (f[fnames["STATUS"]])
-					t = t " (" f[fnames["STATUS"]] ")";
-				if (length(f[fnames["IN SERVICE DATE"]]) > 1)
+					fnames[f[i]]=i-1;
+			} else if (planinit && length(f[fnames["New NPA"]])>1) {
+				t = locationmap(trim(f[fnames["Location"]])) FS;
+				if (trim(f[fnames["Overlay?"]])=="Yes")
+				  t = t "Overlay of " trim(f[fnames["Old NPA"]]);
+				else if (f[fnames["Old NPA"]])
+				  t = t "Split of " trim(f[fnames["Old NPA"]]);
+				if (f[fnames["Status"]])
+					t = t " (" trim(f[fnames["Status"]]) ")";
+				if (length(f[fnames["In Service Date"]]) > 1)
 					t = t " effective " \
-					    f[fnames["IN SERVICE DATE"]];
-				data[f[fnames["NEW NPA"]] "*"] = t;
+					    trim(f[fnames["In Service Date"]]);
+				data[trim(f[fnames["New NPA"]]) "*"] = t;
 			}
 		} else {
 			# digits only
 			match($0, /^[0-9]/);
 			if (RSTART==0) continue;
 			i=index($0, FS);
-			data[substr($0, 1, i-1)]=substr($0,i+1);
+			data[substr($0, 1, i-1)]=locationmap(trim(substr($0,i+1)));
 		}
 	}
 	close(file);
@@ -44,8 +82,9 @@ function parse(file, ispipe, isplanning,	i, planinit, t)

 BEGIN{
 	FS=":"
+	mapinit("na.postal");
 	print "# $""NetBSD: $";
-	print "# Generated from http://www.nanpa.com/area_codes/index.html";
+	print "# Generated from https://nationalnanpa.com/area_codes/index.html";
 	print "# (with local exceptions)";
 	print "# ";
 	print "# format:";
@@ -54,14 +93,14 @@ BEGIN{
 	print "#   A * in the Area Code field indicates a future area code."
 	print "# ";
 	parse("ftp -o - " \
-	    "http://docs.nanpa.com/cgi-bin/npa_reports/nanpa\\?" \
-	    "function=list_npa_geo_number | sed -f nanpa.sed", 1, 0);
+	    "https://nationalnanpa.com/enas/geoAreaCodeNumberReport.do" \
+	    " | sed -f nanpa.sed", 1, 0);
 	parse("ftp -o - " \
-	    "http://docs.nanpa.com/cgi-bin/npa_reports/nanpa\\?" \
-	    "function=list_npa_non_geo | sed -f nanpa.sed", 1, 0);
+	    "https://nationalnanpa.com/enas/nonGeoNpaServiceReport.do" \
+	    " | sed -f nanpa.sed", 1, 0);
 	parse("ftp -o - " \
-	    "http://docs.nanpa.com/cgi-bin/npa_reports/nanpa\\?" \
-	    "function=list_npa_not_in_service | sed -f nanpa.sed", 1, 1);
+	    "https://nationalnanpa.com/enas/plannedNpasNotInServiceReport.do" \
+	    " | sed -f nanpa.sed", 1, 1);
 	parse("na.phone.add", 0, 0);
 	sort="sort -n";
 	for (i in data)
--- a/share/misc/nanpa.sed
+++ b/share/misc/nanpa.sed
@@ -1,4 +1,4 @@
-# $NetBSD: nanpa.sed,v 1.2 2006/12/25 18:39:48 wiz Exp $
+# $NetBSD: nanpa.sed,v 1.3 2023/01/28 13:12:16 jmcneill Exp $
 #
 # Parse HTML tables output by 
 #   http://docs.nanpa.com/cgi-bin/npa_reports/nanpa
@@ -34,7 +34,7 @@ s/\$$//
 #				Remove lines not starting with <TR>
 /<[Tt][Rr][^>]*>/!d
 #				Replace all <TD> with colon
-s/[ 	]*<TD[^>]*> */:/g
+s/[ 	]*<[Tt][Dd][^>]*> */:/g
 #				Strip all HTML tags
 s/<[^>]*>//g
 #				Handle HTML characters
@@ -42,7 +42,9 @@ s/&nbsp;/ /g
 #				Compress spaces/tabs
 s/[ 	][ 	]*/ /g
 #				Strip leading colons
-s/^://
+s/://
 #				Strip leading/trailing whitespace
-s/^ //
+s/ *//
 s/ $//
+#				Strip HTML comments
+s/^--.*$//