#!/usr/bin/perl -w
# Copyright (c) 2010-2017 Sullivan Beck. All rights reserved.
# This program is free software; you can redistribute it and/or modify it
# under the same terms as Perl itself.
###############################################################################
###############################################################################
# This script is used to harvest data from the various standards and use that
# data to automatically generate the Locale::Codes module containing that data.
require 5.000000;
use YAML;
use IO::File;
use strict;
use warnings;
use Archive::Zip;
use Encode;
use Text::CSV::Slurp;
use Spreadsheet::XLSX;
use Text::Iconv;
use lib "./internal";
our $VERSION;
$VERSION='3.53';
# Some required executables
my @exe = qw( wget xls2csv );
###############################################################################
# GLOBAL VARIABLES
###############################################################################
# We need to create the following variables:
#
# %ID2Names{COUNTRY_ID} => [ COUNTRY, COUNTRY, ... ]
# A list of all valid country names that
# correspond to a given COUNTRY_ID.
# The names are all real (i.e. correct
# spelling and capitalization).
# %Alias{ALIAS} => [ COUNTRY_ID, I ]
# A hash of all aliases for a country.
# Aliases are all lowercase. It is
# the I'th entry in the list of countries.
# %Code2ID{CODESET}{CODE} => [ COUNTRY_ID, I ]
# In a given CODESET, CODE corresponds to
# the I'th entry in the list of countries.
# %ID2Code{CODESET}{COUNTRY_ID} => CODE
# In the given CODESET, the COUNTRY_ID
# corresponds to the given CODE.
#
# %Data is a complete description of changes that need to be made to the
# raw data to turn it into the form used by the module.
#
# $Data{TYPE}{SOURCE} = SOURCE_DESCRIPTION
# TYPE is the type of codeset (i.e. country, language)
# SOURCE is the source of data (i.e. iso, iana)
# SOURCE_DESCRIPTION is a hash as described below.
#
# $Data{TYPE}{SOURCE}{'orig'}{KEY}{ORIG_VALUE} => NEW_VALUE
# KEY is either the name of one of the codesets (i.e. alpha2) or 'name'.
# ORIG_VALUE is the value exactly as it is read in from the original source.
# NEW_VALUE is the value expressed the way it should be in this module.
#
# $Data{TYPE}{SOURCE}{'ignore'}{KEY}{VALUE} => 1
# VALUE is one possible value for that KEY. If an element is read in
# with KEY having this VALUE, the element is ignored.
#
# $Data{TYPE}{SOURCE}{'new'}{NAME} => 1
# This permits the source to add a new element named NAME.
# The first source is automatically permitted to add all elements
# contained in it... all others must be explicitly permitted.
#
# $Data{TYPE}{'link'} => [ [ NAME1a, NAME1b, ... ] [ NAME2a, NAME2b, ... ] ... ]
# Links all of NAMEi together (i.e. they are different names for the
# same element).
# $Data{TYPE}{'alias'}{ALIAS} => NAME
# Generated from 'link'.
our($ModDir,$Module,$ID,%ID2Names,%Alias,%Code2ID,%ID2Code,%Std,%Data);
$ModDir = "lib/Locale/Codes";
########################################
# COUNTRY
our $country_iso_url = "http://www.iso.org/iso/home/standards/country_codes.htm";
# IANA publishes a list of codes. The country names must be looked up in an
# extended list of ISO 3166 codes.
our $country_iana_url = "http://www.iana.org/domains/root/db/";
our $country_un_url = "https://unstats.un.org/unsd/methodology/m49/";
our $country_genc_url = "https://nsgreg.nga.mil/genc/discovery";
require "data.country.pl";
########################################
# LANGUAGE
our $language_iso2_url = "http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt";
our $language_iso5_url = "http://www.loc.gov/standards/iso639-5/id.php";
our $language_iana_url = "http://www.iana.org/assignments/language-subtag-registry";
require "data.language.pl";
########################################
# CURRENCY
our $currency_iso_url = "http://www.currency-iso.org/dam/downloads/lists/list_one.xls";
require "data.currency.pl";
########################################
# SCRIPT
our $script_iso_url = "http://www.unicode.org/iso15924/iso15924.txt.zip";
our $script_iso_zip = qr/^iso15924/;
our $script_iana_url = $language_iana_url;
require "data.script.pl";
########################################
# LANGUAGE EXTENSIONS
our $langext_iana_url = $language_iana_url;
require "data.langext.pl";
########################################
# LANGUAGE VARIATIONS
our $langvar_iana_url = $language_iana_url;
require "data.langvar.pl";
########################################
# LANGUAGE FAMILIESS
our $langfam_iso_url = "http://www.loc.gov/standards/iso639-5/id.php";
require "data.langfam.pl";
# ########################################
# # REGIONS
# #
# # IANA language registration
# #
# # Data available consists of the script names and 2-letter and
# # 3-letter codes. Script names include non-ASCII characters encoded in
# # UTF-8.
# #
# our($region_iana_url,%region_iana_orig,%region_iana_ignore);
# $region_iana_url = $language_iana_url;
# require "data.region.pl";
###############################################################################
# HELP
###############################################################################
our($usage);
my $COM = $0;
$COM =~ s/^.*\///;
$usage=
"usage: $COM OPTIONS
-h/--help : Print help.
-a/--all : Do all steps
-c/--country : Get the country codes
-l/--language : Get the language codes
-r/--currency : Get the currency codes
-s/--script : Get the script codes
-L/--langext : Get the language extension codes
-V/--langvar : Get the language variation codes
-F/--langfam : Get the language family codes
";
###############################################################################
# PARSE ARGUMENTS
###############################################################################
my $do_all = 0;
my $do_country = 0;
my $do_language = 0;
my $do_currency = 0;
my $do_script = 0;
my $do_langext = 0;
my $do_langvar = 0;
my $do_langfam = 0;
while ($_ = shift) {
(print $usage), exit if ($_ eq "-h" || $_ eq "--help");
$do_all = 1, next if ($_ eq "-a" || $_ eq "--all");
$do_country = 1, next if ($_ eq "-c" || $_ eq "--country");
$do_language = 1, next if ($_ eq "-l" || $_ eq "--language");
$do_currency = 1, next if ($_ eq "-r" || $_ eq "--currency");
$do_script = 1, next if ($_ eq "-s" || $_ eq "--script");
$do_langext = 1, next if ($_ eq "-L" || $_ eq "--langext");
$do_langvar = 1, next if ($_ eq "-V" || $_ eq "--langvar");
$do_langfam = 1, next if ($_ eq "-F" || $_ eq "--langfam");
}
############################################################################
# MAIN PROGRAM
############################################################################
foreach my $exe (@exe) {
if (system("which $exe > /dev/null") != 0) {
die "ERROR: required executable not found: $exe\n";
}
}
$ID = "0001";
%ID2Names = ();
%Alias = ();
%Code2ID = ();
%ID2Code = ();
%Std = ();
do_country() if ($do_all || $do_country);
do_language() if ($do_all || $do_language);
do_currency() if ($do_all || $do_currency);
do_script() if ($do_all || $do_script);
do_langext() if ($do_all || $do_langext);
do_langvar() if ($do_all || $do_langvar);
do_langfam() if ($do_all || $do_langfam);
############################################################################
# DO_COUNTRY
############################################################################
sub do_country {
print "Country codes...\n";
$Module = "Country";
_do_codeset('country','iso', ['alpha-2','alpha-3','numeric'],
['alpha-2','alpha-3','numeric']);
_do_codeset('country','iana', ['dom'],
['dom']);
_do_codeset('country','un', ['un-numeric','un-alpha-3'],
['un-numeric','un-alpha-3']);
_do_codeset('country','genc', ['genc-alpha-2','genc-alpha-3','genc-numeric'],
['genc-alpha-2','genc-alpha-3','genc-numeric']);
do_aliases("country");
write_module("country");
}
########################################
#
# GENC
#
# The GENC web page contains a set of country codes which is very
# similar to the ISO codes, but contains some differences. As a result,
# this is a separate list.
#
# File format is:
#
# <tr ...>
# <td ...>
# <table width="100%">
# <tr>
# <td ...><a ...><font ...>2-char<br/>Code</a></td>
# <td ...><a ...><img ...></a></td>
# </tr>
# </table>
# </td>
#
# <td ...>
# <table width="100%">
# <tr>
# <td ...><a ...><font ...>3-char<br/>Code</a></td>
# <td ...><a ...><img ...></a></td>
# </tr>
# </table>
# </td>
#
# <td ...>
# <table width="100%">
# <tr>
# <td ...><a ...><font ...>Numeric<br/>Code</a></td>
# <td ...><a ...><img ...></a></td>
# </tr>
# </table>
# </td>
#
# <td ...>
# <table width="100%">
# <tr>
# <td ...><a ...><font ...>Name</a></td>
# <td ...><a ...><img ...></a></td>
# </tr>
# </table>
# </td>
#
# <td ...>
# <table width="100%">
# <tr>
# <td ...><a ...><font ...>U.S. Recognition</a></td>
# <td ...><a ...><img ...></a></td>
# </tr>
# </table>
# </td>
#
# <td ...>
# <table width="100%">
# <tr>
# <td ...><a ...><font ...>GENC<br/>Status</a></td>
# <td ...><a ...><img ...></a></td>
# </tr>
# </table>
# </td>
# </tr>
#
# <tr ...>
#
# <td ...>
# <a ...><font ...>AF</a>
# </td>
#
# <td ...>
# <a ...><font ...>AFG</a>
# </td>
#
# <td ...>
# <a ...><font ...>004</a>
# </td>
#
# <td ...>
# <a ...><font ...>AFGHANISTAN</a>
# </td>
#
# <td ...><span ...>Independent</span></td>
# <td ...><span ...>Exception</span></td>
# </tr>
{
my $in;
sub _init_country_genc {
$in = _read_file('url' => $country_genc_url,
'type' => 'html',
'as_list' => 0,
'html_strip' => [ qw(a font img br span) ],
'html_repl' => [ qw( ) ],
);
# Look for a table who's first row has the header:
# Country or area name
my $found = jump_to_row(\$in,"U.S. Recognition",1);
if (! $found) {
die "ERROR [genc]: country code file format changed!\n";
}
}
sub _read_country_genc {
while (1) {
my @row = get_row("genc",\$in);
return () if (! @row);
my($alpha2,$alpha3,$num,$country) = @row;
my($id,$i);
if (exists $Code2ID{'alpha-2'}{lc($alpha2)}) {
($id,$i) = @{ $Code2ID{'alpha-2'}{lc($alpha2)} };
}
if (exists $Code2ID{'alpha-3'}{lc($alpha3)}) {
if (! defined($id)) {
print "WARNING [genc]: Code mismatch (alpha-3 defined, alpha-2 not): $country\n";
next;
}
my($id2,$i2) = @{ $Code2ID{'alpha-3'}{lc($alpha3)} };
if ($id ne $id2) {
print "WARNING [genc]: Code mismatch (alpha-3 != alpha-2): $country\n";
next;
}
}
if (exists $Code2ID{'numeric'}{$num}) {
if (! defined($id)) {
print "WARNING [genc]: Code mismatch (numeric defined, alpha-2 not): $country\n";
next;
}
my($id2,$i2) = @{ $Code2ID{'numeric'}{$num} };
if ($id ne $id2) {
print "WARNING [genc]: Code mismatch (numeric != alpha-2): $country\n";
next;
}
}
my @country;
if (exists $Alias{lc($country)}) {
my($id2,$i2) = @{ $Alias{lc($country)} };
if (! defined($id)) {
($id,$i) = ($id2,$i2);
} elsif ($id ne $id2) {
print "WARNING [genc]: Code mismatch (alias incorrect): $country\n";
next;
}
my @name = @{ $ID2Names{$id} };
@country = ($name[$i]);
} elsif (defined($id)) {
my @name = @{ $ID2Names{$id} };
@country = (_country_name($country),
@name);
} else {
@country = _country_name($country);
}
return ($alpha2,$alpha3,$num,@country);
}
}
}
########################################
#
# UN
#
# The United Nations web page contains a set of country codes which is very
# similar to the ISO Alpha-3 codes, but contains some differences. As a result,
# this is a separate list.
#
# File format is:
#
# <table border=0 cellpadding=2 cellspacing=0>
# <tbody>
# <tr>
# <td align=left valign=top class="theader" width="66"><div align="left"><strong>Numerical<br>
# code</strong></div></td>
# <td valign=top class="theader" width="312"><strong> Country
# or area name</strong></td>
# <td valign=top class="theader" width="121"><strong>ISO ALPHA-3</strong><strong>
# code</strong></td>
# </tr>
# <tr>
# <td width="66" align=middle valign=top class="lcont">
# <p align=left>004 </p> </td>
# <td width="312" valign=top class="lcont">
# <p>Afghanistan </p> </td>
# <td width="121" valign=top class="lcont">
# <p>AFG </p> </td>
# </tr>
{
my $in;
sub _init_country_un {
$in = _read_file('url' => $country_un_url,
'type' => 'html',
'as_list' => 0,
'html_strip' => [ qw(p div strong br) ],
'html_repl' => [ qw( ) ],
);
# Look for a table who's first row has the header:
# Country or area name
my $found = jump_to_row(\$in,"Country or Area");
if (! $found) {
die "ERROR [un]: country code file format changed!\n";
}
}
sub _read_country_un {
while (1) {
my @row = get_row("un",\$in);
return () if (! @row);
my($country,$num,$alpha) = @row;
my($id,$i);
if (exists $Code2ID{'alpha-3'}{lc($alpha)}) {
my($id1,$i1) = @{ $Code2ID{'alpha-3'}{lc($alpha)} };
if (exists $Code2ID{'numeric'}{$num}) {
my($id2,$i2) = @{ $Code2ID{'numeric'}{$num} };
if ($id1 ne $id2) {
print "WARNING [un]: UN/ISO code alpha/numeric mismatch: $country\n";
next;
}
($id,$i) = ($id1,$i1);
} else {
print "WARNING [un]: UN/ISO code mismatch (alpha defined): $country\n";
next;
}
} elsif (exists $Code2ID{'numeric'}{$num}) {
print "WARNING [un]: UN/ISO code mismatch (numeric defined): $country\n";
next;
}
my @country;
if (exists $Alias{lc($country)}) {
my($id2,$i2) = @{ $Alias{lc($country)} };
if (! defined($id)) {
($id,$i) = ($id2,$i2);
} elsif ($id ne $id2) {
print "WARNING [un]: UN/ISO code mismatch: $country\n";
next;
}
my @name = @{ $ID2Names{$id} };
@country = ($name[$i]);
} elsif (defined($id)) {
my @name = @{ $ID2Names{$id} };
@country = (_country_name($country),
@name);
} else {
@country = _country_name($country);
}
return ($num,$alpha,@country);
}
}
}
########################################
#
# ISO 3166-1
#
# The standard contains the alpha-2, alpha-3, and numeric codes. This
# is the official source of these codes.
#
# File format:
# =================
# Country name
# Country french name
# alpha-2
# alpha-3
# numeric
# =================
#
{
my $in;
sub _init_country_iso {
my $inst = qq
(Please download the data manually for ISO 3166 country codes.
Go to the following URL:
$country_iso_url
Click on:
'Online Browsing Platform'
'Officially assigned codes'
300 results per page
Select the entire chart (not including the header). If not all of the
countries fit on a single page, do it in multiple steps.
);
$in = _read_file('type' => 'manual',
'inst' => $inst,
'as_list' => 1,
);
}
sub _read_country_iso {
while (@$in) {
my $name = shift(@$in); shift(@$in);
shift(@$in); shift(@$in);
my $alpha2 = lc(shift(@$in)); shift(@$in);
my $alpha3 = lc(shift(@$in)); shift(@$in);
my $num = shift(@$in);
$name =~ s/\(the/\(The/;
return($alpha2,$alpha3,$num,_country_name($name));
}
return ();
}
}
# This takes some common country name formats and produces common aliases.
#
sub _country_name {
my($name) = @_;
my @ret;
if ($name =~ /^(.+), The (.+?) of$/ ||
$name =~ /^(.+) \(The (.+?) of\)$/) {
# NAME1, The NAME2 of
# NAME1 (The NAME2 of) =>
# The NAME2 of NAME1
# NAME2 of NAME1
my($n1,$n2) = ($1,$2);
push(@ret,"$n1, The $n2 of",
"$n1 (The $n2 of)",
"$n1, $n2 of",
"$n1 ($n2 of)",
"The $n2 of $n1",
"$n2 of $n1");
} elsif ($name =~ /^(.+), (.+?) of$/ |\
$name =~ /^(.+), \((.+?) of\)$/) {
# NAME1, NAME2 of
# NAME1, (NAME2 of) =>
# NAME2 of NAME1
my($n1,$n2) = ($1,$2);
push(@ret,"$n1, $n2 of",
"$n1 ($n2 of)",
"$n2 of $n1");
} elsif ($name =~ /^(.+), The$/ ||
$name =~ /^(.+) \(The\)$/) {
# NAME, The
# NAME (The) =>
# The NAME
# NAME
my($n1) = ($1);
push(@ret,$n1,
"The $n1",
"$n1, The",
"$n1 (The)");
# } elsif ($name =~ /^The (.+?) of (.+)$/) {
# # The NAME2 of NAME1
# my($n2,$n1) = ($1,$2);
# push(@ret,"$n1, The $n2 of",
# "$n1 (The $n2 of)",
# "$n1, $n2 of",
# "$n1 ($n2 of)",
# "The $n2 of $n1",
# "$n2 of $n1");
# } elsif ($name =~ /^(.+?) of (.+)$/) {
# # NAME2 of NAME1
# my($n2,$n1) = ($1,$2);
# push(@ret,"$n1, $n2 of",
# "$n1 ($n2 of)",
# "$n2 of $n1");
# } elsif ($name =~ /^The (.+)$/) {
# # The NAME
# my($n1) = ($1);
# push(@ret,$n1,
# "The $n1",
# "$n1, The",
# "$n1 (The)");
} else {
push(@ret,$name);
}
return @ret;
}
########################################
#
# IANA Domain Registry
#
# The IANA domain registry is the official source of domain management.
# The codes are stored in the IANA URL, but the country names must be
# read from the extended ISO list.
#
# File format for the IANA URL:
# ============
# <tr ...>
# <th>Domain</th>
# <th>Type</th>
# <th>Sponsoring Organisation</th>
# </tr>
# <tr ...>
# <td><span ...><a ...>.AD</a></span></td>
# <td>country-code</td>
# ...
# </tr>
# ============
#
# The extended ISO list is of the format:
# ============
# <tr ...>
# <th ...>Code</th>
# <th ...>Name</th>
# <th ...>Remark</th>
# <th ...>Status</th>
# </tr>
# <tr ...>
# <td ...><a ...></a>AD</td>
# <td ...>NAME</td>
# <td ...>...</td>
# <td ...>...</td>
# </tr>
# ============
{
my $in;
my %codes;
sub _init_country_iana {
#
# Get the extended ISO list first as a hash:
# $codes{CODE} = NAME
#
foreach my $code (keys %{ $Code2ID{'alpha-2'} }) {
my($id,$idx) = @{ $Code2ID{'alpha-2'}{$code} };
my $name = $ID2Names{$id}[$idx];
$codes{$code} = $name;
}
#
# The actual IANA list
#
$in = _read_file('url' => $country_iana_url,
'type' => 'html',
'as_list' => 0,
'html_strip' => [ qw(a span) ],
);
# Look for a table who's first row has the header:
# Sponsoring Organisation
my $found = jump_to_row(\$in,"Sponsoring Organisation");
if (! $found) {
die "ERROR [iana]: country code file format changed!\n";
}
}
sub _read_country_iana {
while (1) {
my @row = get_row("iana",\$in);
return () if (! @row);
my($dom,$type,$tmp) = @row;
next unless ($type eq "country-code" &&
$dom =~ /^\.[a-z][a-z]/);
$dom =~ s/^\.//;
my @country;
if (exists $Code2ID{'alpha-2'}{$dom}) {
my ($id,$i) = @{ $Code2ID{'alpha-2'}{$dom} };
my @name = @{ $ID2Names{$id} };
@country = ($name[$i]);
} elsif (exists $codes{$dom}) {
@country = _country_name($codes{$dom});
} else {
next;
}
return ($dom,@country);
}
}
}
############################################################################
# DO_LANGUAGE
############################################################################
sub do_language {
print "Language codes...\n";
$Module = "Language";
_do_codeset('language','iso2', ['alpha-3','term','alpha-2'],
['alpha-3','term','alpha-2']);
_do_codeset('language','iso5', ['alpha-3'],
['alpha-3'],'allow');
_do_codeset('language','iana', ['alpha-2','alpha-3'],
['alpha-2','alpha-3'],'allow');
do_aliases("language");
write_module("language");
}
########################################
#
# The official ISO 639.
#
# Data available consists of the language names and 2-letter and
# 3-letter codes. Language names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#
{
my $in;
sub _init_language_iso2 {
$in = _read_file('url' => $language_iso2_url,
'as_list' => 1,
'encoding' => 'UTF-8',
);
}
sub _read_language_iso2 {
# File is a set of lines of fields delimited by "|". Fields are:
#
# alpha3
# term
# alpha2
# English names (semicolon separated list)
# French name
while (@$in) {
my $line = shift(@$in);
next if (! $line);
my($alpha3,$term,$alpha2,$language,$french) = split(/\|/,$line);
# The first line has some binary characters at the start.
if (length($alpha3)>3) {
$alpha3 = substr($alpha3,length($alpha3)-3);
}
my @language = split(/\s*;\s*/,$language);
$term = $alpha3 if (! $term);
return ($alpha3,$term,$alpha2,@language);
}
return ();
}
}
########################################
{
my $in;
sub _init_language_iso5 {
$in = _read_file('url' => $language_iso5_url,
'as_list' => 0,
);
# Look for a table who's first row has the header:
# Identifier
my $found = jump_to_row(\$in,'Identifier');
if (! $found) {
die "ERROR [iso5]: language code file format changed!\n";
}
}
sub _read_language_iso5 {
while (1) {
my @row = get_row("iso5",\$in);
return () if (! @row);
my($alpha3,$language) = @row;
next if (! $language);
if ($alpha3 && $alpha3 !~ /^[a-z][a-z][a-z]$/) {
print "WARNING [iso5]: Invalid alpha-3 code: $language => $alpha3\n";
next;
}
return ($alpha3,$language);
}
}
}
########################################
###
### The IANA language registration data is used to check:
### alpha-2, alpha-3
###
#
# Each entry is of the form:
# %%
# Type: language
# Subtag: aa
# Description: Afar
# Description: Afar 2
# Added: 2005-10-16
# Deprecated: 2009-01-01
#
# Ignore them if they're deprecated. We're only doing type 'language' here.
{
my $in;
sub _init_language_iana {
$in = _read_file('url' => $language_iana_url,
'as_list' => 1,
);
shift(@$in) until ($$in[0] eq '%%');
}
sub _read_language_iana {
while (1) {
my %entry = _iana_entry($in,'language');
last if (! %entry);
my(@language,$code,$alpha2,$alpha3);
$code = $entry{'Subtag'};
foreach my $language (@{ $entry{'Description'} }) {
push(@language,$language);
}
if (length($code) == 2) {
$alpha2 = lc($code);
} else {
$alpha3 = lc($code);
}
return ($alpha2,$alpha3,@language);
}
return ();
}
}
########################################
# Read the next entry from the IANA file
sub _iana_entry {
my ($in,@type) = @_;
my %type = map { $_,1 } @type;
my %entry;
while (1) {
%entry = ();
return %entry if (! @$in);
# Read an entire entry (starting with '%%' and ending
# just before the next '%%'.
#
# Long lines may be split (and all lines but the first
# are indented)
my $oldkey;
shift(@$in);
while (@$in && $$in[0] ne '%%') {
my $line = shift(@$in);
while (@$in &&
$$in[0] =~ /^\s+/) {
$$in[0] =~ s/^\s+//;
$line .= " $$in[0]";
shift(@$in);
}
$line =~ /^(.*?):\s*(.*)$/;
my($key,$val) = ($1,$2);
if ($key eq 'Description') {
if (exists $entry{$key}) {
push( @{ $entry{$key} },$val );
} else {
$entry{$key} = [ $val ];
}
} else {
$entry{$key} = $val;
}
}
# If the entry is deprecated, or the wrong type,
# read the next one.
next if (! %entry ||
exists $entry{'Deprecated'} ||
! exists $entry{'Type'} ||
! exists $type{ $entry{'Type'} });
return %entry;
}
}
############################################################################
# DO_CURRENCY
############################################################################
sub do_currency {
print "Currency codes...\n";
$Module = "Currency";
_do_codeset('currency','iso', ['alpha','num'], ['alpha','num']);
do_aliases("currency");
write_module("currency");
}
########################################
###
### The first set we'll do is the ISO 4217 codes.
###
{
my $in;
sub _init_currency_iso {
$in = _read_file('url' => $currency_iso_url,
'head' => 'ENTITY',
'as_list' => 1,
'type' => 'xls',
'join' => 1,
'encoding' => 'UTF-8',
);
}
sub _read_currency_iso {
while (@$in) {
my $ele = shift(@$in);
next if (! $ele);
my $currency = $$ele{'Currency'};
my $alpha = $$ele{'Alphabetic Code'};
my $num = $$ele{'Numeric Code'};
$num = "" if (! defined($num));
$currency = "" if (! defined($currency));
$alpha = "" if (! defined($alpha));
$currency =~ s/\s+$//;
if ($num) {
$num = "0$num" while (length($num) < 3);
if ($num !~ /^\d\d\d+$/) {
print "WARNING [iso]: Invalid numeric code: $currency => $num\n";
next;
}
}
$alpha = uc($alpha);
if ($alpha && $alpha !~ /^[A-Z][A-Z][A-Z]$/) {
print "WARNING [iso]: Invalid alpha code: $currency => $alpha\n";
next;
}
next if (! $alpha && ! $num);
return ($alpha,$num,$currency);
}
return ();
}
}
############################################################################
# DO_SCRIPT
############################################################################
sub do_script {
print "Script codes...\n";
$Module = "Script";
_do_codeset('script','iso', ['alpha','num'], ['alpha','num']);
_do_codeset('script','iana', ['alpha'], ['alpha'], 'allow');
do_aliases("script");
write_module("script");
}
########################################
# We'll first read data from the official ISO 15924.
#
# Data available consists of the script names and 2-letter and
# 3-letter codes. Script names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#
# The zip file contains a series of lines in the form:
# alpha;numeric;english;...
# The data is in UTF-8.
#
# Every line has an unprintable character at the end.
#
{
my $in;
sub _init_script_iso {
$in = _read_file('url' => $script_iso_url,
'as_list' => 1,
'type' => 'zip',
'file' => $script_iso_zip,
'chop' => 1,
);
}
sub _read_script_iso {
while (@$in) {
my $line = shift(@$in);
next if (! $line || $line =~ /^\043/);
my($alpha,$num,$script) = split(/;/,$line);
return ($alpha,$num,$script);
}
return ();
}
}
########################################
###
### The IANA script registration data is used to check:
### alpha
###
# Each entry is of the form:
# %%
# Type: script
# Subtag: Elba
# Description: Elbasan
# Added: 2005-10-16
# Deprecated: 2009-01-01
#
# Ignore them if they're deprecated. We're only doing type 'script' here.
{
my $in;
sub _init_script_iana {
$in = _read_file('url' => $script_iana_url,
'as_list' => 1,
);
shift(@$in) until ($$in[0] eq '%%');
}
sub _read_script_iana {
while (1) {
my %entry = _iana_entry($in,'script');
last if (! %entry);
my(@script,$alpha);
$alpha = $entry{'Subtag'};
foreach my $script (@{ $entry{'Description'} }) {
push(@script,$script);
}
return ($alpha,@script);
}
return ();
}
}
############################################################################
# DO_LANGEXT
############################################################################
sub do_langext {
print "LangExt codes...\n";
$Module = "LangExt";
_do_codeset('langext','iana', ['alpha'], ['alpha']);
do_aliases("langext");
write_module("langext");
}
########################################
#
# IANA language registration
#
# Data available consists of the script names and 2-letter and
# 3-letter codes. Script names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#
###
### The IANA langext registration data is used to check:
### alpha
###
# Each entry is of the form:
# %%
# Type: extlang
# Subtag: aao
# Description: Algerian Saharan Arabic
# Prefix: ar
# Added: 2005-10-16
# Deprecated: 2009-01-01
#
# Ignore them if they're deprecated. We're only doing type 'extlang' here.
{
my $in;
sub _init_langext_iana {
$in = _read_file('url' => $langext_iana_url,
'as_list' => 1,
);
shift(@$in) until ($$in[0] eq '%%');
}
sub _read_langext_iana {
while (1) {
my %entry = _iana_entry($in,'extlang');
last if (! %entry);
my(@langext,$alpha);
$alpha = $entry{'Subtag'};
foreach my $langext (@{ $entry{'Description'} }) {
push(@langext,$langext);
}
return ($alpha,@langext);
}
return ();
}
}
############################################################################
# DO_LANGVAR
############################################################################
sub do_langvar {
print "LangVar codes...\n";
$Module = "LangVar";
_do_codeset('langvar','iana', ['alpha'], ['alpha']);
do_aliases("langvar");
write_module("langvar");
}
########################################
#
# IANA language registration
#
# Data available consists of the script names and 2-letter and
# 3-letter codes. Script names include non-ASCII characters encoded in
# UTF-8. And (amazingly enough) it's available in a field delimited file!!!
#
###
### The IANA langvar registration data is used to check:
### alpha
###
# Each entry is of the form:
# %%
# Type: variant
# Subtag: 1901
# Description: Traditional German orthography
# Added: 2005-10-16
# Prefix: de
# Deprecated: 2009-01-01
#
# Ignore them if they're deprecated. We're only doing type 'variant' here.
{
my $in;
sub _init_langvar_iana {
$in = _read_file('url' => $langvar_iana_url,
'as_list' => 1,
);
shift(@$in) until ($$in[0] eq '%%');
}
sub _read_langvar_iana {
while (1) {
my %entry = _iana_entry($in,'variant');
last if (! %entry);
my(@langvar,$alpha);
$alpha = $entry{'Subtag'};
foreach my $langvar (@{ $entry{'Description'} }) {
push(@langvar,$langvar);
}
return ($alpha,@langvar);
}
return ();
}
}
############################################################################
# DO_LANGFAM
############################################################################
sub do_langfam {
print "LangFam codes...\n";
$Module = "LangFam";
_do_codeset('langfam','iso', ['alpha'], ['alpha']);
do_aliases("langfam");
write_module("langfam");
}
########################################
#
# ISO 639-5
#
# <table class="Dynamic639-5OutputTables" ... >
# <tr valign="top">
# <th scope="col">Identifier<br />Indicatif</th>
# <th scope="col">English name<br />Nom anglais</th>
# <th scope="col">French name<br />Nom français</th>
# <th scope="col">639-2</th>
# <th scope="col">Hierarchy<br />Hiérarchie</th>
# <th scope="col">Notes<br />Notes</th>
# </tr>
# <tr>
# <td scope="row">aav</td>
# <td>Austro-Asiatic languages</td>
# <td>austro-asiatiques, langues</td>
# <td>
# <br />
# </td>
# <td>aav</td>
# <td>
# <br />
# </td>
# </tr>
#
# ...
#
# <tr valign="top">
# <td colspan="6">
# <ol class="loweralpha">
{
my $in;
sub _init_langfam_iso {
$in = _read_file('url' => $langfam_iso_url,
'type' => 'html',
'as_list' => 0,
'html_strip' => [ qw(br p strong div) ],
'html_repl' => [ qw( ) ],
);
# Look for a table who's first row has the header:
# Identifier
my $found = jump_to_row(\$in,"Identifier");
if (! $found) {
die "ERROR [iso]: language family code file format changed!\n";
}
}
sub _read_langfam_iso {
while (1) {
my @row = get_row("iso",\$in);
return () if (! @row);
my($alpha,$langfam) = @row;
return () if ($alpha =~ /class="loweralpha"/);
if (! $alpha || ! $langfam) {
$alpha = '' if (! $alpha);
$langfam = '' if (! $langfam);
print "WARNING [iso]: Invalid langfam code: $langfam => $alpha\n";
next;
}
$alpha = lc($alpha);
if ($alpha !~ /^[a-z][a-z][a-z]$/) {
print "WARNING [iso]: Invalid alpha code: $langfam => $alpha\n";
next;
}
return($alpha,$langfam);
}
}
}
############################################################################
# PRINT_TABLE
############################################################################
sub _type_hashes {
my($caller) = @_;
return($Data{$caller}{'alias'});
}
############################################################################
# CHECK CODES
############################################################################
sub check_code {
my($type,$codeset,$code,$name,$currID,$noprint) = @_;
# Check to make sure that the code is defined.
if (exists $Code2ID{$codeset}{$code}) {
return _check_code_exists($type,$codeset,$code,$name,$currID);
} else {
return _check_code_new($type,$codeset,$code,$name,$currID,$noprint);
}
}
sub _check_code_exists {
my($type,$codeset,$code,$name,$currID) = @_;
# Check the currID for the code. It must be the same as the one
# passed in.
my $oldID = $Code2ID{$codeset}{$code}[0];
if ($currID != $oldID) {
print "ERROR [$type]: ID mismatch in code: [$codeset, $name, $code, $currID != $oldID ]\n";
return 1;
}
# If the name is defined, it must be the same ID. If it is not,
# create a new alias.
if (exists $Alias{lc($name)}) {
my $altID = $Alias{lc($name)}[0];
if ($currID != $altID) {
print "ERROR [$type]: ID mismatch: [$codeset, $name, $code, $currID != $altID ]\n";
return 1;
}
} else {
push @{ $ID2Names{$currID} },$name;
my $i = $#{ $ID2Names{$currID} };
$Alias{lc($name)} = [ $currID, $i ];
}
return 0;
}
# This is a new code.
sub _check_code_new {
my($type,$codeset,$code,$name,$newID,$noprint) = @_;
print "INFO [$type]: New code: $codeset [$code] => $name\n" unless ($noprint);
# If this code's name isn't defined, create it.
my $i;
if (exists $Alias{lc($name)}) {
$i = $Alias{lc($name)}[1];
} else {
push @{ $ID2Names{$newID} },$name;
$i = $#{ $ID2Names{$newID} };
$Alias{lc($name)} = [ $newID, $i ];
}
# This name is the canonical name for the code.
$ID2Code{$codeset}{$newID} = $code;
$Code2ID{$codeset}{$code} = [ $newID, $i ];
return 0;
}
########################################
sub _get_ID {
my($op,$type,$name,$no_create) = @_;
my $type_alias = _type_hashes($op);
my($currID,$i,$t);
if (exists $Alias{lc($name)}) {
# The element is the same name as one previously defined
($currID,$i) = @{ $Alias{lc($name)} };
$t = "same";
} elsif (exists $$type_alias{$name}) {
# It's a new alias for an existing element
my $c = $$type_alias{$name};
if (! exists $Alias{lc($c)}) {
print "WARNING [$op,$type]: alias referenced before it is defined: $name => $c\n";
return (1);
}
$currID = $Alias{lc($c)}[0];
push @{ $ID2Names{$currID} },$name;
$i = $#{ $ID2Names{$currID} };
$Alias{lc($name)} = [ $currID, $i ];
$t = "alias";
} else {
# It's a new element.
if ($no_create) {
return(0,-1,-1,"new");
}
$currID = $ID++;
$i = 0;
$ID2Names{$currID} = [ $name ];
$Alias{lc($name)} = [ $currID, $i ];
$t = "new";
}
return(0,$currID,$i,$t);
}
# This takes a list of codes and names and checks to see if we've got
# an ID for this element, or if it is a new element.
#
# If $second is non-zero, then this is the second (or more) codeset of
# a given type and we are expected to always have an element to match
# with, or that it is flagged in the data files as a known new value.
# This can be overridden if $allow is non-zero.
#
sub _get_ID_new {
my($type,$src,$second,$allow,$codes,$names) = @_;
my($id,$subid) = ('','');
#
# Check each of the names to see if it's been previously defined.
#
NAME:
foreach my $name (@$names) {
#
# If we've already used this name before, it'll be defined in
# %Alias. Make sure that the ID is the same for all names assigned
# to this element.
#
if (exists $Alias{lc($name)}) {
my $i = $Alias{lc($name)}[0];
if ($id && $i ne $id) {
print "WARNING [$type,$src]: " .
"name refers to multiple elements: $name => $id,$i\n";
return (1);
}
$id = $i;
next NAME;
}
#
# If we've already got an ID, or if this is the first standard
# read in, then this is just a new alias.
#
next NAME if ($id || ! $second || $allow);
#
# If this is a totally new name, then we need to have explicitly
# allow it.
#
if (! exists $Data{$type}{$src}{'new'}{$name} &&
! exists $Data{$type}{$src}{'orig'}{'name'}) {
print "WARNING [$type,$src]: " .
"new name not allowed: $name\n";
return (1);
}
}
#
# If any of the codes entered here are already defined in another
# data source, make sure they are consistent. In general, if a
# codeset only comes from a single source, this should not be a
# problem.
#
foreach my $codeset (keys %$codes) {
my $code = $$codes{$codeset};
if (exists $Code2ID{$codeset}{$code}) {
my($i,$s) = @{ $Code2ID{$codeset}{$code} };
if ($id && $i ne $id) {
print "WARNING [$type,$src,$codeset]: " .
"code refers to multiple elements: $code => $id,$i\n";
return (1);
}
($id,$subid) = ($i,$s);
}
}
#
# If it's a new name for an existing element, add each of the names
# to %Alias.
#
if ($id) {
my $name = $$names[0];
if (exists $Alias{lc($name)}) {
$subid = $Alias{lc($name)}[1];
} else {
push @{ $ID2Names{$id} },$name;
$subid = $#{ $ID2Names{$id} };
$Alias{lc($name)} = [ $id, $subid ];
}
foreach $name (@$names) {
if (! exists $Alias{lc($name)}) {
push @{ $ID2Names{$id} },$name;
my $s = $#{ $ID2Names{$id} };
$Alias{lc($name)} = [ $id, $s ];
}
}
}
#
# If it's a new element, create it and all aliases.
#
if (! $id) {
$id = $ID++;
$subid = 0;
$ID2Names{$id} = [ @$names ];
my $sid = $subid;
foreach my $name (@$names) {
$Alias{lc($name)} = [ $id, $sid++ ];
}
}
return(0,$id,$subid);
}
############################################################################
# DO_ALIASES
############################################################################
sub do_aliases {
my($caller) = @_;
my ($type_alias) = _type_hashes($caller);
# Add remaining aliases.
foreach my $alias (keys %$type_alias) {
my $type = $$type_alias{$alias};
next if (exists $Alias{lc($type)} &&
exists $Alias{lc($alias)});
if (! exists $Alias{lc($type)} &&
! exists $Alias{lc($alias)}) {
print "WARNING: unused type in alias list: $type\n";
print "WARNING: unused type in alias list: $alias\n";
next;
}
my ($typeID);
if (exists $Alias{lc($type)}) {
$typeID = $Alias{lc($type)}[0];
$type = $alias;
} else {
$typeID = $Alias{lc($alias)}[0];
}
push @{ $ID2Names{$typeID} },$type;
my $i = $#{ $ID2Names{$typeID} };
$Alias{lc($type)} = [ $typeID, $i ];
}
}
############################################################################
# WRITE_MODULE
############################################################################
sub write_module {
my($type) = @_;
my(%hashes) = ("id2names" => "ID2Names",
"alias2id" => "Alias",
"code2id" => "Code2ID",
"id2code" => "ID2Code");
my $file = "$ModDir/${Module}_Codes.pm";
my $out = new IO::File;
$out->open(">$file");
binmode $out, ":encoding(UTF-8)";
my $timestamp = `date`;
chomp($timestamp);
print $out "package #
Locale::Codes::${Module}_Codes;
# This file was automatically generated. Any changes to this file will
# be lost the next time 'harvest_data' is run.
# Generated on: $timestamp
use strict;
require 5.006;
use warnings;
use utf8;
our(\$VERSION);
\$VERSION='3.53';
\$Locale::Codes::Data{'$type'}{'id'} = '$ID';
";
foreach my $h (qw(id2names alias2id code2id id2code)) {
my $hash = $hashes{$h};
print $out "\$Locale::Codes::Data{'$type'}{'$h'} = {\n";
_write_hash($out,$hash);
print $out "};\n\n";
}
print $out "1;\n";
$out->close();
}
sub _write_hash {
my($out,$hashname) = @_;
no strict 'refs';
my %hash = %$hashname;
use strict 'refs';
_write_subhash($out,3,\%hash);
}
sub _write_subhash {
my($out,$indent,$hashref) = @_;
my %hash = %$hashref;
my $ind = " "x$indent;
foreach my $key (sort keys %hash) {
my $val = $hash{$key};
if (ref($val) eq "HASH") {
print $out "${ind}q($key) => {\n";
_write_subhash($out,$indent+3,$val);
print $out "${ind} },\n";
} elsif (ref($val) eq "ARRAY") {
print $out "${ind}q($key) => [\n";
_write_sublist($out,$indent+3,$val);
print $out "${ind} ],\n";
} else {
print $out "${ind}q($key) => q($val),\n";
}
}
}
sub _write_sublist {
my($out,$indent,$listref) = @_;
my @list = @$listref;
my $ind = " "x$indent;
foreach my $val (@list) {
if (ref($val) eq "HASH") {
print $out "${ind}{\n";
_write_subhash($out,$indent+3,$val);
print $out "${ind}},\n";
} elsif (ref($val) eq "ARRAY") {
print $out "${ind}[\n";
_write_sublist($out,$indent+3,$val);
print $out "${ind}],\n";
} else {
print $out "${ind}q($val),\n";
}
}
}
############################################################################
# HANDLE CODESET
############################################################################
sub _read_file {
my(%opts) = @_;
#
# Get the URL
#
# The temporary file
my $file; # _init_country_iso
if (exists $opts{'local'}) {
$file = $opts{'local'};
} else {
$file = (caller(1))[3];
$file =~ s/main:://;
}
# The type of file
my $type = $opts{'type'};
$type = 'text' if (! $type);
my $file2 = '';
if ($type eq 'html') {
$file .= ".htm";
} elsif ($type eq 'xls') {
$file .= ".xls";
} elsif ($type eq 'xlsx') {
$file .= ".xlsx";
} elsif ($type eq 'zip') {
$file2 = "$file.txt";
$file .= ".zip";
} else {
$file .= ".txt";
}
# Get the file
if ($type eq 'manual') {
while (! -f $file) {
my $inst = $opts{'inst'};
print $inst,"\n";
print "Put the data into the file:\n";
print " $file\n";
print "Strip out any leading/trailing blank lines.\n\n";
print "Press any key to continue...\n";
my $c = getone();
}
} else {
my $url = $opts{'url'};
system("wget -N -q --no-check-certificate -O $file '$url'");
}
#
# Read the local file
#
my(@in);
if ($type eq 'xls') {
#
# Read an XLS file
#
my $csv = $file;
$csv =~ s/.xls/.csv/;
# New command
my $cmd = "xls2csv.py $file > $csv; dos2unix $csv";
system($cmd);
@in = `cat $csv`;
chomp(@in);
if ($opts{'head'}) {
my $head = $opts{'head'};
while ($in[0] !~ /$head/) {
shift(@in);
}
}
# The first line (headers) must have the correct number of fields.
my $n = _csv_count_columns($in[0]);
if ($opts{'join'}) {
# Some CSV files have newlines in the value. This looks
# for lines without the correct number of fields. When found,
# the following line is joined to it.
my @tmp;
LINE:
while (@in) {
my $line = shift(@in);
while (1) {
my $nn = _csv_count_columns($line);
if ($nn == $n) {
push(@tmp,$line);
next LINE;
} elsif ($nn > $n) {
print "ERROR: Invalid line skipped:\n$line\n";
next LINE;
} else {
$line .= " " . shift(@in);
next;
}
}
}
@in = @tmp;
}
my $in = Text::CSV::Slurp->load(string => join("\n",@in));
@in = @$in;
$opts{'as_list'} = 1; # required
} elsif ($type eq 'xlsx') {
#
# Read an XLSX file
#
my $excel = Spreadsheet::XLSX->new($file);
foreach my $sheet (@{$excel->{Worksheet}}) {
my $name = $sheet->{Name};
next if ($opts{'sheet'} && $opts{'sheet'} ne $name);
$sheet->{MaxRow} ||= $sheet->{MinRow};
foreach my $row ($sheet->{MinRow} .. $sheet->{MaxRow}) {
$sheet->{MaxCol} ||= $sheet->{MinCol};
my @row = ();
foreach my $col ($sheet->{MinCol} .. $sheet->{MaxCol}) {
my $cell = $sheet->{Cells}[$row][$col];
my $val = $cell->{Val} if ($cell);
$val = '' if (! defined $val);
push(@row,"\"$val\"");
}
push(@in,join(',',@row) . "\n");
}
}
} elsif ($type eq 'zip') {
#
# Read one file in a zip file
#
my $zip = Archive::Zip->new($file);
my @file = grep /$opts{'file'}/,$zip->memberNames();
my $flag = $zip->extractMember($file[0],$file2);
if (! defined($flag)) {
die "ERROR [iso]: zip file changed format\n";
}
@in = `cat $file2`;
} elsif ($opts{'encoding'}) {
#
# Read an encoded text file
#
open(my $in,"<:encoding($opts{encoding})",$file);
@in = <$in>;
close($in);
} else {
#
# Read an ASCII text file
#
@in = `cat $file`;
}
chomp(@in);
chop(@in) if ($opts{'chop'});
#
# If it was encoded, make sure it's in UTF-8
#
if ($opts{'encoding'} && $opts{'encoding'} ne 'UTF-8') {
my $in = join("\n",@in);
$in = encode('UTF-8',$in);
@in = split("\n",$in);
}
#
# Strip out some problem strings.
#
if ($opts{'html_strip'} || $opts{'html_repl'}) {
my $in = join("\n",@in);
strip_tags(\$in,@{ $opts{'html_strip'} }) if ($opts{'html_strip'});
if ($opts{'html_repl'}) {
foreach my $repl (@{ $opts{'html_repl'} }) {
if (ref($repl)) {
$in =~ s/$repl/ /sg;
} else {
$in =~ s/\Q$repl\E/ /sg;
}
}
$in =~ s/\s+/ /sg;
}
@in = split("\n",$in);
}
#
# Return the contents of the file as a list or a string.
#
if ($opts{'as_list'}) {
return \@in;
} else {
return join(" ",@in);
}
}
sub _csv_count_columns {
my($line) = @_;
my $c = 0; # Number of commas found
while ($line) {
# "Value"
# "Value\n continued"
if ($line =~ /^"/) {
$line =~ s/^".*?($|")//;
} else {
$line =~ s/^[^,]*//;
}
$c++ if ($line =~ s/^,//);
}
return $c+1;
}
{
my $second; # This will be set to 1 once the first set is read in.
# This reads a source of data containing one or more code sets of
# a given type.
#
# $type The type of codesets being input (country, language, etc.)
# $src The label for this source of data
# $codesets A listref of code sets that are included in this data
# source. The order is important. It tells what order the
# data is stored in the data source. A data source may
# include data sets for which it is not the standard, and
# these will be used simply to match with existing elements.
# Element names (and links) will be determined using all
# sources, but codes will only be added from codesets for
# which a source is listed as a standard.
# $stdcodesets A listref of code sets. This is the subset of $codesets
# for which this source is the standard. The first time a
# codeset it read in, it must be from a standard. Multiple
# standards can be used (and the data from them will be
# merged) but all standards should be read before other
# sources are read.
# $allow This source is allowed to add new codes without explicit
# allows. This only applies to the second or higher source.
#
sub _do_codeset {
my($type,$src,$codesets,$stdcodesets,$allow) = @_;
$allow = 0 if (! $allow);
if (! defined $second) {
$second = 0;
} else {
$second = 1;
}
my %std = map { $_,1 } @$stdcodesets;
#
# The _init_TYPE_CAT function gets all of the data from
# this source and puts it in some sort of list.
#
# The _read_TYPE_CAT function reads one element from that list.
#
no strict 'refs';
my $func = "_init_${type}_${src}";
&$func();
$func = "_read_${type}_${src}";
ELE:
while (1) {
#
# Read the next element.
#
# Output is (CODE1, CODE2, ... CODEN, NAME1, NAME2, ... NAMEM)
#
# The order of the codes is specified by $codesets.
#
my @ele = &$func();
last if (! @ele);
#
# Store the codes in %codes
# %codes = ( CODESET => CODE )
# If CODE is blank, it is quietly ignored.
#
# A code is also ignored if it is in the 'ignore' list. If a name
# is ignored, the entire element is skipped.
#
my (%codes,@names);
foreach my $codeset (@$codesets) {
my $code = shift(@ele);
next if (! defined($code) ||
$code eq '' ||
exists $Data{$type}{$src}{'ignore'}{$codeset}{$code});
$codes{$codeset} = $code;
}
foreach my $name (@ele) {
if ($name) {
next ELE if (exists $Data{$type}{$src}{'ignore'}{'name'}{$name});
push(@names,$name);
}
}
next if (! @names && ! %codes);
if (! @names) {
my @codes = sort values(%codes);
print "WARNING [$type,$src]: Codes with no name: @codes\n";
next;
}
if (! %codes) {
print "WARNING [$type,$src]: Element with no codes: @names\n";
next;
}
#
# Some codes and/or element names must be rewritten (probably
# to remove non-ASCII characters, but other reasons also
# occur). If a name appears as both ASCII and non-ASCII,
# make sure it isn't duplicated)
#
foreach my $codeset (sort keys %codes) {
my $code = $codes{$codeset};
if (exists $Data{$type}{$src}{'orig'}{$codeset}{$code}) {
$codes{$codeset} = $Data{$type}{$src}{'orig'}{$codeset}{$code};
}
}
my(%tmp,@tmp);
foreach my $name (@names) {
if (exists $Data{$type}{$src}{'orig'}{'name'}{$name}) {
$name = $Data{$type}{$src}{'orig'}{'name'}{$name};
}
next if (exists $tmp{$name});
$tmp{$name} = 1;
push(@tmp,$name);
}
@names = @tmp;
#
# Check that everything is ASCII
#
foreach my $codeset (sort keys %codes) {
my $code = $codes{$codeset};
_ascii_new($type,$src,$codeset,$code);
}
foreach my $name (@names) {
_ascii_new($type,$src,'name',$name);
}
#
# Get the ID for the current element
#
my($err,$id,$subid) = _get_ID_new($type,$src,$second,$allow,
\%codes,\@names);
next if ($err);
#
# Store the codes (but only if we're reading a standard). If we're
# not reading from a standard, we'll check to see if this would have
# been a new code, and warn if it was.
#
foreach my $codeset (keys %codes) {
my $code = $codes{$codeset};
if ($std{$codeset}) {
$Code2ID{$codeset}{$code} = [ $id, $subid ];
$ID2Code{$codeset}{$id} = $code;
} elsif (! exists $Code2ID{$codeset}{$code}) {
print "WARNING [$type,$src,$codeset]: " .
"new code not added from a non-standard source: $code\n";
}
}
}
#
# Update %Alias with the values in $Data{TYPE}{'link'}.
#
my @tmp;
LINKS:
foreach my $links (@{ $Data{$type}{'link'} }) {
# Check to see if any of the names in a link group are defined
# in %Alias. If any are, they must have the same ID.
my $id;
foreach my $link (@$links) {
if (exists $Alias{lc($link)}) {
my $i = $Alias{lc($link)}[0];
if ($id && $i != $id) {
print "WARNING [$type,$src]: " .
"alias refers to multiple elements: $link\n";
next LINKS;
}
$id = $i;
}
}
# If any are defined, add all the rest to %Alias with the same
# ID. Otherwise, save this link group for later.
if ($id) {
foreach my $name (@$links) {
if (! exists $Alias{lc($name)}) {
push @{ $ID2Names{$id} },$name;
my $subid = $#{ $ID2Names{$id} };
$Alias{lc($name)} = [ $id, $subid ];
}
}
} else {
push(@tmp,$links);
}
}
$Data{$type}{'link'} = \@tmp;
}
}
sub _ascii_new {
my($type,$src,$key,$val) = @_;
if ($val !~ /^[[:ascii:]]*$/) {
my $tmp = $val;
$tmp =~ s/[[:ascii:]]//g;
print "NON-ASCII [$type,$src,$key]: '$val' [$tmp]\n";
}
}
############################################################################
# HTML SCRAPING
############################################################################
sub get_row {
my($type,$inref) = @_;
return () if ($$inref !~ m,^\s*<tr,);
if ($$inref !~ s,^(.*?)</tr[^>]*>,,) {
die "ERROR [$type]: malformed HTML\n";
}
my $row = $1;
if ($row =~ m,<table,) {
die "ERROR [$type]: embedded table\n";
}
my @row;
while ($row =~ s,(?:.*?)<(td|th)[^>]*>\s*(.*?)\s*</\1[^>]*>,,) {
my $val = $2;
push(@row,$val);
}
return @row;
}
# If nested is non-zero, then the header row has a table nested in each column
# and we're looking for $header somewhere in that nested table.
#
sub jump_to_row {
my($inref,$header,$nested) = @_;
if ($nested) {
my $err;
return 0
if ($$inref !~ s,^(.*?)\Q$header\E(.*?)</table[^>]*>\s*</td[^>]*>\s*,,);
while ($$inref =~ m,^<td,) {
$err = strip_entry($inref);
return 0 if ($err);
}
return 0 if ($$inref !~ s,^\s*</tr[^>]*>,,);
return 1;
}
if ($$inref =~ s,^(.*?)\Q$header\E(.*?)</tr[^>]*>\s*(?=<tr),,) {
return 1;
} else {
return 0;
}
}
sub jump_to_entry {
my($inref,$value) = @_;
if ($$inref =~ s,(.*?)(?=<(?:td|th)[^>]*>\s*\Q$value\E\s*),,) {
return 1;
} else {
return 0;
}
}
sub jump_to_table {
my($inref) = @_;
if ($$inref =~ s,(.*?)(?=<table),,) {
return 1;
} else {
return 0;
}
}
sub get_entry {
my($inref) = @_;
if ($$inref =~ s,.*?<td[^>]*>\s*(.*?)\s*</td[^>]*>,,) {
return $1;
}
return "";
}
sub strip_tags {
my($inref,@tags) = @_;
foreach my $tag (@tags) {
$$inref =~ s,</?$tag[^>]*>, ,g;
}
}
sub strip_token {
my($inref) = @_;
$$inref =~ s,^\s*,,;
if ($$inref =~ s,^</([^>]*)>,,) {
my $tag = $1;
$tag =~ s,\s.*$,,;
return ('close',$tag);
} elsif ($$inref =~ s,^<([^>]*)>,,) {
my $tag = $1;
$tag =~ s,\s.*$,,;
return ('open',$tag);
} else {
$$inref =~ s,^([^<]*),,;
my $val = $1;
$val =~ s,\s*$,,;
return ('val',$val);
}
}
# Strip an entire portion of HTML. If the HTML starts with
# <TAG>
# it will strip everything up to the matching
# </TAG>
# correctly handling nested elements.
#
sub strip_entry {
my($inref) = @_;
my(@tag);
while (1) {
my($op,$val) = strip_token($inref);
if ($op eq 'open') {
push(@tag,$val);
next;
} elsif ($op eq 'close') {
my $old = pop(@tag);
if ($old ne $val) {
return 1;
}
last if (! @tag);
} else {
last if (! @tag);
next;
}
}
return 0;
}
###############################################################################
BEGIN {
use POSIX qw(:termios_h);
my $fd_stdin = fileno(STDIN);
my $term = POSIX::Termios->new();
$term->getattr($fd_stdin);
my $oterm = $term->getlflag();
my $echo = ECHO | ECHOK | ICANON;
my $noecho = $oterm & ~$echo;
sub cbreak {
$term->setlflag($noecho);
$term->setcc(VTIME, 1);
$term->setattr($fd_stdin, TCSANOW);
}
sub cooked {
$term->setlflag($oterm);
$term->setcc(VTIME, 0);
$term->setattr($fd_stdin, TCSANOW);
}
sub getone {
my $key = '';
cbreak();
sysread(STDIN, $key, 1);
cooked();
return $key;
}
}
END { cooked() }
# Local Variables:
# mode: cperl
# indent-tabs-mode: nil
# cperl-indent-level: 3
# cperl-continued-statement-offset: 2
# cperl-continued-brace-offset: 0
# cperl-brace-offset: 0
# cperl-brace-imaginary-offset: 0
# cperl-label-offset: 0
# End: