#
#
{
my $in;
sub _init_langfam_iso {
$in = _read_file('url' => $langfam_iso_url,
'type' => 'html',
'as_list' => 0,
'html_strip' => [ qw(br p strong div) ],
'html_repl' => [ qw( ) ],
);
# Look for a table who's first row has the header:
# Identifier
my $found = jump_to_row(\$in,"Identifier");
if (! $found) {
die "ERROR [iso]: language family code file format changed!\n";
}
}
sub _read_langfam_iso {
while (1) {
my @row = get_row("iso",\$in);
return () if (! @row);
my($alpha,$langfam) = @row;
return () if ($alpha =~ /class="loweralpha"/);
if (! $alpha || ! $langfam) {
$alpha = '' if (! $alpha);
$langfam = '' if (! $langfam);
print "WARNING [iso]: Invalid langfam code: $langfam => $alpha\n";
next;
}
$alpha = lc($alpha);
if ($alpha !~ /^[a-z][a-z][a-z]$/) {
print "WARNING [iso]: Invalid alpha code: $langfam => $alpha\n";
next;
}
return($alpha,$langfam);
}
}
}
############################################################################
# PRINT_TABLE
############################################################################
sub _type_hashes {
my($caller) = @_;
return($Data{$caller}{'alias'});
}
############################################################################
# CHECK CODES
############################################################################
sub check_code {
my($type,$codeset,$code,$name,$currID,$noprint) = @_;
# Check to make sure that the code is defined.
if (exists $Code2ID{$codeset}{$code}) {
return _check_code_exists($type,$codeset,$code,$name,$currID);
} else {
return _check_code_new($type,$codeset,$code,$name,$currID,$noprint);
}
}
sub _check_code_exists {
my($type,$codeset,$code,$name,$currID) = @_;
# Check the currID for the code. It must be the same as the one
# passed in.
my $oldID = $Code2ID{$codeset}{$code}[0];
if ($currID != $oldID) {
print "ERROR [$type]: ID mismatch in code: [$codeset, $name, $code, $currID != $oldID ]\n";
return 1;
}
# If the name is defined, it must be the same ID. If it is not,
# create a new alias.
if (exists $Alias{lc($name)}) {
my $altID = $Alias{lc($name)}[0];
if ($currID != $altID) {
print "ERROR [$type]: ID mismatch: [$codeset, $name, $code, $currID != $altID ]\n";
return 1;
}
} else {
push @{ $ID2Names{$currID} },$name;
my $i = $#{ $ID2Names{$currID} };
$Alias{lc($name)} = [ $currID, $i ];
}
return 0;
}
# This is a new code.
sub _check_code_new {
my($type,$codeset,$code,$name,$newID,$noprint) = @_;
print "INFO [$type]: New code: $codeset [$code] => $name\n" unless ($noprint);
# If this code's name isn't defined, create it.
my $i;
if (exists $Alias{lc($name)}) {
$i = $Alias{lc($name)}[1];
} else {
push @{ $ID2Names{$newID} },$name;
$i = $#{ $ID2Names{$newID} };
$Alias{lc($name)} = [ $newID, $i ];
}
# This name is the canonical name for the code.
$ID2Code{$codeset}{$newID} = $code;
$Code2ID{$codeset}{$code} = [ $newID, $i ];
return 0;
}
########################################
sub _get_ID {
my($op,$type,$name,$no_create) = @_;
my $type_alias = _type_hashes($op);
my($currID,$i,$t);
if (exists $Alias{lc($name)}) {
# The element is the same name as one previously defined
($currID,$i) = @{ $Alias{lc($name)} };
$t = "same";
} elsif (exists $$type_alias{$name}) {
# It's a new alias for an existing element
my $c = $$type_alias{$name};
if (! exists $Alias{lc($c)}) {
print "WARNING [$op,$type]: alias referenced before it is defined: $name => $c\n";
return (1);
}
$currID = $Alias{lc($c)}[0];
push @{ $ID2Names{$currID} },$name;
$i = $#{ $ID2Names{$currID} };
$Alias{lc($name)} = [ $currID, $i ];
$t = "alias";
} else {
# It's a new element.
if ($no_create) {
return(0,-1,-1,"new");
}
$currID = $ID++;
$i = 0;
$ID2Names{$currID} = [ $name ];
$Alias{lc($name)} = [ $currID, $i ];
$t = "new";
}
return(0,$currID,$i,$t);
}
# This takes a list of codes and names and checks to see if we've got
# an ID for this element, or if it is a new element.
#
# If $second is non-zero, then this is the second (or more) codeset of
# a given type and we are expected to always have an element to match
# with, or that it is flagged in the data files as a known new value.
# This can be overridden if $allow is non-zero.
#
sub _get_ID_new {
my($type,$src,$second,$allow,$codes,$names) = @_;
my($id,$subid) = ('','');
#
# Check each of the names to see if it's been previously defined.
#
NAME:
foreach my $name (@$names) {
#
# If we've already used this name before, it'll be defined in
# %Alias. Make sure that the ID is the same for all names assigned
# to this element.
#
if (exists $Alias{lc($name)}) {
my $i = $Alias{lc($name)}[0];
if ($id && $i ne $id) {
print "WARNING [$type,$src]: " .
"name refers to multiple elements: $name => $id,$i\n";
return (1);
}
$id = $i;
next NAME;
}
#
# If we've already got an ID, or if this is the first standard
# read in, then this is just a new alias.
#
next NAME if ($id || ! $second || $allow);
#
# If this is a totally new name, then we need to have explicitly
# allow it.
#
if (! exists $Data{$type}{$src}{'new'}{$name} &&
! exists $Data{$type}{$src}{'orig'}{'name'}) {
print "WARNING [$type,$src]: " .
"new name not allowed: $name\n";
return (1);
}
}
#
# If any of the codes entered here are already defined in another
# data source, make sure they are consistent. In general, if a
# codeset only comes from a single source, this should not be a
# problem.
#
foreach my $codeset (keys %$codes) {
my $code = $$codes{$codeset};
if (exists $Code2ID{$codeset}{$code}) {
my($i,$s) = @{ $Code2ID{$codeset}{$code} };
if ($id && $i ne $id) {
print "WARNING [$type,$src,$codeset]: " .
"code refers to multiple elements: $code => $id,$i\n";
return (1);
}
($id,$subid) = ($i,$s);
}
}
#
# If it's a new name for an existing element, add each of the names
# to %Alias.
#
if ($id) {
my $name = $$names[0];
if (exists $Alias{lc($name)}) {
$subid = $Alias{lc($name)}[1];
} else {
push @{ $ID2Names{$id} },$name;
$subid = $#{ $ID2Names{$id} };
$Alias{lc($name)} = [ $id, $subid ];
}
foreach $name (@$names) {
if (! exists $Alias{lc($name)}) {
push @{ $ID2Names{$id} },$name;
my $s = $#{ $ID2Names{$id} };
$Alias{lc($name)} = [ $id, $s ];
}
}
}
#
# If it's a new element, create it and all aliases.
#
if (! $id) {
$id = $ID++;
$subid = 0;
$ID2Names{$id} = [ @$names ];
my $sid = $subid;
foreach my $name (@$names) {
$Alias{lc($name)} = [ $id, $sid++ ];
}
}
return(0,$id,$subid);
}
############################################################################
# DO_ALIASES
############################################################################
sub do_aliases {
my($caller) = @_;
my ($type_alias) = _type_hashes($caller);
# Add remaining aliases.
foreach my $alias (keys %$type_alias) {
my $type = $$type_alias{$alias};
next if (exists $Alias{lc($type)} &&
exists $Alias{lc($alias)});
if (! exists $Alias{lc($type)} &&
! exists $Alias{lc($alias)}) {
print "WARNING: unused type in alias list: $type\n";
print "WARNING: unused type in alias list: $alias\n";
next;
}
my ($typeID);
if (exists $Alias{lc($type)}) {
$typeID = $Alias{lc($type)}[0];
$type = $alias;
} else {
$typeID = $Alias{lc($alias)}[0];
}
push @{ $ID2Names{$typeID} },$type;
my $i = $#{ $ID2Names{$typeID} };
$Alias{lc($type)} = [ $typeID, $i ];
}
}
############################################################################
# WRITE_MODULE
############################################################################
sub write_module {
my($type) = @_;
my(%hashes) = ("id2names" => "ID2Names",
"alias2id" => "Alias",
"code2id" => "Code2ID",
"id2code" => "ID2Code");
my $file = "$ModDir/${Module}_Codes.pm";
my $out = new IO::File;
$out->open(">$file");
binmode $out, ":encoding(UTF-8)";
my $timestamp = `date`;
chomp($timestamp);
print $out "package #
Locale::Codes::${Module}_Codes;
# This file was automatically generated. Any changes to this file will
# be lost the next time 'harvest_data' is run.
# Generated on: $timestamp
use strict;
require 5.006;
use warnings;
use utf8;
our(\$VERSION);
\$VERSION='3.51';
\$Locale::Codes::Data{'$type'}{'id'} = '$ID';
";
foreach my $h (qw(id2names alias2id code2id id2code)) {
my $hash = $hashes{$h};
print $out "\$Locale::Codes::Data{'$type'}{'$h'} = {\n";
_write_hash($out,$hash);
print $out "};\n\n";
}
print $out "1;\n";
$out->close();
}
sub _write_hash {
my($out,$hashname) = @_;
no strict 'refs';
my %hash = %$hashname;
use strict 'refs';
_write_subhash($out,3,\%hash);
}
sub _write_subhash {
my($out,$indent,$hashref) = @_;
my %hash = %$hashref;
my $ind = " "x$indent;
foreach my $key (sort keys %hash) {
my $val = $hash{$key};
if (ref($val) eq "HASH") {
print $out "${ind}q($key) => {\n";
_write_subhash($out,$indent+3,$val);
print $out "${ind} },\n";
} elsif (ref($val) eq "ARRAY") {
print $out "${ind}q($key) => [\n";
_write_sublist($out,$indent+3,$val);
print $out "${ind} ],\n";
} else {
print $out "${ind}q($key) => q($val),\n";
}
}
}
sub _write_sublist {
my($out,$indent,$listref) = @_;
my @list = @$listref;
my $ind = " "x$indent;
foreach my $val (@list) {
if (ref($val) eq "HASH") {
print $out "${ind}{\n";
_write_subhash($out,$indent+3,$val);
print $out "${ind}},\n";
} elsif (ref($val) eq "ARRAY") {
print $out "${ind}[\n";
_write_sublist($out,$indent+3,$val);
print $out "${ind}],\n";
} else {
print $out "${ind}q($val),\n";
}
}
}
############################################################################
# HANDLE CODESET
############################################################################
sub _read_file {
my(%opts) = @_;
#
# Get the URL
#
# The temporary file
my $file; # _init_country_iso
if (exists $opts{'local'}) {
$file = $opts{'local'};
} else {
$file = (caller(1))[3];
$file =~ s/main:://;
}
# The type of file
my $type = $opts{'type'};
$type = 'text' if (! $type);
my $file2 = '';
if ($type eq 'html') {
$file .= ".htm";
} elsif ($type eq 'xls') {
$file .= ".xls";
} elsif ($type eq 'xlsx') {
$file .= ".xlsx";
} elsif ($type eq 'zip') {
$file2 = "$file.txt";
$file .= ".zip";
} else {
$file .= ".txt";
}
# Get the file
if ($type eq 'manual') {
while (! -f $file) {
my $inst = $opts{'inst'};
print $inst,"\n";
print "Put the data into the file:\n";
print " $file\n";
print "Strip out any leading/trailing blank lines.\n\n";
print "Press any key to continue...\n";
my $c = getone();
}
} else {
my $url = $opts{'url'};
system("wget -N -q --no-check-certificate -O $file '$url'");
}
#
# Read the local file
#
my(@in);
if ($type eq 'xls') {
#
# Read an XLS file
#
my $csv = $file;
$csv =~ s/.xls/.csv/;
# New command
my $cmd = "xls2csv.py $file > $csv; dos2unix $csv";
system($cmd);
@in = `cat $csv`;
chomp(@in);
if ($opts{'head'}) {
my $head = $opts{'head'};
while ($in[0] !~ /$head/) {
shift(@in);
}
}
# The first line (headers) must have the correct number of fields.
my $n = _csv_count_columns($in[0]);
if ($opts{'join'}) {
# Some CSV files have newlines in the value. This looks
# for lines without the correct number of fields. When found,
# the following line is joined to it.
my @tmp;
LINE:
while (@in) {
my $line = shift(@in);
while (1) {
my $nn = _csv_count_columns($line);
if ($nn == $n) {
push(@tmp,$line);
next LINE;
} elsif ($nn > $n) {
print "ERROR: Invalid line skipped:\n$line\n";
next LINE;
} else {
$line .= " " . shift(@in);
next;
}
}
}
@in = @tmp;
}
my $in = Text::CSV::Slurp->load(string => join("\n",@in));
@in = @$in;
$opts{'as_list'} = 1; # required
} elsif ($type eq 'xlsx') {
#
# Read an XLSX file
#
my $excel = Spreadsheet::XLSX->new($file);
foreach my $sheet (@{$excel->{Worksheet}}) {
my $name = $sheet->{Name};
next if ($opts{'sheet'} && $opts{'sheet'} ne $name);
$sheet->{MaxRow} ||= $sheet->{MinRow};
foreach my $row ($sheet->{MinRow} .. $sheet->{MaxRow}) {
$sheet->{MaxCol} ||= $sheet->{MinCol};
my @row = ();
foreach my $col ($sheet->{MinCol} .. $sheet->{MaxCol}) {
my $cell = $sheet->{Cells}[$row][$col];
my $val = $cell->{Val} if ($cell);
$val = '' if (! defined $val);
push(@row,"\"$val\"");
}
push(@in,join(',',@row) . "\n");
}
}
} elsif ($type eq 'zip') {
#
# Read one file in a zip file
#
my $zip = Archive::Zip->new($file);
my @file = grep /$opts{'file'}/,$zip->memberNames();
my $flag = $zip->extractMember($file[0],$file2);
if (! defined($flag)) {
die "ERROR [iso]: zip file changed format\n";
}
@in = `cat $file2`;
} elsif ($opts{'encoding'}) {
#
# Read an encoded text file
#
open(my $in,"<:encoding($opts{encoding})",$file);
@in = <$in>;
close($in);
} else {
#
# Read an ASCII text file
#
@in = `cat $file`;
}
chomp(@in);
chop(@in) if ($opts{'chop'});
#
# If it was encoded, make sure it's in UTF-8
#
if ($opts{'encoding'} && $opts{'encoding'} ne 'UTF-8') {
my $in = join("\n",@in);
$in = encode('UTF-8',$in);
@in = split("\n",$in);
}
#
# Strip out some problem strings.
#
if ($opts{'html_strip'} || $opts{'html_repl'}) {
my $in = join("\n",@in);
strip_tags(\$in,@{ $opts{'html_strip'} }) if ($opts{'html_strip'});
if ($opts{'html_repl'}) {
foreach my $repl (@{ $opts{'html_repl'} }) {
if (ref($repl)) {
$in =~ s/$repl/ /sg;
} else {
$in =~ s/\Q$repl\E/ /sg;
}
}
$in =~ s/\s+/ /sg;
}
@in = split("\n",$in);
}
#
# Return the contents of the file as a list or a string.
#
if ($opts{'as_list'}) {
return \@in;
} else {
return join(" ",@in);
}
}
sub _csv_count_columns {
my($line) = @_;
my $c = 0; # Number of commas found
while ($line) {
# "Value"
# "Value\n continued"
if ($line =~ /^"/) {
$line =~ s/^".*?($|")//;
} else {
$line =~ s/^[^,]*//;
}
$c++ if ($line =~ s/^,//);
}
return $c+1;
}
{
my $second; # This will be set to 1 once the first set is read in.
# This reads a source of data containing one or more code sets of
# a given type.
#
# $type The type of codesets being input (country, language, etc.)
# $src The label for this source of data
# $codesets A listref of code sets that are included in this data
# source. The order is important. It tells what order the
# data is stored in the data source. A data source may
# include data sets for which it is not the standard, and
# these will be used simply to match with existing elements.
# Element names (and links) will be determined using all
# sources, but codes will only be added from codesets for
# which a source is listed as a standard.
# $stdcodesets A listref of code sets. This is the subset of $codesets
# for which this source is the standard. The first time a
# codeset it read in, it must be from a standard. Multiple
# standards can be used (and the data from them will be
# merged) but all standards should be read before other
# sources are read.
# $allow This source is allowed to add new codes without explicit
# allows. This only applies to the second or higher source.
#
sub _do_codeset {
my($type,$src,$codesets,$stdcodesets,$allow) = @_;
$allow = 0 if (! $allow);
if (! defined $second) {
$second = 0;
} else {
$second = 1;
}
my %std = map { $_,1 } @$stdcodesets;
#
# The _init_TYPE_CAT function gets all of the data from
# this source and puts it in some sort of list.
#
# The _read_TYPE_CAT function reads one element from that list.
#
no strict 'refs';
my $func = "_init_${type}_${src}";
&$func();
$func = "_read_${type}_${src}";
ELE:
while (1) {
#
# Read the next element.
#
# Output is (CODE1, CODE2, ... CODEN, NAME1, NAME2, ... NAMEM)
#
# The order of the codes is specified by $codesets.
#
my @ele = &$func();
last if (! @ele);
#
# Store the codes in %codes
# %codes = ( CODESET => CODE )
# If CODE is blank, it is quietly ignored.
#
# A code is also ignored if it is in the 'ignore' list. If a name
# is ignored, the entire element is skipped.
#
my (%codes,@names);
foreach my $codeset (@$codesets) {
my $code = shift(@ele);
next if (! defined($code) ||
$code eq '' ||
exists $Data{$type}{$src}{'ignore'}{$codeset}{$code});
$codes{$codeset} = $code;
}
foreach my $name (@ele) {
if ($name) {
next ELE if (exists $Data{$type}{$src}{'ignore'}{'name'}{$name});
push(@names,$name);
}
}
next if (! @names && ! %codes);
if (! @names) {
my @codes = sort values(%codes);
print "WARNING [$type,$src]: Codes with no name: @codes\n";
next;
}
if (! %codes) {
print "WARNING [$type,$src]: Element with no codes: @names\n";
next;
}
#
# Some codes and/or element names must be rewritten (probably
# to remove non-ASCII characters, but other reasons also
# occur). If a name appears as both ASCII and non-ASCII,
# make sure it isn't duplicated)
#
foreach my $codeset (sort keys %codes) {
my $code = $codes{$codeset};
if (exists $Data{$type}{$src}{'orig'}{$codeset}{$code}) {
$codes{$codeset} = $Data{$type}{$src}{'orig'}{$codeset}{$code};
}
}
my(%tmp,@tmp);
foreach my $name (@names) {
if (exists $Data{$type}{$src}{'orig'}{'name'}{$name}) {
$name = $Data{$type}{$src}{'orig'}{'name'}{$name};
}
next if (exists $tmp{$name});
$tmp{$name} = 1;
push(@tmp,$name);
}
@names = @tmp;
#
# Check that everything is ASCII
#
foreach my $codeset (sort keys %codes) {
my $code = $codes{$codeset};
_ascii_new($type,$src,$codeset,$code);
}
foreach my $name (@names) {
_ascii_new($type,$src,'name',$name);
}
#
# Get the ID for the current element
#
my($err,$id,$subid) = _get_ID_new($type,$src,$second,$allow,
\%codes,\@names);
next if ($err);
#
# Store the codes (but only if we're reading a standard). If we're
# not reading from a standard, we'll check to see if this would have
# been a new code, and warn if it was.
#
foreach my $codeset (keys %codes) {
my $code = $codes{$codeset};
if ($std{$codeset}) {
$Code2ID{$codeset}{$code} = [ $id, $subid ];
$ID2Code{$codeset}{$id} = $code;
} elsif (! exists $Code2ID{$codeset}{$code}) {
print "WARNING [$type,$src,$codeset]: " .
"new code not added from a non-standard source: $code\n";
}
}
}
#
# Update %Alias with the values in $Data{TYPE}{'link'}.
#
my @tmp;
LINKS:
foreach my $links (@{ $Data{$type}{'link'} }) {
# Check to see if any of the names in a link group are defined
# in %Alias. If any are, they must have the same ID.
my $id;
foreach my $link (@$links) {
if (exists $Alias{lc($link)}) {
my $i = $Alias{lc($link)}[0];
if ($id && $i != $id) {
print "WARNING [$type,$src]: " .
"alias refers to multiple elements: $link\n";
next LINKS;
}
$id = $i;
}
}
# If any are defined, add all the rest to %Alias with the same
# ID. Otherwise, save this link group for later.
if ($id) {
foreach my $name (@$links) {
if (! exists $Alias{lc($name)}) {
push @{ $ID2Names{$id} },$name;
my $subid = $#{ $ID2Names{$id} };
$Alias{lc($name)} = [ $id, $subid ];
}
}
} else {
push(@tmp,$links);
}
}
$Data{$type}{'link'} = \@tmp;
}
}
sub _ascii_new {
my($type,$src,$key,$val) = @_;
if ($val !~ /^[[:ascii:]]*$/) {
my $tmp = $val;
$tmp =~ s/[[:ascii:]]//g;
print "NON-ASCII [$type,$src,$key]: '$val' [$tmp]\n";
}
}
############################################################################
# HTML SCRAPING
############################################################################
sub get_row {
my($type,$inref) = @_;
return () if ($$inref !~ m,^\s*]*>,,) {
die "ERROR [$type]: malformed HTML\n";
}
my $row = $1;
if ($row =~ m,]*>\s*(.*?)\s*\1[^>]*>,,) {
my $val = $2;
push(@row,$val);
}
return @row;
}
# If nested is non-zero, then the header row has a table nested in each column
# and we're looking for $header somewhere in that nested table.
#
sub jump_to_row {
my($inref,$header,$nested) = @_;
if ($nested) {
my $err;
return 0
if ($$inref !~ s,^(.*?)\Q$header\E(.*?) ]*>\s*]*>\s*,,);
while ($$inref =~ m,^]*>,,);
return 1;
}
if ($$inref =~ s,^(.*?)\Q$header\E(.*?) | ]*>\s*(?=]*>\s*\Q$value\E\s*),,) {
return 1;
} else {
return 0;
}
}
sub jump_to_table {
my($inref) = @_;
if ($$inref =~ s,(.*?)(?=]*>\s*(.*?)\s*]*>,,) {
return $1;
}
return "";
}
sub strip_tags {
my($inref,@tags) = @_;
foreach my $tag (@tags) {
$$inref =~ s,?$tag[^>]*>, ,g;
}
}
sub strip_token {
my($inref) = @_;
$$inref =~ s,^\s*,,;
if ($$inref =~ s,^([^>]*)>,,) {
my $tag = $1;
$tag =~ s,\s.*$,,;
return ('close',$tag);
} elsif ($$inref =~ s,^<([^>]*)>,,) {
my $tag = $1;
$tag =~ s,\s.*$,,;
return ('open',$tag);
} else {
$$inref =~ s,^([^<]*),,;
my $val = $1;
$val =~ s,\s*$,,;
return ('val',$val);
}
}
# Strip an entire portion of HTML. If the HTML starts with
#
# it will strip everything up to the matching
#
# correctly handling nested elements.
#
sub strip_entry {
my($inref) = @_;
my(@tag);
while (1) {
my($op,$val) = strip_token($inref);
if ($op eq 'open') {
push(@tag,$val);
next;
} elsif ($op eq 'close') {
my $old = pop(@tag);
if ($old ne $val) {
return 1;
}
last if (! @tag);
} else {
last if (! @tag);
next;
}
}
return 0;
}
###############################################################################
BEGIN {
use POSIX qw(:termios_h);
my $fd_stdin = fileno(STDIN);
my $term = POSIX::Termios->new();
$term->getattr($fd_stdin);
my $oterm = $term->getlflag();
my $echo = ECHO | ECHOK | ICANON;
my $noecho = $oterm & ~$echo;
sub cbreak {
$term->setlflag($noecho);
$term->setcc(VTIME, 1);
$term->setattr($fd_stdin, TCSANOW);
}
sub cooked {
$term->setlflag($oterm);
$term->setcc(VTIME, 0);
$term->setattr($fd_stdin, TCSANOW);
}
sub getone {
my $key = '';
cbreak();
sysread(STDIN, $key, 1);
cooked();
return $key;
}
}
END { cooked() }
# Local Variables:
# mode: cperl
# indent-tabs-mode: nil
# cperl-indent-level: 3
# cperl-continued-statement-offset: 2
# cperl-continued-brace-offset: 0
# cperl-brace-offset: 0
# cperl-brace-imaginary-offset: 0
# cperl-label-offset: 0
# End:
|