The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package CGI::Lingua;

use warnings;
use strict;
use Carp;

use vars qw($VERSION);
our $VERSION = '0.51';

=head1 NAME

CGI::Lingua - Create a multilingual web page

=head1 VERSION

Version 0.51

=cut

=head1 SYNOPSIS

No longer does your website need to be in English only.
CGI::Lingua provides a simple basis to determine which language to display a
website. The website tells CGI::Lingua which languages it supports. Based on
that list CGI::Lingua tells the application which language the user would like
to use.

    use CGI::Lingua;
    # ...
    my $l = CGI::Lingua->new(supported => ['en', 'fr', 'en-gb', 'en-us']);
    my $language = $l->language();
    if ($language eq 'English') {
       print '<P>Hello</P>';
    } elsif($language eq 'French') {
	print '<P>Bonjour</P>';
    } else {	# $language eq 'Unknown'
	my $rl = $l->requested_language();
	print "<P>Sorry for now this page is not available in $rl.</P>";
    }
    my $c = $l->country();
    if ($c eq 'us') {
      # print contact details in the US
    } elsif ($c eq 'ca') {
      # print contact details in Canada
    } else {
      # print worldwide contact details
    }

    # ...

    use CHI;
    use CGI::Lingua;
    # ...
    my $cache = CHI->new(driver => 'File', root_dir => '/tmp/cache', namespace => 'CGI::Lingua-countries');
    my $l = CGI::Lingua->new({ supported => ['en', 'fr'], cache => $cache });

=head1 SUBROUTINES/METHODS

=head2 new

Creates a CGI::Lingua object.

Takes one mandatory parameter: a list of languages, in RFC-1766 format,
that the website supports.
Language codes are of the form primary-code [ - country-code ] e.g.
'en', 'en-gb' for English and British English respectively.

For a list of primary-codes refer to ISO-639 (e.g. 'en' for English).
For a list of country-codes refer to ISO-3166 (e.g. 'gb' for United Kingdom).

    # We support English, French, British and American English, in that order
    my $l = CGI::Lingua(supported => [('en', 'fr', 'en-gb', 'en-us')]);

Takes optional parameter cache, an object which is used to cache country
lookups.
This cache object is an object that understands get() and set() messages,
such as a L<CHI> object.

Takes an optional boolean parameter syslog, to log messages to
L<Sys::Syslog>.

Takes optional parameter logger, an object which is used for warnings
and traces.
This logger object is an object that understands warn() and trace()
messages, such as a L<Log::Log4perl> object.

Since emitting warnings from a CGI class can result in messages being lost (you
may forget to look in your server's log), or appearing to the client in
amongst HTML causing invalid HTML, it is recommended either either syslog
or logger (or both) are set.
If neither is given, L<Carp> will be used.

Takes an optional parameter dont_use_ip.  By default, if none of the
requested languages are supported, CGI::Lingua->language() looks in the IP
address for the language to use.  This may be not what you want, so use this
option to disable the feature.

=cut

sub new {
	my $proto = shift;

	my $class = ref($proto) || $proto;
	my %params = (ref($_[0]) eq 'HASH') ? %{$_[0]} : @_;

	# TODO: check that the number of supported languages is > 0
	# unless($params{supported} && ($#params{supported} > 0)) {
		# croak('You must give a list of supported languages');
	# }
	unless($params{supported}) {
		croak('You must give a list of supported languages');
	}

	return bless {
		_supported => $params{supported}, # List of languages (two letters) that the application
		_cache => $params{cache},	# CHI
		# _rlanguage => undef,	# Requested language
		# _slanguage => undef,	# Language that the website should display
		# _sublanguage => undef,	# E.g. United States for en-US if you want American English
		# _slanguage_code_alpha2 => undef, # E.g en, fr
		# _sublanguage_code_alpha2 => undef, # E.g. us, gb
		# _country => undef,	# Two letters, e.g. gb
		# _locale => undef,	# Locale::Object::Country
		_syslog => $params{syslog},
		_dont_use_ip => $params{dont_use_ip} || 0,
		_logger => $params{logger},
		_have_ipcountry => -1,	# -1 = don't know
		_have_geoip => -1,	# -1 = don't know
	}, $class;
}

# Emit a warning message somewhere
sub _warn {
	my ($self, $params) = @_;

	my $warning = $$params{'warning'};

	return unless($warning);

	if($self->{_syslog}) {
		require Sys::Syslog;
		require CGI::Info;

		Sys::Syslog->import();
		openlog(CGI::Info->new()->script_name(), 'cons,pid', 'user');
		syslog('warning', $warning);
		closelog();
	}

	if($self->{_logger}) {
		$self->{_logger}->warn($warning);
	} elsif(!defined($self->{_syslog})) {
		carp($warning);
	}
}

=head2 language

Tells the CGI application what language to display its messages in.
The language is the natural name e.g. 'English' or 'Japanese'.

Sublanguages are handled sensibly, so that if a client requests U.S. English
on a site that only serves British English, language() will return 'English'.

If none of the requested languages is included within the supported lists,
language() returns 'Unknown'.

    use CGI::Lingua;
    # Site supports English and British English
    my $l = CGI::Lingua->new(supported => ['en', 'fr', 'en-gb']);

    # If the browser requests 'en-us' , then language will be 'English' and
    # sublanguage will be undefined because we weren't able to satisfy the
    # request

    # Site supports British English only
    my $l = CGI::Lingua->new({supported => ['fr', 'en-gb']});

    # If the browser requests 'en-us' , then language will be 'English' and
    # sublanguage will also be undefined, which may seem strange, but it
    # ensures that sites behave sensibly.

=cut

sub language {
	my $self = shift;

	unless($self->{_slanguage}) {
		$self->_find_language();
	}
	return $self->{_slanguage};
}

=head2 name

Synonym for language, for compatibility with Local::Object::Language

=cut

sub name {
	my $self = shift;

	return $self->language();
}

=head2 sublanguage

Tells the CGI what variant to use e.g. 'United Kingdom', or 'Unknown' if
it can't be determined.

Sublanguages are handled sensibly, so that if a client requests U.S. English
on a site that only serves British English, sublanguage() will return undef.

=cut

sub sublanguage {
	my $self = shift;

	unless($self->{_sublanguage}) {
		$self->_find_language();
	}
	return $self->{_sublanguage};
}

=head2 language_code_alpha2

Gives the two character representation of the supported language, e.g. 'en'
when you've asked for en-gb.

If none of the requested languages is included within the supported lists,
language_code_alpha2() returns undef.

=cut

sub language_code_alpha2 {
	my $self = shift;

	unless($self->{_slanguage_code_alpha2}) {
		$self->_find_language();
	}
	return $self->{_slanguage_code_alpha2};
}


=head2 code_alpha2

Synonym for language_code_alpha2, kept for historical reasons.

=cut

sub code_alpha2 {
	my $self = shift;

	return $self->language_code_alpha2();
}

=head2 sublanguage_code_alpha2

Gives the two character representation of the supported language, e.g. 'gb'
when you've asked for en-gb, or undef.

=cut

sub sublanguage_code_alpha2 {
	my $self = shift;

	unless($self->{_sublanguage_code_alpha2}) {
		$self->_find_language();
	}
	return $self->{_sublanguage_code_alpha2};
}


=head2 requested_language

Gives a human readable rendition of what language the user asked for whether
or not it is supported.

=cut

sub requested_language {
	my $self = shift;

	unless($self->{_rlanguage}) {
		$self->_find_language();
	}
	return $self->{_rlanguage};
}

sub _find_language {
	my $self = shift;

	$self->{_rlanguage} = 'Unknown';
	$self->{_slanguage} = 'Unknown';

	require Locale::Object::Country;
	Locale::Object::Country->import();

	# Use what the client has said
	if($ENV{'HTTP_ACCEPT_LANGUAGE'}) {
		require I18N::AcceptLanguage;
		require Locale::Language;

		I18N::AcceptLanguage->import();
		Locale::Language->import();

		# Workaround for RT 74338
		local $SIG{__WARN__} = sub {
			if($_[0] !~ /^Use of uninitialized value/) {
				warn $_[0];
			}
		};
		my $l = I18N::AcceptLanguage->new()->accepts($ENV{'HTTP_ACCEPT_LANGUAGE'}, $self->{_supported});
		$SIG{__WARN__} = 'DEFAULT';
		if((!$l) && ($ENV{'HTTP_ACCEPT_LANGUAGE'} =~ /(.+)-.+/)) {
			# Fall back position, e,g. we want US English on a site
			# only giving British English, so allow it as English.
			# The calling program can detect that it's not the
			# wanted flavour of English by looking at
			# requested_language
			if(I18N::AcceptLanguage->new()->accepts($1, $self->{_supported})) {
				$l = $1;
			}
		}
		if($l) {
			$self->{_slanguage} = Locale::Language::code2language($l);
			if($self->{_slanguage}) {
				# We have the language, but not the right
				# sublanguage, e.g. they want US English but we
				# only support British English or English
				# wanted: en-us, got en-gb and en
				$self->{_slanguage_code_alpha2} = $l;
				$self->{_rlanguage} = $self->{_slanguage};

				if($ENV{'HTTP_ACCEPT_LANGUAGE'} =~ /..-(..)$/) {
					my $l = Locale::Object::Country->new(code_alpha2 => $1);
					if($l) {
						$self->{_rlanguage} .= ' (' . $l->name . ')';
						# The requested sublanguage
						# isn't supported so don't
						# define that
					}
				} elsif($ENV{'HTTP_ACCEPT_LANGUAGE'} =~ /..-([a-z]{2,3})$/i) {
					my $l = Locale::Object::Country->new(code_alpha3 => $1);
					if($l) {
						$self->{_rlanguage} .= ' (' . $l->name . ')';
						# The requested sublanguage isn't
						# supported so don't define that
					}
				}
				return;
			}
			# TODO: Handle es-419 "Spanish (Latin America)"
			if($l =~ /(.+)-(..)$/) {
				my $alpha2 = $1;
				my $variety = $2;
				my $accepts = I18N::AcceptLanguage->new()->accepts($l, $self->{_supported});

				if($accepts) {
					$self->{_slanguage} = Locale::Language::code2language($accepts);
					if(length($variety) == 2) {
						my $c = Locale::Object::Country->new(code_alpha2 => $variety);
						if(defined($c)) {
							$self->{_sublanguage} = $c->name;
						}
					}
					if($self->{_slanguage}) {
						$self->{_slanguage_code_alpha2} = $accepts;
						if($self->{_sublanguage}) {
							$self->{_rlanguage} = "$self->{_slanguage} ($self->{_sublanguage})";
						}
						$self->{_sublanguage_code_alpha2} = $variety;
						return;
					}
				}
				my $lang = Locale::Language::code2language($alpha2);
				unless($lang) {
					$lang = $1;
				}
				$self->{_rlanguage} = $lang;
				$self->_get_closest($alpha2, $alpha2);
				if($self->{_sublanguage}) {
					$ENV{'HTTP_ACCEPT_LANGUAGE'} =~ /(.{2})-(..)/;
					$variety = lc($2);
					# Ignore en-029 etc (Carribean English)
					if($variety =~ /[a-z]{2,3}/) {
						if($variety eq 'uk') {
							# ???
							$self->_warn({
								warning => "Resetting country code to GB for $ENV{'HTTP_ACCEPT_LANGUAGE'}"
							});
							$variety = 'gb';
						}
						my $db = Locale::Object::DB->new();
						my @results = $db->lookup(
							table => 'country',
							result_column => '*',
							search_column => 'code_alpha2',
							value => $variety
						);
						if(defined($results[0])) {
							eval {
								$lang = Locale::Object::Country->new(code_alpha2 => $variety);
							};
						} else {
							$lang = undef;
						}
						if($@ || !defined($lang)) {
							$self->{_sublanguage} = 'Unknown';
							$self->_warn({
								warning => "Can't determine values for $ENV{'HTTP_ACCEPT_LANGUAGE'}"
							});
						} else {
							$self->{_sublanguage} = $lang->name;
						}
						if(defined($self->{_sublanguage})) {
							$self->{_rlanguage} = "$self->{_slanguage} ($self->{_sublanguage})";
							$self->{_sublanguage_code_alpha2} = $variety;
							return;
						}
					}
				}
			}
		}
		if($self->{_slanguage} && ($self->{_slanguage} ne 'Unknown')) {
			if($self->{_rlanguage} eq 'Unknown') {
				require I18N::LangTags::Detect;
				$self->{_rlanguage} = I18N::LangTags::Detect::detect();
			}
			if($self->{_rlanguage}) {
				my $l = Locale::Language::code2language($self->{_rlanguage});
				if($l) {
					$self->{_rlanguage} = $l;
				} else {
					# We have the language, but not the right
					# sublanguage, e.g. they want US English but we
					# only support British English
					# wanted: en-us, got en-gb and not en
				}
				return;
			}
		}
		unless($self->{_rlanguage}) {
			$self->{_rlanguage} = 'Unknown';
		}
		$self->{_slanguage} = 'Unknown';
	}

	if($self->{_dont_use_ip}) {
		return;
	}

	# The client hasn't said which to use, guess from their IP address,
	# or the requested language(s) isn't/aren't supported so use the IP
	# address for an alternative
	my $country = $self->country();
	if(defined($country)) {
		# Determine the first official language of the country

		my $l = Locale::Object::Country->new(code_alpha2 => uc($country));
		if($l) {
			$l = ($l->languages_official)[0];
		}
		my $ip = $ENV{'REMOTE_ADDR'};
		if($l && $l->name) {
			$self->{_rlanguage} = $l->name;
			unless((exists($self->{_slanguage})) && ($self->{_slanguage} ne 'Unknown')) {
				# Check if the language is one that we support
				# Don't bother with secondary language
				require Locale::Language;

				Locale::Language->import();

				my $code = Locale::Language::language2code($self->{_rlanguage});
				unless($code) {
					if($ENV{'HTTP_ACCEPT_LANGUAGE'}) {
						$code = Locale::Language::language2code($ENV{'HTTP_ACCEPT_LANGUAGE'});
					}
					unless($code) {
						# If language is Norwegian (Nynorsk)
						# lookup Norwegian
						if($self->{_rlanguage} =~ /(.+)\s\(.+/) {
							$code = Locale::Language::language2code($1);
						}
						unless($code) {
							$self->_warn({
								warning => "Can\'t determine code from IP $ip for requested language $self->{_rlanguage}"
							});
						}
					}
				}
				if($code) {
					$self->_get_closest($code, $l->code_alpha2);
				}
			}
			if($self->{_cache} && defined($ip)) {
				$country = $self->{_cache}->set($ip, $country, '1 hour');
			}
		} elsif(defined($ip)) {
			$self->_warn({
				warning => "Can't determine language from IP $ip, country $country"
			});
		}
	}
}

# Try our very best to give the right country - if they ask for en-us and
# we only have en-gb then give it to them

sub _get_closest {
	my ($self, $language_string, $alpha2) = @_;

	foreach (@{$self->{_supported}}) {
		my $s;
		if(/^(.+)-.+/) {
			$s = $1;
		} else {
			$s = $_;
		}
		if($language_string eq $s) {
			$self->{_slanguage} = $self->{_rlanguage};
			$self->{_slanguage_code_alpha2} = $alpha2;
			last;
		}
	}
}

=head2 country

Returns the two character country code of the remote end in lower case.

If L<IP::Country> or L<Geo::IP> is installed, CGI::Lingua will make
use of that, otherwise it will do a Whois lookup.
If you do not have either installed I recommend you make use of the
caching capability of CGI::Lingua.

=cut

sub country {
	my $self = shift;

	# FIXME: If previous calls to country() return undef, we'll
	# waste time going through again and no doubt returning undef
	# again.
	if($self->{_country}) {
		return $self->{_country};
	}

	# mod_geoip
	if(defined($ENV{'GEOIP_COUNTRY_CODE'})) {
		$self->{_country} = lc($ENV{'GEOIP_COUNTRY_CODE'});
		return $self->{_country};
	}

	my $ip = $ENV{'REMOTE_ADDR'};

	unless($ip) {
		return();
	}
	require Data::Validate::IP;
	Data::Validate::IP->import();

	unless(is_ipv4($ip)) {
		if($ip eq '::1') {
			# special case that is easy to handle
			$ip = '127.0.0.1';
		} else {
			$self->_warn({
				warning => "$ip isn't a valid IPv4 address\n"
			});
			return;
		}
	}

	if($self->{_cache}) {
		$self->{_country} = $self->{_cache}->get($ip);
		if($self->{_logger}) {
			if(defined($self->{_country})) {
				$self->{_logger}->trace("Get $ip from cache = $self->{_country}");
			} else {
				$self->{_logger}->trace("$ip isn't in the cache");
			}
		}
	}

	unless(defined($self->{_country})) {
		if(($ENV{'HTTP_CF_IPCOUNTRY'}) && ($ENV{'HTTP_CF_IPCOUNTRY'} ne 'XX')) {
			# Hosted by Cloudfare
			$self->{_country} = lc($ENV{'HTTP_CF_IPCOUNTRY'});
		} else {
			if($self->{_have_ipcountry} == -1) {
				if(eval { require IP::Country; }) {
					IP::Country->import();
					$self->{_have_ipcountry} = 1;
					$self->{_ipcountry} = IP::Country::Fast->new();
				} else {
					$self->{_have_ipcountry} = 0;
				}
			}
			if($self->{_logger}) {
				$self->{_logger}->debug("have_ipcountry $self->{_have_ipcountry}");
			}

			if($self->{_have_ipcountry} == 1) {
				$self->{_country} = $self->{_ipcountry}->inet_atocc($ip);
				if($self->{_country}) {
					$self->{_country} = lc($self->{_country});
				} else {
					$self->_warn({
						warning => "$ip is not known by IP::Country"
					});
				}
			}
			unless(defined($self->{_country})) {
				if($self->{_have_geoip} == -1) {
					if(eval { require Geo::IP; }) {
						Geo::IP->import();
						$self->{_have_geoip} = 1;
						# GEOIP_STANDARD = 0, can't use that because you'll
						# get a syntax error
						$self->{_geoip} = Geo::IP->new(0);
					} else {
						$self->{_have_geoip} = 0;
					}
				}
				if($self->{_have_geoip} == 1) {
					$self->{_country} = $self->{_geoip}->country_code_by_addr($ip);
				}
			}
		}
		if($self->{_country} && ($self->{_country} eq 'eu')) {
			delete($self->{_country});
		}
		unless($self->{_country}) {
			require Net::Whois::IP;
			Net::Whois::IP->import();

			my $whois;

			eval {
				# Catch connection timeouts to
				# whois.ripe.net by turning the carp
				# into an error
				local $SIG{__WARN__} = sub { die $_[0] };
				$whois = Net::Whois::IP::whoisip_query($ip);
			};
			unless($@ || !defined($whois) || (ref($whois) ne 'HASH')) {
				if(defined($whois->{Country})) {
					$self->{_country} = $whois->{Country};
				} elsif(defined($whois->{country})) {
					$self->{_country} = $whois->{country};
				}
				if($self->{_country} && ($self->{_country} eq 'eu')) {
					delete($self->{_country});
				}
			}

			unless($self->{_country}) {
				require Net::Whois::IANA;
				Net::Whois::IANA->import();

				my $iana = new Net::Whois::IANA;
				eval {
					$iana->whois_query(-ip => $ip);
				};
				unless ($@) {
					$self->{_country} = $iana->country();
				}
			}
			if($self->{_country} eq 'eu') {
				delete($self->{_country});
			}
			if($self->{_country}) {
				# 190.24.1.122 has carriage return in its WHOIS record
				$self->{_country} =~ s/[\r\n]//g;
				if($self->{_country} =~ /^(..)\s*#.*/) {
					# Remove comments in the Whois record
					$self->{_country} = $1;
				}
			}
		}
	}

	if($self->{_country}) {
		$self->{_country} = lc($self->{_country});
		if($self->{_country} eq 'hk') {
			# Hong Kong is no longer a country, but Whois thinks
			# it is - try "whois 218.213.130.87"
			$self->{_country} = 'cn';
		} elsif($self->country eq 'eu') {
			require Net::Subnet;

			# RT-86809, Baidu claims it's in EU not CN
			Net::Subnet->import();
			if(subnet_matcher('185.10.104.0/22')->($ip)) {
				$self->{_country} = 'cn';
			} else {
				# There is no country called 'eu'
				$self->_warn({
					warning => "$ip has country of eu"
				});
				delete($self->{_country});
			}
		}
		if($self->{_cache}) {
			$self->{_cache}->set($ip, $self->{_country}, '1 hour');
			if($self->{_logger}) {
				$self->{_logger}->debuf("Set $ip to $self->{_country}");
			}
		}
	}

	return $self->{_country};
}

=head2 locale

HTTP doesn't have a way of transmitting a browser's localisation information
which would be useful for default currency, date formatting etc.

This method attempts to detect the information, but it is a best guess
and is not 100% reliable.  But it's better than nothing ;-)

Returns a L<Locale::Object::Country> object.

To be clear, if you're in the US and request the language in Spanish,
and the site supports it, language() will return 'Spanish', and locale() will
try to return the Locale::Object::Country for the US.

=cut

sub locale {
	my $self = shift;

	if($self->{_locale}) {
		return $self->{_locale};
	}
	require Locale::Object::Country;
	Locale::Object::Country->import();

	# First try from the User Agent.  Probably only works with Mozilla and
	# Safari.  I don't know about Opera.  It won't work with IE or Chrome.
	my $agent = $ENV{'HTTP_USER_AGENT'};
	my $country;
	if(defined($agent) && ($agent =~ /\((.+)\)/)) {
		foreach(split(/;/, $1)) {
			my $candidate = $_;

			$candidate =~ s/^\s//g;
			$candidate =~ s/\s$//g;
			if($candidate =~ /^[a-zA-Z]{2}-([a-zA-Z]{2})$/) {
				local $SIG{__WARN__} = undef;
				my $c = Locale::Object::Country->new(code_alpha2 => $1);
				if($c) {
					$self->{_locale} = $c;
					return $c;
				}
				# carp "Warning: unknown country $1 derived from $candidate in HTTP_USER_AGENT ($agent)";
			}
		}

		eval {
			require HTTP::BrowserDetect;

			HTTP::BrowserDetect->import();
		};

		unless($@) {
			my $browser = HTTP::BrowserDetect->new($agent);

			if($browser && $browser->country()) {
				my $c = Locale::Object::Country->new(code_alpha2 => $browser->country());
				if($c) {
					$self->{_locale} = $c;
					return $c;
				}
			}
		}
	}

	# Try from the IP address
	$country = $self->country();

	if($country) {
		$country =~ s/[\r\n]//g;

		my $c;
		eval {
			local $SIG{__WARN__} = sub { die $_[0] };
			$c = Locale::Object::Country->new(code_alpha2 => $country);
		};
		unless($@) {
			if($c) {
				$self->{_locale} = $c;
				return $c;
			}
		}
	}

	# Try mod_geoip
	if(defined($ENV{'GEOIP_COUNTRY_CODE'})) {
		$country = $ENV{'GEOIP_COUNTRY_CODE'};
		my $c = Locale::Object::Country->new(code_alpha2 => $country);
		if($c) {
			$self->{_locale} = $c;
			return $c;
		}
	}
	return ();	# returns undef
}

=head1 AUTHOR

Nigel Horne, C<< <njh at bandsman.co.uk> >>

=head1 BUGS

If HTTP_ACCEPT_LANGUAGE is 3 characters, e.g., es-419,
sublanguage() returns undef.

Please report any bugs or feature requests to C<bug-cgi-lingua at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CGI-Lingua>.  I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.


=head1 SEE ALSO

Locale::Country::Object
HTTP::BrowserDetect

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc CGI::Lingua


You can also look for information at:

=over 4

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=CGI-Lingua>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/CGI-Lingua>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/CGI-Lingua>

=item * Search CPAN

L<http://search.cpan.org/dist/CGI-Lingua/>

=back


=head1 ACKNOWLEDGEMENTS


=head1 LICENSE AND COPYRIGHT

Copyright 2010-2014 Nigel Horne.

This program is released under the following licence: GPL


=cut

1; # End of CGI::Lingua