# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package WordLists::Sort::Typical;
use utf8;
use strict;
use warnings;
use Unicode::Normalize; # provides NFD, which cmp_accnum uses to decompose accented characters
use WordLists::Sort qw( atomic_compare complex_compare); # the comparison engines every cmp_* function in this file delegates to
use WordLists::Base;
# The version number is taken from WordLists::Base rather than being set here.
our $VERSION = $WordLists::Base::VERSION;
# NOTE(review): no AUTOLOAD sub is visible in this file - confirm whether this declaration is still needed.
our $AUTOLOAD;
require Exporter;
our @ISA       = qw (Exporter);
# Nothing is exported by default; callers request the comparison functions by name.
our @EXPORT    = ();
# NOTE(review): cmp_alpha is defined in this file but is not listed here - confirm that is intentional.
our @EXPORT_OK = qw(
	cmp_alnum
	cmp_alnum_only
	cmp_accnum
	cmp_accnum_only
	cmp_ver
	cmp_dict
);
# _cmp($left, $right)
# Plain string comparison of the two arguments using Perl's built-in cmp
# operator; returns -1, 0 or 1. Used by cmp_dict as the comparator for its
# tie-break stages.
sub _cmp
{
	my ($left, $right) = @_;
	return $left cmp $right;
}

# cmp_dict($a, $b)
#
# Dictionary-style comparison. The two strings are handed to complex_compare
# together with an ordered list of comparison stages: the first stage is the
# broad cmp_accnum_only comparison and the later stages act as progressively
# finer tie-breakers. A leading lower-case "the" followed by whitespace is
# removed before every stage except the last, which looks only at whether that
# prefix is present.
#
# None of the normalisers or comparators below modifies its arguments: each
# works on a private copy, so nothing reached through the @_ aliases is changed.
#
# Returns whatever complex_compare returns.
sub cmp_dict
{
	# Returns a copy of the argument with a leading "the" + whitespace removed.
	# The match is case-sensitive, so "The ..." is left untouched.
	my $strip_the = sub {
		my ($text) = @_;
		$text =~ s/^the\s+//;
		return $text;
	};

	# Builds a stage that strips the article and then offers the whole string
	# (a single `.+` token) to the given comparison function.
	my $whole_string_stage = sub {
		my ($comparator) = @_;
		return {
			n => $strip_the,
			t => [
				{
					re => qr/.+/,
					c  => $comparator,
				},
			],
		};
	};

	my @stages = (
		# Accent-insensitive, alphanumeric-only comparison.
		$whole_string_stage->(\&cmp_accnum_only),

		# Blank out everything that is not a Latin-script character or digit,
		# then blank out lower-case characters; compare what is left as strings.
		{
			n => sub {
				my $text = $strip_the->($_[0]);
				$text =~ s/[^\p{Script: Latin}0-9]/ /g;
				$text =~ s/[a-z\p{Lowercase}]/ /g;
				return $text;
			},
			c => \&_cmp,
		},

		# Alphanumeric-only comparison that does not fold accents.
		$whole_string_stage->(\&cmp_alnum_only),

		# Blank out everything that is not a Latin-script character or digit,
		# then blank out the ASCII letters; compare what is left as strings.
		{
			n => sub {
				my $text = $strip_the->($_[0]);
				$text =~ s/[^\p{Script: Latin}0-9]/ /g;
				$text =~ s/[a-zA-Z]/ /g;
				return $text;
			},
			c => \&_cmp,
		},

		# Blank out every character that is not upper-case, keeping positions.
		{
			n => sub {
				my $text = $strip_the->($_[0]);
				$text =~ s/[\P{Uppercase}]/ /g;
				return $text;
			},
			c => \&_cmp,
		},

		# Delete (rather than blank out) everything that is not a Latin-script
		# character or digit and compare the remainder as plain strings.
		{
			n => sub {
				my $text = $strip_the->($_[0]);
				$text =~ s/[^\p{Script: Latin}0-9]//g;
				return $text;
			},
			c => \&_cmp,
		},

		# Lower-cased comparison of the complete string.
		$whole_string_stage->(\&cmp_alpha),

		# Final tie-breaker: a string with the "the " prefix sorts after one
		# without it. Only the presence of the prefix is tested.
		# NOTE(review): this can only take effect if a stage-level `c` is
		# handed whole strings; the stages above that delegate to a cmp_*
		# function use a `.+` token instead - confirm against WordLists::Sort.
		{
			c => sub {
				my $left_has_the  = ($_[0] =~ /^the\s+/) ? 1 : 0;
				my $right_has_the = ($_[1] =~ /^the\s+/) ? 1 : 0;
				return $left_has_the <=> $right_has_the;
			},
		},
	);

	return complex_compare(
		$_[0], $_[1],
		{
			functions => \@stages,
		}
	);
}

# cmp_alnum($a, $b)
#
# Case-insensitive comparison in which runs of ASCII digits are compared as
# numbers rather than character by character. Both strings are lower-cased
# and then passed to atomic_compare with a single token rule for digit runs.
#
# Returns whatever atomic_compare returns.
sub cmp_alnum
{
	my $fold_case = sub { return lc $_[0]; };
	my $by_number = sub { return $_[0] <=> $_[1]; };

	# A run of digits is treated as one token and compared numerically.
	my %digit_run = (
		re => qr/[0-9]+/,
		c  => $by_number,
	);

	my %options = (
		n => $fold_case,
		t => [ \%digit_run ],
	);

	return atomic_compare( $_[0], $_[1], \%options );
}
# cmp_alpha($a, $b)
#
# Compares two strings after lower-casing both. No token rules are supplied,
# so everything else is left to atomic_compare's defaults.
#
# Returns whatever atomic_compare returns.
sub cmp_alpha
{
	my %options = (
		n => sub { return lc $_[0]; },
	);

	return atomic_compare( $_[0], $_[1], \%options );
}
# cmp_alnum_only($a, $b)
#
# Like cmp_alnum, but every character that is neither a Latin-script character
# nor an ASCII digit is removed first. What is left of each string is then
# offered as a single token (`.+`) to cmp_alnum.
#
# Returns whatever atomic_compare returns.
sub cmp_alnum_only
{
	# Works on a copy so the value passed in is never modified through the
	# @_ alias; the returned string is the same as an in-place edit would give.
	my $keep_alnum = sub {
		my ($text) = @_;
		$text =~ s/[^\p{Script: Latin}0-9]//g;
		return $text;
	};

	return atomic_compare(
		$_[0], $_[1],
		{
			n => $keep_alnum,
			t => [
				{
					re => qr/.+/,
					c  => \&cmp_alnum,
				},
			],
		}
	);
}
# cmp_accnum_only($a, $b)
#
# Like cmp_accnum, but every character that is neither a Latin-script character
# nor an ASCII digit is removed first. What is left of each string is then
# offered as a single token (`.+`) to cmp_accnum.
#
# Returns whatever atomic_compare returns.
sub cmp_accnum_only
{
	# Works on a copy so the value passed in is never modified through the
	# @_ alias; the returned string is the same as an in-place edit would give.
	my $keep_alnum = sub {
		my ($text) = @_;
		$text =~ s/[^\p{Script: Latin}0-9]//g;
		return $text;
	};

	return atomic_compare(
		$_[0], $_[1],
		{
			n => $keep_alnum,
			t => [
				{
					re => qr/.+/,
					c  => \&cmp_accnum,
				},
			],
		}
	);
}
# cmp_accnum($a, $b)
#
# Accent- and case-insensitive comparison in which runs of ASCII digits are
# compared as numbers. Each string is decomposed with NFD, has its mark
# characters (\pM) removed and is lower-cased before atomic_compare sees it.
#
# Returns whatever atomic_compare returns.
sub cmp_accnum
{
	# Works on a copy so the value passed in is never modified through the
	# @_ alias; the returned string is the same as an in-place edit would give.
	my $fold_accents_and_case = sub {
		my ($text) = @_;
		$text = NFD($text);     # split base characters from combining marks
		$text =~ s/\pM//g;      # drop the marks, leaving the base characters
		return lc $text;
	};

	return atomic_compare(
		$_[0], $_[1],
		{
			n => $fold_accents_and_case,
			t => [
				{
					# A run of digits is one token, compared numerically.
					re => qr/[0-9]+/,
					c  => sub { return $_[0] <=> $_[1]; },
				},
			],
		}
	);
}

# cmp_ver($a, $b)
#
# Compares version strings: anything that is not alphanumeric is a separator
# and doesn't have any preference.
#   1.1 = 1:1
#   1.1.a < 1.1a < 1.1.1
# A single leading "v" or "V" is ignored.
#
# Returns whatever atomic_compare returns.
sub cmp_ver
{
	# Works on a copy so the value passed in is never modified through the
	# @_ alias; the returned string is the same as an in-place edit would give.
	my $strip_leading_v = sub {
		my ($version) = @_;
		$version =~ s/^v//i;
		return $version;
	};

	return atomic_compare(
		$_[0], $_[1],
		{
			n => $strip_leading_v,
			t => [
				{
					# Digit runs are compared as numbers.
					re => qr/[0-9]+/,
					c  => sub { return $_[0] <=> $_[1]; },
				},
				{
					# Letter runs are compared case-insensitively.
					re => qr/[a-zA-Z]+/,
					c  => sub { return lc($_[0]) cmp lc($_[1]); },
				},
				{
					# Separator runs carry no preference.
					# NOTE(review): presumably a false `c` tells atomic_compare
					# not to compare the matched text - confirm in WordLists::Sort.
					re => qr/[^a-zA-Z0-9]+/,
					c  => 0,
				},
			],
		}
	);
}



1;


=pod

=head1 NAME

WordLists::Sort::Typical

=head1 SYNOPSIS

	'A14' cmp 'A2'; # sadly returns -1, so instead do this:
	use WordLists::Sort::Typical qw(cmp_alnum);
	cmp_alnum('A14', 'A2'); # returns 1

=head1 DESCRIPTION

This provides functions for sorting text.

=head3 cmp_alnum

Compares alphanumeric values sensibly, e.g. "Unit 10" comes after "Unit 9", not before "Unit 2". Case-insensitive.

=head3 cmp_alnum_only

Compares alphanumeric values sensibly as C<cmp_alnum>, but ignores all characters except alphanumeric ones, so "re-factor" sorts with "refactor", not between "re" and "react". Case-insensitive.

=head3 cmp_accnum

Compares alphanumeric values sensibly as C<cmp_alnum>, and considers accented characters to be equivalent to unaccented characters, so "café" sorts with "cafe", not after "caftan".

=head3 cmp_accnum_only

Compares alphanumeric values sensibly and accent-insensitively as C<cmp_accnum>, and ignores non-alphanumeric content like C<cmp_alnum_only>.

=head3 cmp_ver

Compares version numbers sensibly, even if they are of the form "v1.0028_01a".

=head3 cmp_dict

This uses C<complex_compare> from L<WordLists::Sort>, the first stage being C<cmp_accnum_only>. Strings which are still equal are progressively sorted with tie-breakers so that order is reliable. Strings beginning "the " are sorted identically, except at the end, when strings without "the " have preference.

=over

=item *

Case - uppercase comes after lowercase.

=item *

Accents - accented characters come after their unaccented equivalents.

=item *

Non-alphanumeric characters - these are sorted, ignoring other intervening characters.

=item *

Definite article - if the strings are otherwise identical, a string beginning "the " comes after a string not beginning "the "

=back

=head1 BUGS

Please use the Github issues tracker.

=head1 LICENSE

Copyright 2011-2012 © Cambridge University Press. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=cut