The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Lingua::LO::Romanize::Word;

use strict;
use utf8;

use Moose;
use MooseX::AttributeHelpers;

use Lingua::LO::Romanize::Types;
use Lingua::LO::Romanize::Word;

=encoding utf-8

=head1 NAME

Lingua::LO::Romanize::Word - Class for words, used by Lingua::LO::Romanize.

=head1 VERSION

Version 0.10

=cut

our $VERSION = '0.10';

has 'word_str' => (
    is          => 'ro',
    isa         => 'Str',
    required    => 1,
);


has 'syllables' => (
    metaclass   => 'Collection::Array',
    coerce      => 1,
    is          => 'ro',
    isa         => 'Lingua::LO::Romanize::Types::SyllableArr',
    init_arg    => undef,
    builder     => '_build_syllables',
    lazy        => 1,
    provides    => {
        elements    => 'all_syllables',
    },
);

has 'hyphen' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 0,
);

# private builder method for syllables
# parsing out lao syllables, lao numbers and unrecognized characters

sub _build_syllables {
    my $self = shift;
    my $word = $self->word_str;
    my @syllables;
    
    while ($word) {
        if ($word =~ s/^[^ກ-ໝ]+//s) {
            push @syllables, $&;
        } elsif ($word =~ s/^[໐-໙]+//s) {
            push @syllables, $&;
        } elsif ($word =~ s/^ໆ//) {
            if (scalar(@syllables)) {
                my $prev_syllable = $syllables[-1];
                push @syllables, $prev_syllable;
            }
        } elsif ($word =~ s/^ຯ//) { #... or perhaps it should be together with a lao syllable
            push @syllables, $&;
        } elsif (my $syllable = _find_lao_syllable($word)) {
            $word =~ s/^$syllable// or $word =~ s/^.//; #just so that we don't loop forever
            push @syllables, $syllable;
        } else { #just so we don't loop forever
            $word =~ s/.//;
        }
    }
    \@syllables;
}

# private sub routine to find a lao syllable from a string and return the syllable
sub _find_lao_syllable {
    my $word = shift;
    my $syllable;
    return 
        unless $word =~ /^[ເ-ໄ]?([ກຂຄງຈສຊຍຽດຕຖທນບປຜຝພຟມຢຣລຼວຫອຮໜໝ])/;
    
    my $consonant = $1;

    if ($word =~ /^[ເ-ໄ]?$consonant[ເ-ໄ]?([ວຣລຼ])/) { # ວ, ຣ, or ລ (also ຼ) can be used in combination with another consonant
        my $extra = $1;
        unless ($extra eq 'ວ' && $word =~ /^$consonant(?:ວ)[^ະັາິີຶືຸູະັົອໍວຽຍຳ]/) {
            $consonant .= $extra;
        }
    }
    
    my $vowels = '';
    
    if ($consonant =~ /^ຫ$/ && $word =~ /^ຫ[ເ-ໄ]?([ຍນມ])/) {
        my $extra = $1;
        #fetch the surounding vowels and tone mark if any
        $word =~ /^$consonant([ເ-ໄ])?$extra([ະັາິີຶືຸູະັົອໍວຽຍຳ່້໊໋]*)/;
        $consonant .= $extra;
        $vowels .= $1 if $1;
        $vowels .= $2 if $2;
    } else {
        #fetch the surounding vowels and tone mark if any
        $word =~ /^([ເ-ໄ])?$consonant([ະັາິີຶືຸູະັົອໍວຽຍຳ່້໊໋]*)/;
        
        $vowels .= $1 if $1;
        $vowels .= $2 if $2;
    }
    
    my $tone;
    if ($vowels =~ s/([່-໋])//) {
        $tone = $1;
    }
    
    #find first vowel
    if ($vowels =~ /^(?:ໍາ|ຳ)/) { #'sala am' is always the end of a syllable
        my $found = $&;
        $syllable = $consonant;
        $found =~ s/^ໍ// and $syllable .= 'ໍ';
        $syllable .= $tone if defined ($tone);
        $syllable .= $found;
        return $syllable;
    } elsif ($vowels =~ /^(?:ເັຍະ|ເຶອະ|ເັຽະ)/ || $vowels =~ /^(?:ເາະ|ົວະ|ເັຽ|ເັຍ|ເືອ|ເິະ|ເົາ)/) {
        # trying to match the largest vowel first, then go to shorter (less characters)
        # doing 4 and 3 character vowels
        my $found = $&;
        $found =~ /^ເ/ and $syllable = 'ເ';
        $syllable .= $consonant;
        $found =~ /^ເ?([ົັືິຶ])/ and $syllable .= $1;
        $syllable .= $tone if defined ($tone);
        if ($found =~ /(ຍະ|ອະ|ຽະ|າະ|ວະ)$/ || $found =~/([ຽຍອະາ])$/) {
            $syllable .= $1;
        }
    } elsif ($vowels =~ /^(?:ເະ|ເັ|ແະ|ແັ|ໂະ|ັອ|ັວ|ົວ|ັຽ|ັຍ|ເິ|ເຍ|ເຽ|ເີ|ເື|ີວ|ິວ)/) {
        # doing 2 character vowels
        my $found = $&;
        $found =~ /^([ເແໂ])/ and $syllable .= $1;
        $syllable .= $consonant;
        $found =~ /^[ເແ]?([ັົິີື])/ and $syllable .= $1;
        $syllable .= $tone if defined ($tone);
        $found =~ /([ະອວຽຍ])$/ and $syllable .= $1;
    } elsif ($vowels =~ /^[ະັາິີຶືຸູເແົໂໍອວຽຍໄໃ]/) {
        # doing single character vowels
        my $found = $&;
        if ($found =~ /^([ເ-ໄ])$/) {
            $syllable = $1 . $consonant;
            $syllable .= $tone if defined ($tone);
        } elsif ($found =~ /^([ັິີຶືຸູົໍ])$/) {
            $syllable = $consonant;
            $syllable .= $found;
            $syllable .= $tone if defined ($tone);
        } else {
            $syllable = $consonant;
            $syllable .= $tone if defined ($tone);
            $syllable .= $found;
        }
    } else { #lonely constant, just return it (with possible tone)
        $syllable = $consonant;
        $syllable .= $tone if defined ($tone);
        return $syllable;
    }
    
    my $regexp = qr{$syllable([ກງຍຽດນບມຣວ]໌?)(?:([^່້໊໋ະັາິີຶືຸູະັົໍຳ])(.?)|$)};
    
    # checking for a possible closing consonant
    if ($word =~ /^$regexp/) {
        my $last_consonant = $1;
        my $possible_vowel = $2 if $2;
        my $continued_vowel = $3 if defined $3;
        if (!(defined $2) || # end of string
            $possible_vowel =~ /[^ຽວອຍ]/ || #for sure a consonant
            (defined $continued_vowel && $continued_vowel =~/[ະັາິີຶືຸູະັົອໍວຽຍຳ່້໊໋]/)) {#post vowels and tones
            $syllable .= $last_consonant;
        }
    }
    return $syllable;
}

=head1 SYNOPSIS

L<Lingua::LO::Romanize::Word> is used by L<Lingua::LO::Romanize> to divide a string to a collection of words. It is recommended to use L<Lingua::LO::Romanize> instead of this class directly (even if it is possible).

    use Lingua::LO::Romanize::Word;

    my $foo = Lingua::LO::Romanize::Word->new(word_str => 'ພາສາລາວ');

    my $bar = $foo->romanize;           # $bar will hold the string 'phasalao'
    $foo->hyphen(1);                    # set hyphenation between syllables
    $bar = $foo->romanize;              # $bar will hold the string 'pha-sa-lao'
    $bar = $foo->word_str;              # $bar will hold the string 'ພາສາລາວ'
    
    my $syllables_array_ref = $foo->all_syllables; # will return an array reference to all syllables;

For more information, please see L<Lingua::LO::Romanize>

=head1 METHODS

=head2 new

Creates a new L<Lingua::LO::Romanize::Word> object, a word_str is required.

=head2 hyphen

If set to 1 (TRUE), the syllables will be hyphenated when romanized is called. Default is 0 (FALSE), not hyphenated.

=head2 romanize

Romanize the 'word' and return the romanized string accourding to the BGN/PCGN standard.

=head2 word_str

Returns the word as the original string.

=head2 all_syllables

Returns an array reference to all L<Lingua::LO::Romanize::Syllable>

=cut

sub romanize {
    my $self = shift;
    
    my @romanized_arr;
    my $romanized_str;
    
    foreach my $syllable ($self->all_syllables) {
        push @romanized_arr, $syllable->romanize;
    }
    
    my $join_str = '';
    
    $join_str = '-' if $self->hyphen;
    
    $romanized_str = join $join_str, @romanized_arr;
    
    $romanized_str =~ s/^-//;
    
    $romanized_str =~ s/--/-/g if $self->hyphen;
    
    return $romanized_str;
}

=head1 AUTHOR

Joakim Lagerqvist, C<< <jokke at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to C<bug-lingua-lo-romanize at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Lingua-LO-Romanize>.  I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

=head1 SEE ALSO

L<Lingua::LO::Romanize>

=head1 COPYRIGHT & LICENSE

Copyright 2009 Joakim Lagerqvist, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.


=cut

no Moose;
__PACKAGE__->meta->make_immutable;

1; # End of Lingua::LO::Romanize::Word