# lib/Lingua/Stem/UniNE/FA.pm

package Lingua::Stem::UniNE::FA;

use v5.8;
use utf8;
use strict;
use warnings;
use charnames ':full';
use parent 'Exporter';

# Module version; the VERSION section of the POD below states the same number.
our $VERSION   = '0.04';
# Functions that callers may import on request.
our @EXPORT_OK = qw( stem stem_fa );

# stem_fa is an alias for stem: both names refer to the same subroutine.
*stem_fa = \&stem;

# Stem a single word.
#
# Takes one word and returns its stem.  The word is passed through three
# steps in order: trailing-kasra removal, suffix removal, and trailing-kasra
# removal again (so a kasra left at the end after the suffix is gone is
# stripped as well).
sub stem {
    my ($word) = @_;

    for my $step (\&remove_kasra, \&remove_suffix, \&remove_kasra) {
        $word = $step->($word);
    }

    return $word;
}

# Strip one ARABIC KASRA from the end of a word.
#
# Takes a word and returns it with a final kasra removed.  Words shorter
# than five characters are returned unchanged.
sub remove_kasra {
    my ($word) = @_;

    # Only words of five or more characters are eligible.
    unless (length($word) < 5) {
        $word =~ s{ \N{ARABIC KASRA} $}{}x;
    }

    return $word;
}

# Remove one suffix from the end of a word.
#
# Takes a word and returns it with the first matching suffix removed, or
# unchanged when nothing matches.  Suffixes are grouped into tiers that are
# tried from longest to shortest; a tier is attempted only when the word's
# length, measured once before any removal, exceeds that tier's threshold.
# When the two-letter suffix "ان" is removed, the remainder is additionally
# passed through normalize().
sub remove_suffix {
    my ($word) = @_;
    my $size = length $word;
    my $stripped;

    # Four-letter suffixes.
    if ($size > 7) {
        $stripped = $word =~ s{ (?:
                آباد | باره | بندی | بندي | ترین | ترين | ریزی |
                ريزي | سازی | سازي | گیری | گيري | هایی | هايي
            ) $}{}x;

        return $word
            if $stripped;
    }

    # Three-letter suffixes.
    if ($size > 6) {
        $stripped = $word =~ s{ (?:
                اند | ایم | ايم | شان | های | هاي
            ) $}{}x;

        return $word
            if $stripped;
    }

    # Two-letter suffixes.
    if ($size > 5) {
        # This suffix is checked before the others of its length and is the
        # only one followed by normalization.
        if ($word =~ s{ ان $}{}x) {
            return normalize($word);
        }

        $stripped = $word =~ s{ (?:
                ات | اش | ام | تر | را | ون | ها | هء | ین | ين
            ) $}{}x;

        return $word
            if $stripped;
    }

    # One-letter suffixes; the result is the same whether or not one matched.
    if ($size > 3) {
        $word =~ s{ (?: ت | ش | م | ه | ی | ي ) $}{}x;
    }

    return $word;
}

# Trim up to two more trailing letters from a word.
#
# Takes a word and returns it, possibly shortened.  Words shorter than four
# characters are returned unchanged.  Otherwise one final letter from the
# first set is removed if present; only when that removal happened and the
# word still has at least four characters is a final yeh removed as well.
sub normalize {
    my ($word) = @_;
    my $min_length = 4;

    if (length($word) < $min_length) {
        return $word;
    }

    my $stripped = $word =~ s{ (?: ت | ر | ش | گ | م | ى ) $}{}x;

    # The second removal depends on the first one having succeeded.
    if ($stripped && length($word) >= $min_length) {
        $word =~ s{ (?: ی | ي ) $}{}x;
    }

    return $word;
}

1;

__END__

=encoding UTF-8

=head1 NAME

Lingua::Stem::UniNE::FA - Persian stemmer

=head1 VERSION

This document describes Lingua::Stem::UniNE::FA v0.04.

=head1 SYNOPSIS

    use Lingua::Stem::UniNE::FA qw( stem_fa );

    my $stem = stem_fa($word);

    # alternate syntax
    $stem = Lingua::Stem::UniNE::FA::stem($word);

=head1 DESCRIPTION

A stemmer for the Persian (Farsi) language.

This module provides the C<stem> and C<stem_fa> functions, which are synonymous
and can optionally be exported.  They accept a single word and return a single
stem.

=head1 SEE ALSO

L<Lingua::Stem::UniNE> provides a stemming object with access to all of the
implemented University of Neuchâtel stemmers including this one.  It has
additional features like stemming lists of words.

L<Lingua::Stem::Any> provides a unified interface to any stemmer on CPAN,
including this one, as well as additional features like normalization,
casefolding, and in-place stemming.

This stemming algorithm was originally implemented by Ljiljana Dolamic in
L<Java|http://members.unine.ch/jacques.savoy/clef/persianStemmerUnicode.txt>.

=head1 ACKNOWLEDGEMENTS

Ljiljana Dolamic and Jacques Savoy of the University of Neuchâtel authored the
original stemming algorithm that was implemented in this module.

=head1 AUTHOR

Nick Patch <patch@cpan.org>

=head1 COPYRIGHT AND LICENSE

© 2012–2013 Nick Patch

This library is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

=cut