The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package LaTeX::Decode;

use 5.008;
use warnings;
use strict;
use Carp;

=encoding utf-8

=head1 NAME

LaTeX::Decode - Decode from LaTeX to Unicode

=head1 VERSION

Version 0.03

=cut

use base qw(Exporter);
our $VERSION = '0.03';
our @EXPORT  = qw(latex_decode);
use LaTeX::Decode::Data;
use Unicode::Normalize;

=head1 SYNOPSIS

    use LaTeX::Decode;

    my $latex_string = 'Mu\\d{h}ammad ibn M\\=us\=a al-Khw\\=arizm\\={\\i}';
    my $new_string   = latex_decode($latex_string); # => 'Muḥammad ibn Mūsā al-Khwārizmī'

=head1 DESCRIPTION

=head1 EXPORT

=head1 FUNCTIONS

=head2 latex_decode($text, %options)

Decodes the given text from LaTeX to Unicode.

The function accepts a number of options:

    * normalize => $bool (default 1)
        whether the output string should be normalized with Unicode::Normalize

    * normalization => <normalization form> (default 'NFC')
        and if yes, the normalization form to use (see the Unicode::Normalize documentation)

    * strip_outer_braces => $bool (default 0)
        whether the outer curly braces around letters+combining marks should be
        stripped off. By default "fut{\\'e}" becomes fut{é}, to prevent something
        like '\\textuppercase{\\'e}' to become '\\textuppercaseé'. Setting this option to
        TRUE can be useful for instance when converting BibTeX files.

=head1 GLOBAL OPTIONS

The decoding scheme can be set with

    $LaTeX::Decode::DefaultScheme = '<name>';

Possible values are 'base', 'extra' and 'full'; default value is 'extra'.

base  => Most common macros and diacritics (sufficient for Western languages
         and common symbols)

extra => Also converts punctuation, larger range of diacritics and macros
         (e.g. for IPA, Latin Extended Additional, etc.)

full  => Also converts symbols, Greek letters, dingbats, negated symbols, and
         superscript characters and symbols ...

=cut

our $DefaultScheme = 'extra';

sub _get_diac {
    my $scheme = shift;
    if ( $scheme eq 'base' ) {
        return %DIACRITICS;
    }
    else {
        return ( %DIACRITICS, %DIACRITICSEXTRA );
    }
}

sub _get_mac {
    my $scheme = shift;
    if ( $scheme eq 'base' ) {
        return %WORDMACROS;
    }
    elsif ( $scheme eq 'full' ) {
        return ( %WORDMACROS, %WORDMACROSEXTRA, %PUNCTUATION, %SYMBOLS,
            %GREEK );
    }
    else {
        return ( %WORDMACROS, %WORDMACROSEXTRA, %PUNCTUATION );
    }
}

sub latex_decode {
    my $text      = shift;
    my %opts      = @_;
    my $norm      = exists $opts{normalize} ? $opts{normalize} : 1;
    my $norm_form = exists $opts{normalization} ? $opts{normalization} : 'NFC';
    my $scheme    = exists $opts{scheme} ? $opts{scheme} : $DefaultScheme;
    croak "invalid scheme name '$scheme'"
        unless ( $scheme eq 'full' or $scheme eq 'base' or $scheme eq 'extra' );
    my $strip_outer_braces =
      exists $opts{strip_outer_braces} ? $opts{strip_outer_braces} : 0;

    my %DIAC    = _get_diac($scheme);
    my %WORDMAC = _get_mac($scheme);

    # a regex with all possible word macros
    my $WORDMAC_RE =
      join( '|', sort { length $b <=> length $a } keys %WORDMAC );
    $WORDMAC_RE = qr{$WORDMAC_RE};

    my $DIAC_RE;
    if ( $scheme eq 'base' ) {
        $DIAC_RE = $DIAC_RE_BASE;
    }
    else {
        $DIAC_RE = $DIAC_RE_EXTRA;
    }

    if ( $scheme eq 'full' ) {
        $text =~ s/\\not\\($NEG_SYMB_RE)/$NEGATEDSYMBOLS{$1}/ge;
        $text =~ s/\\textsuperscript{($SUPER_RE)}/$SUPERSCRIPTS{$1}/ge;
        $text =~ s/\\textsuperscript{\\($SUPERCMD_RE)}/$CMDSUPERSCRIPTS{$1}/ge;
        $text =~ s/\\dings{([2-9AF][0-9A-F])}/$DINGS{$1}/ge;
    }

    $text =~ s/(\\[a-zA-Z]+)\\(\s+)/$1\{\}$2/g;    # \foo\ bar -> \foo{} bar
    $text =~ s/([^{]\\\w)([;,.:%])/$1\{\}$2/g;     #} Aaaa\o, -> Aaaa\o{},
    $text =~ s/(\\(?:$DIAC_RE_BASE|$ACCENTS_RE)){\\i}/$1\{i\}/g;
           # special cases such as '\={\i}' -> '\={i}' -> "i\x{304}"

    ## remove {} around macros that print one character
    ## by default we skip that, as it would break constructions like \foo{\i}
    if ($strip_outer_braces) {
        $text =~ s/ \{\\($WORDMAC_RE)\} / $WORDMAC{$1} /gxe;
    }
    $text =~ s/ \\($WORDMAC_RE)(?: \{\} | \s+ | \b) / $WORDMAC{$1} /gxe;

    $text =~ s/\\($ACCENTS_RE)\{(\p{L}\p{M}*)\}/$2 . $ACCENTS{$1}/ge;

    $text =~ s/\\($ACCENTS_RE)(\p{L}\p{M}*)/$2 . $ACCENTS{$1}/ge;

    $text =~ s/\\($DIAC_RE)\s*\{(\p{L}\p{M}*)\}/$2 . $DIAC{$1}/ge;

    $text =~ s/\\($DIAC_RE)\s+(\p{L}\p{M}*)/$2 . $DIAC{$1}/ge;

    $text =~ s/\\($ACCENTS_RE)\{(\p{L}\p{M}*)\}/$2 . $ACCENTS{$1}/ge;

    $text =~ s/\\($ACCENTS_RE)(\p{L}\p{M}*)/$2 . $ACCENTS{$1}/ge;

    $text =~ s/\\($DIAC_RE)\s*\{(\p{L}\p{M}*)\}/$2 . $DIAC{$1}/ge;

    $text =~ s/\\($DIAC_RE)\s+(\p{L}\p{M}*)/$2 . $DIAC{$1}/ge;

    ## remove {} around letter+combining mark(s)
    ## by default we skip that, as it would destroy constructions like \foo{\`e}
    if ($strip_outer_braces) {
        $text =~ s/{(\PM\pM+)}/$1/g;
    }

    if ($norm) {
        return Unicode::Normalize::normalize( $norm_form, $text );
    }
    else {
        return $text;
    }
}

=head1 AUTHOR

François Charette, C<< <firmicus@cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to C<bug-latex-decode at
rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=LaTeX-Decode>.  I will be
notified, and then you'll automatically be notified of progress on your bug as
I make changes.

=head1 COPYRIGHT & LICENSE

Copyright 2009-2010 François Charette, all rights reserved.

This module is free software.  You can redistribute it and/or
modify it under the terms of the Artistic License 2.0.

This program is distributed in the hope that it will be useful,
but without any warranty; without even the implied warranty of
merchantability or fitness for a particular purpose.

=cut

1;

# vim: set tabstop=4 shiftwidth=4 expandtab: