script/unisubs - metacpan.org

#!/usr/bin/env perl

use 5.014;

use utf8;
use strict;
use autodie;
use warnings;
use warnings    qw< FATAL  utf8     >;
use open        qw< :std  :utf8     >;
use charnames   qw< :full >;
use feature     qw< unicode_strings >;

use re          "/msux";

#############################################

use File::Basename      qw< basename >;
use Carp                qw< carp croak confess cluck >;
use Encode              qw< encode decode >;
use Unicode::Normalize  qw< NFD NFC NFKD NFKC >;

#############################################

sub compile		    (    ); 
sub convert_to_subscripts   ( _  );
sub deQ			    ( $  );
sub deQQ		    ( $  );
sub dequeue		    ( $$ );
sub filter		    (    ); 
sub fix_encodings	    (    ); 
sub inits		    (    ); 
sub last_rites		    (    ); 
sub main		    (    ); 

#############################################

MAIN: {
    main();
    exit(0);
} 
die "NOT REACHED";

#############################################


sub fix_encodings() { 
    if (grep /\P{ASCII}/ => @ARGV) {
       @ARGV = map { decode("UTF-8", $_) } @ARGV;
    }
}

sub inits() { 
    last_rites();
    fix_encodings();
    $0 = basename($0);  # shorter messages
    $| = 1;
}

sub last_rites() { 

    $SIG{__DIE__} = sub {
	confess "Uncaught exception: @_" unless $^S;
    };

    $SIG{__WARN__} = sub {
	if ($^S) { cluck   "Trapped warning: @_" }
	else     { confess "Deadly warning: @_"  }
    };

}

sub main() { 
    inits();
    compile();
    filter();
}

sub compile() { 

    my $subscripts = q();
    my $originals  = q();

    local $_;

    binmode(DATA, ":utf8");
    while (<DATA>) {
	next if / \A [\h\v] + \z /;
	next if /  ^ \h* \N{NUMBER SIGN} /;
	chomp;
	die "bad data line: $_" unless m{
	    \A \h+

	    (?<CHAR>
		\H
	    )

	    \h+

	    (?<CPNUM>
		\p{ahex}{4,6}
	    )

	    \t

	    (?<NAME>
		(?=  \w )
		[A-Z0-9\N{SPACE}\N{HYPHEN-MINUS}] +
		(?<= \w )
	    )

	    \z
	};

	my($char, $cpnum, $name) = @+{qw[CHAR CPNUM NAME]};
	my $nfkd = NFKD($char);
	if ($char ne $nfkd && length($nfkd) == 1) { 
	    $subscripts .= $char;
	    $originals  .= $nfkd;
	}
    }

    my $code = deQ<<'LITERAL' . deQQ<<"INTERPOLATED";
		|Q|
		|Q|   use utf8;
		|Q|
		|Q|   sub convert_to_subscripts (_) {
		|Q|       confess "argcount" unless @_ == 1;
		|Q|       my $string = $_[0];
		|Q|       confess "want string" if ref $_[0];
LITERAL
	       |QQ|
	       |QQ|      \$string =~ tr[$originals][$subscripts];
	       |QQ|       return \$string;
	       |QQ|   }
	       |QQ|
	       |QQ|   'ig00'
	       |QQ|
INTERPOLATED

    eval $code || die;

}

sub filter() { 

    if (@ARGV == 0 && -t STDIN) {
	print STDERR "$0: reading from standard input\n"
	    if -t STDERR;
    }

    eval q{ 
	END { close STDOUT } 
	1;
    } || die;

    while (my $line = <>)  {
	chomp $line;
	my $nfline = NFD($line);
	my $subbed = convert_to_subscripts($nfline);
	say $subbed;
    }

}

sub dequeue($$) {
    my($leader, $body) = @_;
    $body =~ s/^\s*\Q$leader\E ?//gm;
    return $body;
}

sub deQ($) {
    my $text = $_[0];
    return dequeue q<|Q|>,  $text;
}

sub deQQ($) {
    my $text = $_[0];
    return dequeue qq<|QQ|>, $text;
}


__END__
  ͅ  0345	COMBINING GREEK YPOGEGRAMMENI
  ٖ  0656	ARABIC SUBSCRIPT ALEF
 ₊  208A	SUBSCRIPT PLUS SIGN
 ₋  208B	SUBSCRIPT MINUS
 ₌  208C	SUBSCRIPT EQUALS SIGN
 ₍  208D	SUBSCRIPT LEFT PARENTHESIS
 ₎  208E	SUBSCRIPT RIGHT PARENTHESIS
 ₀  2080	SUBSCRIPT ZERO
 ₁  2081	SUBSCRIPT ONE
 ₂  2082	SUBSCRIPT TWO
 ⨧  2A27	PLUS SIGN WITH SUBSCRIPT TWO
 ₃  2083	SUBSCRIPT THREE
 ₄  2084	SUBSCRIPT FOUR
 ₅  2085	SUBSCRIPT FIVE
 ₆  2086	SUBSCRIPT SIX
 ₇  2087	SUBSCRIPT SEVEN
 ₈  2088	SUBSCRIPT EIGHT
 ₉  2089	SUBSCRIPT NINE
 ₐ  2090	LATIN SUBSCRIPT SMALL LETTER A
 ₑ  2091	LATIN SUBSCRIPT SMALL LETTER E
 ₔ  2094	LATIN SUBSCRIPT SMALL LETTER SCHWA
 ₕ  2095	LATIN SUBSCRIPT SMALL LETTER H
 ᵢ  1D62	LATIN SUBSCRIPT SMALL LETTER I
 ⱼ  2C7C	LATIN SUBSCRIPT SMALL LETTER J
 ₖ  2096	LATIN SUBSCRIPT SMALL LETTER K
 ₗ  2097	LATIN SUBSCRIPT SMALL LETTER L
 ₘ  2098	LATIN SUBSCRIPT SMALL LETTER M
 ₙ  2099	LATIN SUBSCRIPT SMALL LETTER N
 ₒ  2092	LATIN SUBSCRIPT SMALL LETTER O
 ₚ  209A	LATIN SUBSCRIPT SMALL LETTER P
 ᵣ  1D63	LATIN SUBSCRIPT SMALL LETTER R
 ₛ  209B	LATIN SUBSCRIPT SMALL LETTER S
 ₜ  209C	LATIN SUBSCRIPT SMALL LETTER T
 ᵤ  1D64	LATIN SUBSCRIPT SMALL LETTER U
 ᵥ  1D65	LATIN SUBSCRIPT SMALL LETTER V
 ₓ  2093	LATIN SUBSCRIPT SMALL LETTER X
 ᵦ  1D66	GREEK SUBSCRIPT SMALL LETTER BETA
 ᵧ  1D67	GREEK SUBSCRIPT SMALL LETTER GAMMA
 ͺ  037A	GREEK YPOGEGRAMMENI
 ᵨ  1D68	GREEK SUBSCRIPT SMALL LETTER RHO
 ᵩ  1D69	GREEK SUBSCRIPT SMALL LETTER PHI
 ᵪ  1D6A	GREEK SUBSCRIPT SMALL LETTER CHI
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)