The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/bin/bash
# makedict.sh  [make dictionary]
#  Turns a text file into a sorted list of the words it contains, for
#+ compiling dictionaries and for other lexicographic purposes.


E_BADARGS=65                        #  Exit status used when the file
                                    #+ argument is missing or unreadable.

#  Extended-regex alternation used as an unanchored filter: any word that
#+ merely CONTAINS one of these substrings is dropped from the output.
#  Each alternative is listed exactly once (duplicates add nothing to an
#+ alternation).
REMOVETHESE="cid|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|jjj|kkk|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz"

if [ ! -r "$1" ]                    #  Need a readable file as the
then                                #+ first argument.
  #  The usage text goes to stderr so that it can never be mistaken for
  #+ part of the word list, which is written to stdout.
  #  Only the first argument is read as a file; an optional second
  #+ argument of "1" asks for the de-duplicated list.
  echo "Usage: $0 file-to-process [1]" >&2
  exit $E_BADARGS
fi


# SORT="sort"                       #  No longer necessary to define
                                    #+ options to sort. Changed from
                                    #+ the original script.

#  Ask the AI::MicroStructure::WordBlacklist Perl module for its 'en'
#+ stop words and join them with '|' so the result can be dropped
#+ straight into an extended-regex alternation.
#  getStopWords is treated as returning a hash reference; only its keys
#+ are used.  If perl or the module is unavailable, $stop ends up empty.
stop=$(
  perl -MAI::MicroStructure::WordBlacklist -E '
    my $words = AI::MicroStructure::WordBlacklist::getStopWords("en");
    print join("|", keys %$words);
  '
)



#  uniquemmasher FILE
#  Write every distinct word of FILE to stdout, lowercased, one per
#+ line, in sorted order.
#  A "word" is a maximal run of the letters a-z; every other character
#+ (spaces, digits, punctuation) acts as a separator.
#  Dropped from the output:
#    - words containing any alternative of $REMOVETHESE,
#    - words equal to one of the '|'-separated entries of $stop,
#    - words of one or two letters.
#  Only $1 is read; any further arguments are ignored.
#  Globals read: REMOVETHESE, stop.
function uniquemmasher(){
  tr 'A-Z' 'a-z' < "$1" |           #  Convert to lowercase. The quoted
                                    #+ redirect keeps file names with
                                    #+ spaces intact (and needs no cat).
        tr -c '\012a-z' '\012' |    #  Rather than deleting non-alpha
                                    #+ chars, change them (spaces too)
                                    #+ to newlines.
        sort -u |                   # Sort and remove duplicates.
        grep -E -v "($REMOVETHESE)" |   # Substring match, see above.
        grep -E -v "^($stop)$" |    # Whole-word match on stop words.
        grep -E -v '^[a-z]{1,2}$' | # Drop one- and two-letter words.
        grep -v '^$'                # Drop the empty lines left by tr.
}

#  masher FILE
#  Write every word of FILE to stdout, lowercased, one per line, in
#+ sorted order. Repeated words are kept, so each word appears once
#+ per occurrence.
#  A "word" is a maximal run of the letters a-z; every other character
#+ (spaces, digits, punctuation) acts as a separator.
#  Dropped from the output:
#    - words containing any alternative of $REMOVETHESE,
#    - words equal to one of the '|'-separated entries of $stop,
#    - words of one or two letters.
#  Only $1 is read; any further arguments are ignored.
#  Globals read: REMOVETHESE, stop.
function masher(){
  tr 'A-Z' 'a-z' < "$1" |           #  Convert to lowercase. The quoted
                                    #+ redirect keeps file names with
                                    #+ spaces intact (and needs no cat).
        tr -c '\012a-z' '\012' |    #  Rather than deleting non-alpha
                                    #+ chars, change them (spaces too)
                                    #+ to newlines.
        sort |                      # Sort; duplicates stay in.
        grep -E -v "($REMOVETHESE)" |   # Substring match, see above.
        grep -E -v "^($stop)$" |    # Whole-word match on stop words.
        grep -E -v '^[a-z]{1,2}$' | # Drop one- and two-letter words.
        grep -v '^$'                # Drop the empty lines left by tr.
}


if [ "$2" = 1 ]                     #  A second argument of "1" selects
then                                #+ the de-duplicated word list;
  uniquemmasher "$@"                #+ anything else keeps repeats.
else
  masher "$@"                       #  "$@" hands the arguments over
fi                                  #+ unsplit, so a file name with
                                    #+ spaces stays a single word.

exit 0                              #  Always reports success, whatever
                                    #+ the pipeline's own status was.