#!/bin/bash
# makedict.sh [make dictionary]
# Processes a text file into a sorted list of the words it contains,
#+ for compiling dictionaries and for other lexicographic purposes.
# Exit status used when no readable input file is supplied.
E_BADARGS=65

# Extended-regex alternation of substrings; the word filters further down
# drop every word that contains one of them. It consists of the literal
# "cid" followed by each tripled lowercase letter ("aaa" .. "zzz").
# Generated rather than typed out so that no entry can be duplicated or
# forgotten.
REMOVETHESE="cid"
for letter in {a..z}; do
  REMOVETHESE+="|${letter}${letter}${letter}"
done
unset letter   # Loop variable only; do not leak it into the rest of the script.
# Require a readable file as the first argument. On failure the usage
# line goes to stderr, so it can never end up inside a captured or piped
# word list, and the script exits with $E_BADARGS.
if [[ ! -r "$1" ]]; then
  echo "Usage: $0 files-to-process" >&2
  exit "$E_BADARGS"
fi
# SORT="sort" # No longer necessary to define
#+ options to sort. Changed from the original script.
# Stop-word filter: ask AI::MicroStructure::WordBlacklist for its 'en'
# stop-word hash and join the hash keys with "|" so the result can be
# embedded in an extended-regex alternation.
stop=$(
  perl -MAI::MicroStructure::WordBlacklist -E '
    my $table = AI::MicroStructure::WordBlacklist::getStopWords("en");
    print join("|", keys %$table);
  '
)
#######################################
# Print the sorted, de-duplicated list of words found in one text file.
# Globals:
#   REMOVETHESE - ERE alternation; words containing a match are dropped.
#   stop        - ERE alternation; words equal to an entry are dropped.
# Arguments:
#   $1 - path of the file to read (further arguments are ignored).
# Outputs:
#   One lowercase word per line on stdout.
# Returns:
#   Status of the last grep: non-zero when no word survives filtering.
#######################################
function uniquemmasher(){
  # Quoted and preceded by "--" so a path containing whitespace, glob
  # characters or a leading dash reaches cat as one literal file name.
  cat -- "$1" |
    tr A-Z a-z |                   # Convert to lowercase.
    tr -c '\012a-z' '\012' |       # Every non-letter (spaces, digits,
                                   #+ punctuation, '#') becomes a newline,
                                   #+ so each line is now [a-z]* only.
    sort -u |                      # Sort and remove duplicates.
    grep -E -v "($REMOVETHESE)" |  # Drop words containing a junk substring.
    grep -E -v "^($stop)$" |       # Drop stop words (whole-word match).
    grep -E -v '^[a-z]{0,2}$'      # Drop empty lines and 1-2 letter words.
}
#######################################
# Print the sorted list of words found in one text file, keeping repeats
# (each word appears once per occurrence in the input).
# Globals:
#   REMOVETHESE - ERE alternation; words containing a match are dropped.
#   stop        - ERE alternation; words equal to an entry are dropped.
# Arguments:
#   $1 - path of the file to read (further arguments are ignored).
# Outputs:
#   One lowercase word per line on stdout.
# Returns:
#   Status of the last grep: non-zero when no word survives filtering.
#######################################
function masher(){
  # Quoted and preceded by "--" so a path containing whitespace, glob
  # characters or a leading dash reaches cat as one literal file name.
  cat -- "$1" |
    tr A-Z a-z |                   # Convert to lowercase.
    tr -c '\012a-z' '\012' |       # Every non-letter (spaces, digits,
                                   #+ punctuation, '#') becomes a newline,
                                   #+ so each line is now [a-z]* only.
    sort |                         # Sort; duplicates are kept on purpose.
    grep -E -v "($REMOVETHESE)" |  # Drop words containing a junk substring.
    grep -E -v "^($stop)$" |       # Drop stop words (whole-word match).
    grep -E -v '^[a-z]{0,2}$'      # Drop empty lines and 1-2 letter words.
}
# Mode switch: a second argument of exactly "1" selects the de-duplicated
# word list; any other value (or none) keeps repeated words.
# "$@" forwards each argument as its own word, so a file name that
# contains whitespace is not split apart on the way to the function.
if [[ "$2" == 1 ]]; then
  uniquemmasher "$@"
else
  masher "$@"
fi
# Always report success, whatever status the filter pipeline returned.
exit 0