The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
#!/bin/bash
#
# Convert each PDF in the current directory to filtered text and print a
# frequency report for it.
#
# For every regular file directly in "." whose name matches *pdf
# (case-insensitive) the script:
#   1. skips it when "<name>.txt" already exists here or in tmp-test/;
#   2. writes a date stamp plus the filtered pdf2txt output to "<name>.txt";
#   3. moves that file into tmp-test/;
#   4. pipes the file (minus lines matching a digit/letter pattern) into
#      `data-freq --limit 30`.
#
# Requires `pdf2txt` and `data-freq` on PATH. Exits non-zero when at least
# one conversion failed.

# Make a failing pdf2txt fail the whole conversion pipeline, not just itself.
set -o pipefail

readonly OUT_DIR="tmp-test"

#######################################
# Convert one PDF into "${OUT_DIR}/<pdf>.txt".
# Globals:   OUT_DIR (read)
# Arguments: $1 - PDF file name, relative to the current directory
# Outputs:   error message on stderr when the conversion fails
# Returns:   0 on success, 1 if the conversion or the move failed
#######################################
convert_pdf() {
  local pdf=$1
  local staged="${pdf}.txt"

  # The text is staged in the current directory and moved only after the
  # whole pipeline succeeded; otherwise a half-written file would make later
  # runs skip this PDF.
  # The first sed expression drops every character that is not a letter,
  # digit, comma or space; the rest prefix section keywords with @@MARKER@@.
  # "./" keeps a file name starting with "-" from being read as an option.
  if ! {
    date +"%Y-%m-%d +%H"
    pdf2txt "./${pdf}" \
      | sed \
        -e 's/[^a-zA-Z0-9, ]*//g' \
        -e 's/Page/@@MARKER@@ Page/g' \
        -e 's/CHAPTER/@@MARKER@@ CHAPTER/g' \
        -e 's/SUMMARY/@@MARKER@@ SUMMARY/g' \
        -e 's/REFERENCES/@@MARKER@@ REFERENCES/g' \
        -e 's/OVERVIEW/@@MARKER@@ OVERVIEW/g' \
        -e 's/Chapter/@@MARKER@@ Chapter/g' \
        -e 's/Index/@@MARKER@@ Index/g'
  } > "$staged"; then
    printf 'error: converting %s failed\n' "$pdf" >&2
    rm -f -- "$staged"
    return 1
  fi

  mv -- "$staged" "${OUT_DIR}/" || return 1
}

#######################################
# Feed a converted text file to data-freq, leaving out lines that match the
# digit/letter pattern below.
# Arguments: $1 - path of the text file
# Outputs:   whatever `data-freq --limit 30` prints
#######################################
summarize_text() {
  local text_file=$1

  grep -E -v "^[0-9]*[A-Z]$|[0-9][0-9]$|[0-9]$|^[*]$" < "$text_file" \
    | data-freq --limit 30
}

#######################################
# Process every PDF that sits directly in the current directory.
# Globals:   OUT_DIR (read)
# Outputs:   "skip <name>" for files already converted, plus the report of
#            each newly converted file
# Returns:   0 if every conversion succeeded, 1 otherwise
#######################################
main() {
  local path name status=0

  mkdir -p -- "$OUT_DIR" || return 1

  # NUL-delimited names survive spaces and newlines. They arrive on fd 3 so
  # that commands inside the loop cannot swallow the list through stdin.
  # -maxdepth 1: files are addressed by bare name, which only works for
  # entries of the current directory.
  while IFS= read -r -d '' -u 3 path; do
    name=${path#./}

    if [[ -f "${OUT_DIR}/${name}.txt" || -f "${name}.txt" ]]; then
      echo "skip $name"
      continue
    fi

    if convert_pdf "$name"; then
      summarize_text "${OUT_DIR}/${name}.txt"
    else
      status=1
    fi
  done 3< <(find . -maxdepth 1 -type f -iname "*pdf" -print0)

  return "$status"
}

main "$@"