The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# -*- perl -*-

# t/001_load.t - check module loading and create testing directory

use Carp;
use strict;
use warnings;

use Test::More tests => 5;

my $totalTests = 10;
BEGIN { use_ok( 'Text::Ngramize' ); }
my $object = Text::Ngramize->new ();
isa_ok ($object, 'Text::Ngramize');
ok (testNgramsHashesComputedOverRange ($totalTests), 'Ngram hash computing.');
ok (testFindingOfWords($totalTests), 'Word finding.' );
ok (testNgramsComputedOverRange ($totalTests), 'Ngram computing.');


# type of tests to perform.
# create a string with the position of words knows and see that they are found
# and the n-grams are computed correctly.
# compute hash values and strings and ensure equal strings and equal hashes.
# add the same string to the beginning and end of text and compare the first and
# last hash values computed.

sub testNgramsHashesComputedOverRange
{
  my ($TotalTests) = @_;
  $TotalTests = 50 unless defined $TotalTests;
  my $typeOfNgrams;
  foreach my $typeOfNgrams ('asc', 'characters', 'words')
  {
    for (my $sizeOfNgrams = 1; $sizeOfNgrams < 64; $sizeOfNgrams += $sizeOfNgrams)
    {
      print 'testing ' . $typeOfNgrams . ' ngram hashes of size ' . $sizeOfNgrams . ' ';
      for (my $i = 0; $i < $TotalTests; $i++)
      {
        return 0 unless testNgramHashesComputedOnce ($sizeOfNgrams, $typeOfNgrams);
        print '.';
      }
      print "\n";
    }
  }
  return 1;
}

sub testNgramHashesComputedOnce
{
  my ($SizeOfNgrams, $TypeOfNgrams) = @_;
  $SizeOfNgrams = 1 unless defined $SizeOfNgrams;
  $TypeOfNgrams = 'characters' unless defined $TypeOfNgrams;

  # build the string to test on.

  # get a random word.
  my $ascOnly = ($TypeOfNgrams =~ m/^a/i);
  my $word = getRandomWord (2 * $SizeOfNgrams, $ascOnly);
  if (length ($word) < $SizeOfNgrams)
  {
    $word .= 'a' x ($SizeOfNgrams - length ($word));
  }
  $word = substr ($word, 0, $SizeOfNgrams);

  # get some random text.
  my ($text, undef) = getRandomText ();

  # prefix and suffix the word to the text. by doing this we ensure
  # the first and last ngram hash values generated should be the same,
  # for characters or bytes.
  my $prefix = $word . ' ';
  $prefix = $prefix x $SizeOfNgrams;
  my $suffix = ' ' . $word;
  $suffix = $suffix x $SizeOfNgrams;
  $text = $prefix . $text . $suffix;
  my $ngramizer = Text::Ngramize->new (sizeOfNgrams => $SizeOfNgrams, typeOfNgrams => $TypeOfNgrams);
  my $listOfNgramHashValuesWithPositions = $ngramizer->getListOfNgramHashValuesWithPositions (text => $text);
  my $listOfNgramHashValues = $ngramizer->getListOfNgramHashValues (text => $text);
  return 0 if (@$listOfNgramHashValuesWithPositions != @$listOfNgramHashValues);

  # first and last values should be same given how text was constructed.
  return 0 if ($listOfNgramHashValues->[-1] != $listOfNgramHashValues->[0]);

  # hash values of both methods should be equal.
  for (my $i = 0; $i < @$listOfNgramHashValuesWithPositions; $i++)
  {
    return 0 if ($listOfNgramHashValuesWithPositions->[$i][0] != $listOfNgramHashValues->[$i]);
  }

  return 1;
}

# tests the methods getListOfNgramsWithPositions and getListOfNgrams over
# a range of ngram sizes for 'asc' and 'character' ngrams.
sub testNgramsComputedOverRange
{
  my ($TotalTests) = @_;
  $TotalTests = 50 unless defined $TotalTests;
  my $typeOfNgrams;
  foreach my $typeOfNgrams ('asc', 'characters')
  {
    for (my $sizeOfNgrams = 1; $sizeOfNgrams < 64; $sizeOfNgrams += $sizeOfNgrams)
    {
      print 'testing ' . $typeOfNgrams . ' ngrams of size ' . $sizeOfNgrams . ' ';
      for (my $i = 0; $i < $TotalTests; $i++)
      {
        return 0 unless testNgramsComputedOnce ($sizeOfNgrams, $typeOfNgrams);
        print '.';
      }
      print "\n";
    }
  }
  return 1;
}

# tests the methods getListOfNgramsWithPositions and getListOfNgrams once on
# randomly generated text and ngrams specified size.
sub testNgramsComputedOnce
{
  my ($SizeOfNgrams, $TypeOfNgrams) = @_;
  $SizeOfNgrams = 1 unless defined $SizeOfNgrams;
  $TypeOfNgrams = 'characters' unless defined $TypeOfNgrams;

  my ($text, undef) = getRandomText ();
  my $ngramizer = Text::Ngramize->new (sizeOfNgrams => $SizeOfNgrams, typeOfNgrams => $TypeOfNgrams);
  my $listOfNgramsWithPositions = $ngramizer->getListOfNgramsWithPositions (text => $text);
  my $listOfNgrams = $ngramizer->getListOfNgrams (text => $text);
  return 0 if (@$listOfNgramsWithPositions != @$listOfNgrams);

  if ($TypeOfNgrams =~ /^c/i)
  {
    for (my $i = 0; $i < @$listOfNgramsWithPositions; $i++)
    {
      return 0 if ($listOfNgramsWithPositions->[$i][0] ne $listOfNgrams->[$i]);
      return 0 if (substr ($text, $listOfNgramsWithPositions->[$i][1], $listOfNgramsWithPositions->[$i][2]) ne $listOfNgrams->[$i]);
    }
  }
  elsif ($TypeOfNgrams =~ /^a/i)
  {
    use bytes;
    for (my $i = 0; $i < @$listOfNgramsWithPositions; $i++)
    {
      return 0 if ($listOfNgramsWithPositions->[$i][0] ne $listOfNgrams->[$i]);
      return 0 if (substr ($text, $listOfNgramsWithPositions->[$i][1], $listOfNgramsWithPositions->[$i][2]) ne $listOfNgrams->[$i]);
    }
  }
  return 1;
}

# tests the methods getListOfNgramsWithPositions and getListOfNgrams multiple
# times on different randomly generated text and ngrams of size 1.
sub testFindingOfWords
{
  my ($TotalTests) = @_;
  $TotalTests = 50 unless defined $TotalTests;

  print 'testing word finding';
  for (my $i = 0; $i < $TotalTests; $i++)
  {
    return 0 unless testFindingOfWordsOnce ();
    print '.';
  }
  print "\n";
  return 1;
}

# tests the methods getListOfNgramsWithPositions and getListOfNgrams once on
# randomly generated text and ngrams of size 1.
sub testFindingOfWordsOnce
{
  my ($text, $wordInfo) = getRandomText ();
  my $ngramizer = Text::Ngramize->new (typeOfNgrams => 'words', sizeOfNgrams => 1);
  my $listOfWordsWithPositions = $ngramizer->getListOfNgramsWithPositions (text => $text);
  my $listOfWords = $ngramizer->getListOfNgrams (text => $text);
  return 0 if (@$listOfWordsWithPositions != @$wordInfo);
  return 0 if (@$listOfWords != @$wordInfo);

  for (my $i = 0; $i < @$listOfWordsWithPositions; $i++)
  {
    return 0 if ($listOfWords->[$i] ne $wordInfo->[$i][0]);
    return 0 if ($listOfWordsWithPositions->[$i][0] ne $wordInfo->[$i][0]);
    return 0 if ($listOfWordsWithPositions->[$i][1] != $wordInfo->[$i][1]);
    return 0 if ($listOfWordsWithPositions->[$i][2] != $wordInfo->[$i][2]);
  }
  return 1;
}

# generates a random word with at least one character.
sub getRandomWord # ($MaxCharactersInWord, $Type)
{
  my $MaxCharactersInWord = shift;
  my $AscOnly = shift;
  $AscOnly = 0 unless defined $AscOnly;

  $MaxCharactersInWord = 10 unless defined $MaxCharactersInWord;
  $MaxCharactersInWord = int abs $MaxCharactersInWord;
  my $charsInWord = 1 + int rand $MaxCharactersInWord;
  $charsInWord = 1 if ($charsInWord < 1);
  my @characters;
  for (my $i = 0; $i < 2 * $charsInWord; $i++)
  {
    my $ord;
    if ($AscOnly) { $ord = ord ('a') + int rand (ord('z') - ord ('a') + 1); }
    else { $ord = int rand (0xffff); }
    my $chr;
    my $doNext = 0;
    eval
    {
      no warnings;
      $chr = chr $ord;
      $doNext = 1 if ($chr =~ /^\P{IsAlphabetic}$/);
    };
    next if ($doNext || $@);
    next unless length ($chr);
    push @characters, $chr;
    last if (@characters >= $charsInWord);
  }
  push @characters, 'a' if (@characters == 0);
  return join ('', @characters);
}

# generates a random sentence of words ending with a period.
sub getRandomSentence # ($MaxWordsInSentence, $StartingPosition)
{
  my $MaxWordsInSentence = shift;
  $MaxWordsInSentence = 10 unless defined $MaxWordsInSentence;
  $MaxWordsInSentence = int abs $MaxWordsInSentence;
  $MaxWordsInSentence = 1 if ($MaxWordsInSentence < 1);

  my $StartingPosition = shift;
  $StartingPosition = 0 unless defined $StartingPosition;

  my @punctuation;
  foreach my $item (',', '#', '*', '_', '-', '+', '=', ';'. ':')
  {
    push @punctuation, $item if ($item =~ /^\P{IsAlphabetic}$/);
  }

  my @words;
  my @allWordInfo;
  my $wordPosition = $StartingPosition;
  for (my $i = 0; $i < $MaxWordsInSentence; $i++)
  {
    # add word with at least one letter.
    my $word = getRandomWord;
    my $wordSize = length ($word);
    next if ($wordSize < 1);
    push @words, $word;

    # store and update the position info of the word.
    push @allWordInfo, [$word, $wordPosition, $wordSize];
    $wordPosition += $wordSize;

    # if done, exit the loop.
    last unless ($i < $MaxWordsInSentence);

    # add a space 90% of the time.
    if (rand() < .9)
    {
      my $space = ' ';

      # sometimes add extra space.
      if (rand() < .1)
      {
        my $size = 2 + int rand (4);
        $space = ' ' x $size;
      }
      push @words, $space;
      $wordPosition += length ($space);
    }
    else
    {
      my $punc = $punctuation[rand(scalar(@punctuation))];
      push @words, $punc;
      $wordPosition += length ($punc);
    }
  }

  # compute the position and length of the words.
  my $sentence = join ('', @words) . '.';

  # sanity check, i really hope this never croaks.
  my $offset = $allWordInfo[0]->[1];
  foreach my $wordInfo (@allWordInfo)
  {
    if (substr ($sentence, $wordInfo->[1]-$offset, $wordInfo->[2]) ne $wordInfo->[0])
    {
      croak "progamming error: substr '" . $wordInfo->[0] . "' not equal to '" . substr ($sentence, $wordInfo->[1]-$offset, $wordInfo->[2]) . "'.\n";
    }
  }

  return ($sentence, \@allWordInfo);
}


sub getRandomText
{
  # get the total number of sentences to generate.
  my $TotalSentences = shift;
  $TotalSentences = 50 unless defined $TotalSentences;
  $TotalSentences = abs $TotalSentences;
  $TotalSentences = 1 if ($TotalSentences < 1);

  my @allSentences;
  my @allWordInfo;
  my $offset = 0;
  for (my $i = 0; $i < $TotalSentences; $i++)
  {
    my ($sentence, $wordInfo) = getRandomSentence (15, $offset);
    $offset += length ($sentence);
    push @allSentences, $sentence;
    push @allWordInfo, @$wordInfo;
  }

  # concat all the sentences.
  my $text = join ('', @allSentences);

  # sanity check, i really hope this also never croaks.
  foreach my $wordInfo (@allWordInfo)
  {
    if (substr ($text, $wordInfo->[1], $wordInfo->[2]) ne $wordInfo->[0])
    {
      croak "progamming error: substr '" . $wordInfo->[0] . "' not equal to '" . substr ($text, $wordInfo->[1], $wordInfo->[2]) . "'.\n";
    }
  }

  return ($text, \@allWordInfo);
}