The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
package String::Markov;

# ABSTRACT: A Moo-based, text-oriented Markov Chain module

our $VERSION = 0.009;

use 5.010;
use Moo;
use namespace::autoclean;

use Unicode::Normalize qw(normalize);
use List::Util qw(sum);

has normalize => (is => 'rw', default => sub { 'C' });
has do_chomp  => (is => 'rw', default => sub { 1 });
has null      => (is => 'ro', default => sub { "\0" });
has stable    => (is => 'ro', default => sub { 1 });
has order     => (is => 'ro', isa => sub {
	die "Need an integer greater than zero" if !$_[0] || $_[0] =~ /\D/;
}, default => sub { 2 });

has ['split_sep','join_sep'] => (
	is => 'rw',
	default => sub { undef }
);

has ['transition_count','row_sum'] => (
	is => 'ro',
	isa => sub { die "Need a hash ref" if ref $_[0] ne 'HASH'; },
	default => sub { {} }
);

around BUILDARGS => sub {
	my ($orig, $class, @arg) = @_;
	my %ahash;

	%ahash = @arg == 1 ? %{$arg[0]} : @arg;

	my $sep = delete $ahash{sep} // '';
	die "ERR: sep argument must be scalar; did you mean to set split_sep instead?" if ref $sep;
	$ahash{split_sep} //= $sep;
	$ahash{join_sep}  //= $sep;

	return $class->$orig(\%ahash);
};

sub join_prob {
	my ($self, $orig_prob) = @_;
	my %p;

	@p{@{$orig_prob->[0]}} = @{$orig_prob->[1]};

	return \%p;
}

sub split_prob {
	my ($self, $orig_prob) = @_;

	if ($self->stable) {
		my @k = sort keys %$orig_prob;
		return [
			\@k,
			[@{$orig_prob}{@k}],
		];
	} else {
		return [
			[keys %$orig_prob],
			[values %$orig_prob],
		];
	}
}

sub split_all_prob {
	my $self = shift;
	my $tc = $self->transition_count;
	my $nt = {};

	while (my ($state, $prob) = each %$tc) {
		$nt->{$state} = $self->split_prob($prob);
	}

	%$tc = %$nt;
}

sub split_line {
	my ($self, $sample) = @_;
	if (my $norm = $self->normalize) {
		$sample = normalize($norm, $sample);
	}
	return split($self->split_sep, $sample);
}

sub add_sample {
	my ($self, $sample) = @_;
	my $n     = $self->order;
	my $null  = $self->null;

	my $sref  = ref $sample;
	my @nms = ($null,) x $n;

	if ($sref eq 'ARRAY') {
		push @nms, @$sample;
	} elsif (!$sref) {
		die 'ERR: missing split separator,' if !defined $self->split_sep;
		push @nms, $self->split_line($sample);
	} else {
		die "ERR: bad sample type $sref";
	}

	push @nms, $null;

	my $sep   = $self->join_sep // '';
	my $count = $self->transition_count;
	my $sum   = $self->row_sum;
	for my $i (0 .. ($#nms - $n)) {
		my $cur = join($sep, @nms[$i .. ($i + $n - 1)]);
		my $nxt = $nms[$i + $n];
		my $prob = $count->{$cur};
		if ($prob && ref $prob ne 'HASH') {
			$count->{$cur} = $self->join_prob($prob);
		}
		++$count->{$cur}{$nxt};
		++$sum->{$cur};
	}

	return $self;
}

sub add_files {
	my ($self, @files) = @_;
	my $do_chomp = $self->do_chomp;

	local @ARGV = @files;
	while(my $sample = <>) {
		chomp $sample if $do_chomp;
		$self->add_sample($sample);
	}

	$self->split_all_prob();

	return $self;
}

sub sample_next_state {
	my ($self, @cur_state) = @_;
	die "ERR: wrong amount of state" if @cur_state != $self->order;

	my $count = $self->transition_count;
	my $sum   = $self->row_sum;

	my $cur = join($self->join_sep // '', @cur_state);
	my $thresh = $sum->{$cur};
	return undef if !$thresh;

	$thresh *= rand();

	my $prob = $count->{$cur};
	if (ref $prob ne 'ARRAY') {
		$prob = $self->split_prob($prob);
		$count->{$cur} = $prob;
	}

	my $s = 0;
	my $i = 0;
	my ($k, $v) = @{$prob};
	do {
		$s += $v->[$i];
	} while ($thresh > $s && ++$i);
	return $k->[$i];
}

sub generate_sample {
	my ($self) = @_;

	my $null = $self->null;
	my $n  = $self->order;
	my $sep = $self->join_sep // '';
	my @nm = ($null,) x $n;

	do {
		push @nm, $self->sample_next_state(@nm[-$n .. -1]);
	} while ($nm[-1] ne $null);

	@nm = @nm[$n .. ($#nm-1)];

	return wantarray ?
		@nm :
		defined $self->join_sep ?
			join($sep, @nm) :
			\@nm;

}

__PACKAGE__->meta->make_immutable;

1;

__END__

=pod

=head1 NAME

String::Markov - A Moo-based, text-oriented Markov Chain module

=head1 VERSION

version 0.009

=head1 SYNOPSIS

  my $mc = String::Markov->new();

  $mc->add_files(@ARGV);

  print $mc->generate_sample . "\n" for (1..20);


  my $mc = String::Markov->new(order => 1, sep => ' ');

  for my $stanza (@The_Rime_of_the_Ancient_Mariner) {
  	$mc->add_sample($stanza);
  }
  
  print $mc->generate_sample;

=head1 DESCRIPTION

String::Markov is a Moo-based Markov Chain module, designed to easily consume
and produce text.

=head1 ATTRIBUTES

=head2 order

The order of the chain, i.e. how much past state is used to determine the next
state. The default of 2 is reasonable for constructing new names/words when
splitting into characters, or for long-ish works when splitting into words.

=head2 split_sep

How states are split. This value (or I<sep>; see L</new()>) is passed directly
as the first argument to L<perlfunc/split>, so using C<' '> has special
semantics.  Regular expressions will work as well, but be aware that any
matched characters are discarded.

=head2 join_sep

How states are joined. This value (or I<sep>; see L</new()>) is passed as the
first argument of L<perlfunc/join>. In addition, it is used to build keys for
internal hashes. This can cause problems in cases where split_sep() produces
sequences like C<'ae', 'io'>, C<'a', 'ei', 'o'>, or C<'ae', 'i', 'o'>, which
will all turn into C<'aeio'> with the default of C<''>. If I<join_sep> is
C<'*'> instead, then three unique keys result: C<'ae*io'>, C<'a*ei*o'>, and
C<'ae*i*o'>. See L</add_sample()>.

=head2 null

What is used to mark the beginning and end of a sample internally. The default
of C<"\0"> should work for UTF-8 text, but may cause problems with UTF-16 or
other encodings.

=head2 stable

Whether or not to always produce the same results from the same internal state.
If stable is true, then the same random seed (see L<perlfunc/srand>) will
produce identical results for chains created from the same inputs.

=head2 normalize

Whether to normalize Unicode strings. This value, if true, is passed as the
first argument to Unicode::Normalize::normalize. The default C<'C'> should do
what most people expect, but it may be the case that C<'D'> is what you want.
If you're not using Unicode, set this to undef.

=head2 do_chomp

Whether to L<perlfunc/chomp> lines when reading files. See L</add_files()>.

=head1 METHODS

=head2 new()

  # Defaults
  my $mc = String::Markov->new(
  	order     => 2,
  	sep       => '',
  	split_sep => undef,
  	join_sep  => undef,
  	null      => "\0",
	stable    =>  1,
  	normalize => 'C',
	do_chomp  => 1,
  );

The I<sep> argument doesn't correlate to an attribute, but is used to
initialize I<split_sep> and/or I<join_sep> if either is undefined.

See L</ATTRIBUTES>.

=head2 split_line()

This is the method L</add_sample()> calls when it is passed a non-ref argument.
It returns an array of states (usually individual characters or words) that are
used to build the Markov Chain model.

The default implementation is equivalent to:

  sub split_line {
  	my ($self, $sample) = @_;
  	$sample = normalize($self->normalize, $sample) if $self->normalize;
  	return split($self->split_sep, $sample);
  }

This method can be overridden to deal with unusual data.

=head2 add_sample()

This method adds samples to build the Markov Chain model. It takes a single
argument, which can be either a string or an array reference. If the argument
is an array reference, its elements are directly used to update the Markov
Chain. If it is a string, add_sample() uses the split_line() method to create
an array of states, and then updates the Markov Chain.

Note that this function generates hash keys for the transition matrix. The keys
are built according to the I<order>, I<null>, and I<join_sep> attributes, so if
an instance is created with:

  my $mc = String::Markov->new(null => '!', order => 2, join_sep => '*');
  $mc->add_sample($_) for (@sample_lines);

Then the internal transition matrix might look like:

  {
    '!*!' => { 'A' => 5, 'B' => 7, ... }, # Initial state
    '!*A' => { ... },
    '!*B' => { ... },
    ...
    'x*y' => { '!' => 4 },                # always end after 'xy'
    'y*z' => { '!' => 3, 'q' => 2 },      # sometimes end after 'yz'
    ...
  }

=head2 add_files()

This is a simple convenience method, designed to replace code like:

  while(<>) { chomp; $mc->add_sample($_) }

It takes a list of file names as arguments, and adds them line-by-line.

=head2 generate_sample()

This method returns a sequence of states, generated from the Markov Chain using
the Monte Carlo method.

If called in scalar context, the states are joined with I<join_sep> before
being returned.

=head1 SEE ALSO

L<Algorithm::MarkovChain>

=head1 AUTHOR

Grant Mathews <gmathews@cpan.org>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2014 by Grant Mathews.

This is free software, licensed under:

  The Artistic License 2.0 (GPL Compatible)

=cut