lib/Hailo/Storage.pm - metacpan.org

package Hailo::Storage;
BEGIN {
  $Hailo::Storage::AUTHORITY = 'cpan:AVAR';
}
BEGIN {
  $Hailo::Storage::VERSION = '0.64';
}

use 5.010;
use Any::Moose;
use Any::Moose 'X::StrictConstructor';
use DBI;
use Hailo::Storage::Schema;

has dbd => (
    isa           => 'Str',
    is            => 'ro',
    lazy_build    => 1,
    documentation => "The DBD::* driver we're using",
);

has dbd_options => (
    isa           => 'HashRef',
    is            => 'ro',
    lazy_build    => 1,
    documentation => 'Options passed as the last argument to DBI->connect()',
);

sub _build_dbd_options {
    my ($self) = @_;
    return {
        RaiseError => 1
    };
}

has dbh => (
    isa           => 'DBI::db',
    is            => 'ro',
    lazy_build    => 1,
    documentation => 'Our DBD object',
);

sub _build_dbh {
    my ($self) = @_;
    my $dbd_options = $self->dbi_options;

    return DBI->connect($self->dbi_options);
};

has dbi_options => (
    isa           => 'ArrayRef',
    is            => 'ro',
    auto_deref    => 1,
    lazy_build    => 1,
    documentation => 'Options passed to DBI->connect()',
);

sub _build_dbi_options {
    my ($self) = @_;
    my $dbd = $self->dbd;
    my $dbd_options = $self->dbd_options;
    my $db = $self->brain // '';

    my @options = (
        "dbi:$dbd:dbname=$db",
        '',
        '',
        $dbd_options,
    );

    return \@options;
}

has _engaged => (
    isa           => 'Bool',
    is            => 'rw',
    default       => 0,
    documentation => 'Have we done setup work to get this database going?',
);

has sth => (
    isa        => 'HashRef',
    is         => 'ro',
    lazy_build => 1,
    documentation => 'A HashRef of prepared DBI statement handles',
);

sub _build_sth {
    my ($self) = @_;
    return Hailo::Storage::Schema->sth($self->dbd, $self->dbh, $self->order);
}

has _boundary_token_id => (
    isa => 'Int',
    is  => 'rw',
);

# bootstrap the database
sub _engage {
    my ($self) = @_;

    if ($self->initialized) {
        # Check the order we've been given and retrieve it from the
        # database if there's nothing odd going on.
        $self->_engage_initialized_check_and_set_order;

        # Likewise for the Tokenizer
        $self->_engage_initialized_check_and_set_tokenizer;

        $self->sth->{token_id}->execute(0, '');
        my $id = $self->sth->{token_id}->fetchrow_array;
        $self->_boundary_token_id($id);
    }
    else {
        Hailo::Storage::Schema->deploy($self->dbd, $self->dbh, $self->order);

        # Set metadata in the database for use by subsequent
        # invocations
        {
            # Don't change order again
            my $order = $self->order;
            $self->sth->{set_info}->execute('markov_order', $order);

            # Warn if the tokenizer changes
            my $tokenizer = $self->tokenizer_class;
            $self->sth->{set_info}->execute('tokenizer_class', $tokenizer);
        }

        $self->sth->{add_token}->execute(0, '');
        $self->sth->{last_token_rowid}->execute();
        my $id = $self->sth->{last_token_rowid}->fetchrow_array();
        $self->_boundary_token_id($id);
    }

    $self->_engaged(1);

    return;
}

sub _engage_initialized_check_and_set_order {
    my ($self) = @_;

    my $sth = $self->dbh->prepare(qq[SELECT text FROM info WHERE attribute = ?;]);
    $sth->execute('markov_order');
    my $db_order = $sth->fetchrow_array();

    my $my_order = $self->order;
    if ($my_order != $db_order) {
        if ($self->hailo->{has_custom_order}->()) {
            die <<"DIE";
You've manually supplied an order of `$my_order' to Hailo but you're
loading a brain that has the order `$db_order'.

Hailo will automatically load the order from existing brains, however
you've constructed Hailo and manually specified an order not
equivalent to the existing order of the database.

Either supply the correct order or omit the order attribute
altogether. We could continue but I'd rather die since you're probably
expecting something I can't deliver.
DIE
        }

        $self->order($db_order);
        $self->hailo->{set_order}->($db_order);
    }

    return;
}

sub _engage_initialized_check_and_set_tokenizer {
    my ($self) = @_;

    my $sth = $self->dbh->prepare(qq[SELECT text FROM info WHERE attribute = ?;]);
    $sth->execute('tokenizer_class');
    my $db_tokenizer_class = $sth->fetchrow_array;
    my $my_tokenizer_class = $self->tokenizer_class;

    # defined() because we can't count on old brains having this
    if (defined $db_tokenizer_class
        and $my_tokenizer_class ne $db_tokenizer_class) {
        if ($self->hailo->{has_custom_tokenizer_class}->()) {
            die <<"DIE";
You've manually supplied a tokenizer class `$my_tokenizer_class' to
Hailo, but you're loading a brain that has the tokenizer class
`$db_tokenizer_class'.

Hailo will automatically load the tokenizer class from existing
brains, however you've constructed Hailo and manually specified an
tokenizer class not equivalent to the existing tokenizer class of the
database.

Either supply the correct tokenizer class or omit the order attribute
altogether. We could continue but I'd rather die since you're probably
expecting something I can't deliver.
DIE
        }

        $self->tokenizer_class($db_tokenizer_class);
        $self->hailo->{set_tokenizer_class}->($db_tokenizer_class);
    }

    return;
}

sub start_training {
    my ($self) = @_;
    $self->_engage() unless $self->_engaged;
    $self->start_learning();
    return;
}

sub stop_training {
    my ($self) = @_;
    $self->stop_learning();
    return;
}

sub start_learning {
    my ($self) = @_;
    $self->_engage() unless $self->_engaged;

    # start a transaction
    $self->dbh->begin_work;
    return;
}

sub stop_learning {
    my ($self) = @_;
    # finish a transaction
    $self->dbh->commit;
    return;
}

# See if SELECT count(*) FROM info; fails. If not we assume that we
# have an up and running database.
sub initialized {
    my ($self) = @_;
    my $dbh = $self->dbh;

    my ($err, $warn, $res);
    eval {
        # SQLite will warn 'no such table info'
        local $SIG{__WARN__} = sub { $err = $_[0] };

        # If it doesn't warn trust that it dies here
        local ($@, $!);
        $res = $dbh->do("SELECT count(*) FROM info;");
    };

    return (not $err and not $warn and defined $res);
}

# return some statistics
sub totals {
    my ($self) = @_;
    $self->_engage() unless $self->_engaged;

    $self->sth->{token_total}->execute();
    my $token = $self->sth->{token_total}->fetchrow_array - 1;
    $self->sth->{expr_total}->execute();
    my $expr = $self->sth->{expr_total}->fetchrow_array // 0;
    $self->sth->{prev_total}->execute();
    my $prev = $self->sth->{prev_total}->fetchrow_array // 0;
    $self->sth->{next_total}->execute();
    my $next = $self->sth->{next_total}->fetchrow_array // 0;

    return $token, $expr, $prev, $next;
}

__PACKAGE__->meta->make_immutable;

=encoding utf8

=head1 NAME

Hailo::Storage - A base class for L<Hailo> L<storage|Hailo::Role::Storage> backends

=head1 METHODS

The following two methods must to be implemented by subclasses:

=head2 C<_build_dbd>

Should return the name of the database driver (e.g. 'SQLite') which will be
passed to L<DBI|DBI>.

=head2 C<_build_dbd_options>

Subclasses can override this method to add options of their own. E.g:

    override _build_dbd_options => sub {
        return {
            %{ super() },
            sqlite_unicode => 1,
        };
    };

=head1 Comparison of backends

This benchmark shows how the backends compare when training on the
small testsuite dataset as reported by the F<utils/hailo-benchmark>
utility (found in the distribution):

                         Rate DBD::Pg DBD::mysql DBD::SQLite/file DBD::SQLite/memory
    DBD::Pg            2.22/s      --       -33%             -49%               -56%
    DBD::mysql         3.33/s     50%         --             -23%               -33%
    DBD::SQLite/file   4.35/s     96%        30%               --               -13%
    DBD::SQLite/memory 5.00/s    125%        50%              15%                 --

Under real-world workloads SQLite is much faster than these results
indicate since the time it takes to train/reply is relative to the
existing database size. Here's how long it took to train on a 214,710
line IRC log on a Linode 1080 with Hailo 0.18:

=over

=item * SQLite

    real    8m38.285s
    user    8m30.831s
    sys     0m1.175s

=item * MySQL

    real    48m30.334s
    user    8m25.414s
    sys     4m38.175s

=item * PostgreSQL

    real    216m38.906s
    user    11m13.474s
    sys     4m35.509s

=back

In the case of PostgreSQL it's actually much faster to first train
with SQLite, dump that database and then import it with L<psql(1)>,
see L<failo's README|http://github.com/hinrik/failo> for how to do
that.

However, replying with an existing database (using
F<utils/hailo-benchmark-replies>) yields different results. SQLite can
reply really quickly without being warmed up (which is the typical
usecase for chatbots) but once PostgreSQL and MySQL are warmed up they
start replying faster:

Here's a comparison of doing 10 replies:

                        Rate PostgreSQL MySQL SQLite-file SQLite-file-28MB SQLite-memory
    PostgreSQL        71.4/s         --  -14%        -14%             -29%          -50%
    MySQL             83.3/s        17%    --          0%             -17%          -42%
    SQLite-file       83.3/s        17%    0%          --             -17%          -42%
    SQLite-file-28MB 100.0/s        40%   20%         20%               --          -30%
    SQLite-memory      143/s       100%   71%         71%              43%            --

In this test MySQL uses around 28MB of memory (using Debian's
F<my-small.cnf>) and PostgreSQL around 34MB. Plain SQLite uses 2MB of
cache but it's also tested with 28MB of cache as well as with the
entire database in memory.

But doing 10,000 replies is very different:

                       Rate SQLite-file PostgreSQL SQLite-file-28MB MySQL SQLite-memory
    SQLite-file      85.1/s          --        -7%             -18%  -27%          -38%
    PostgreSQL       91.4/s          7%         --             -12%  -21%          -33%
    SQLite-file-28MB  103/s         21%        13%               --  -11%          -25%
    MySQL             116/s         37%        27%              13%    --          -15%
    SQLite-memory     137/s         61%        50%              33%   18%            --

Once MySQL gets more memory (using Debian's F<my-large.cnf>) and a
chance to warm it starts yielding better results (I couldn't find out
how to make PostgreSQL take as much memory as it wanted):

                   Rate         MySQL SQLite-memory
    MySQL         121/s            --          -12%
    SQLite-memory 138/s           14%            --

=head1 AUTHOR

E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason <avar@cpan.org>

Hinrik E<Ouml>rn SigurE<eth>sson, hinrik.sig@gmail.com

=head1 LICENSE AND COPYRIGHT

Copyright 2010 E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason and
Hinrik E<Ouml>rn SigurE<eth>sson

This program is free software, you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)