The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
package RDF::TrineX::RuleEngine::Jena;
use Moose;
use English;
use Path::Class;
use File::Find;
use Archive::Zip qw(:CONSTANTS :ERROR_CODES);
use Try::Tiny;
use File::Copy;
use File::Share ':all';
use File::Slurp;
use IO::CaptureOutput qw(capture);
use RDF::Trine;
use RDF::Trine::Namespace qw(rdf rdfs owl xsd);
use RDF::Trine::Model::StatementFilter;
use Carp qw(cluck);

use File::Temp qw(tempfile);
use Data::Dumper;

BEGIN {
    $RDF::TrineX::RuleEngine::Jena::VERSION = '0.001';
}

has JENAROOT => (
    is       => 'ro',
    isa      => 'Path::Class::Dir',
    required => 1,
);

has JENA_VERSION => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);
has JENA_SOURCES_JAR => (
    is       => 'ro',
    isa      => 'Archive::Zip',
    required => 1,
);
has JENA_CLASSPATH => (
    is       => 'ro',
    isa      => 'ArrayRef[Str]',
    required => 1,
);

has ruleset_map => (
    traits  => ['Hash'],
    is      => 'ro',
    isa     => 'HashRef[Str]',
    handles => {
        'available_rulesets'   => 'keys',
        'get_ruleset_filename' => 'get',
    },
);

sub stringify_classpath {
    my $self = shift;
    return join ":", @{ $self->JENA_CLASSPATH };
}

around 'BUILDARGS' => sub {
    my $orig  = shift;
    my $class = shift;
    my %args  = @_;

    # detect JENAROOT
    # first as an argument passed to the constructor
    unless ( $args{JENAROOT} ) {

        # second as an environment variable
        $args{JENAROOT} = $ENV{JENAROOT};
        unless ( $args{JENAROOT} ) {

            # third as a build option stored in share/JENAROOT
            my $JENAROOT_sharefile = dist_file( 'RDF-TrineX-RuleEngine-Jena', 'JENAROOT' );
            try {
                $args{JENAROOT} = read_file $JENAROOT_sharefile;
                chomp $args{JENAROOT} if $args{JENAROOT};
            }
            catch {
                confess "Couldn't open shared file $JENAROOT_sharefile. Broken build, reinstall";
            };
        }
    }
    confess
        "Must pass JENAROOT via constructor, set environment variable JENAROOT or configure path at runtime"
        unless $args{JENAROOT};

    $args{JENAROOT} = Path::Class::Dir->new( $args{JENAROOT} );
    confess "JENAROOT '$args{JENAROOT}' does not exist."
        unless ( -e $args{JENAROOT} );
    confess "JENAROOT '$args{JENAROOT}' is not a directory."
        unless ( -d $args{JENAROOT} );
    for (qw(lib lib-src)) {
        confess "JENAROOT doesn't contain '$_'. Probably the wrong directory."
            unless ( -e $args{JENAROOT}->subdir($_) );
    }

    # collect jars for classpath
    $args{JENA_CLASSPATH} = [];
    find(
        sub {
            m/jar$/
                && ( push @{ $args{JENA_CLASSPATH} }, $File::Find::name );
        },
        $args{JENAROOT}->subdir('lib')
    );

    # find sources jar and set JENA_VERSION
    find(
        sub {
            m/jena-([\d\.]+)-sources\.jar/
                && -e $_
                && $1
                && ( $args{JENA_VERSION}     = $1 )
                && ( $args{JENA_SOURCES_JAR} = Archive::Zip->new($File::Find::name) );
        },
        $args{JENAROOT}->subdir("lib-src")
    );
    confess "Couldn't determine Jena version."
        unless ( $args{JENA_VERSION} );
    confess
        "Couldn't find jena-$args{JENA_VERSION}-sources.jar (should be in \$JENAROOT/lib-src but isn't."
            unless ( $args{JENA_SOURCES_JAR} );

    # detect available_rulesets
    $args{ruleset_map} = { map { $_->fileName =~ m|etc/(.+).rules$|; $1 => $_->fileName }
            $args{JENA_SOURCES_JAR}->membersMatching('etc/.+rules$') };

    # COMPATIBILITY
    # 'owl.rules' and 'owl-b.rules' are broken
    delete $args{ruleset_map}->{'owl-b'};
    delete $args{ruleset_map}->{'owl'};
    delete $args{ruleset_map}->{'owl-fb-old'};
    delete $args{ruleset_map}->{'rdfs-fb-tgc-simple'};

    return $class->$orig(%args);
};
sub apply_rules {
    my $self      = shift;
    my %opts      = @_;
    my %tempfiles = (
        input => File::Temp->new(
            TMPDIR   => 1,
            TEMPLATE => 'jena_in_XXXXX',
            SUFFIX   => '.nt',
            UNLINK   => 0
        ),
        output => File::Temp->new(
            TMPDIR   => 1,
            TEMPLATE => 'jena_out_XXXXX',
            SUFFIX   => '.nt',
            UNLINK   => 0
        ),
        rules => File::Temp->new(
            TMPDIR   => 1,
            TEMPLATE => 'jena_XXXXX',
            SUFFIX   => '.rules',
            UNLINK   => 0
        ),
    );

    # determine input
    confess "Must specify 'input' to apply_rules."
        unless $opts{input};
    $self->_coerce_input( $opts{input}, %tempfiles );

    # determine input format
    my $input_format = $opts{input_format} || 'N-TRIPLE';

    # determine rules
    confess "Must specify 'rules' to apply_rules."
        unless $opts{rules};
    $self->_coerce_rules( $opts{rules}, %tempfiles );

    # close (and write out) all tempfiles
    map { $_->close } values %tempfiles;

    # actually execute jena.RuleMap
    # my $jena_rulemap_sub = sub {
    my $stderr;
    capture(
        sub {
            $self->exec_jena_rulemap(
                filename_rules => $tempfiles{rules}->filename,
                filename_input => $tempfiles{input}->filename,
                input_format   => $input_format,
                output_format  => 'N-TRIPLE',
                additions_only => $opts{additions_only},
            )
        },
        undef, # don't capture output
        \$stderr, #capture stderr
        $tempfiles{output}, #write stdout to this tempfile
        undef, #don't write stderr to file
    );
    $tempfiles{output}->close;

    if ($opts{output} && ! blessed $opts{output}) {
        if ($opts{output} =~ m/^:filename$/i) {
            return $tempfiles{output}->filename;
        } elsif ($opts{output} =~ m/^:(?:FH|string)$/i ) {
            open my $fh, "<:utf8", $tempfiles{output}->filename;
            return $fh if $opts{output} =~ m/^:FH/i;
            return do { local $/; <$fh>; };
        }
    }

    # make sure we have an output model
    my $out_model = $self->_coerce_output( $opts{output} );
    my $parser = RDF::Trine::Parser->new('ntriples');
    $parser->parse_file_into_model( '/', $tempfiles{output}->filename, $out_model );

    # purge stuff
    if ( my $schemas_to_purge = $opts{purge_schemas} ) {

        # "cast" model to statementfiltermodel
        unless ( $out_model->isa('RDF::Trine::Model::StatementFilter') ) {
            $out_model = RDF::Trine::Model::StatementFilter->new( $out_model->_store );
        }

        # warn "Adding rules to Model::StatementFilter";
        unless ( ref $schemas_to_purge ) {
            $schemas_to_purge = [$schemas_to_purge];
        }
        $self->_remove_schema( $out_model, $schemas_to_purge );
    }

    # remove all tempfiles
    map { unlink $_ } values %tempfiles;

    # return the output as a model
    return $out_model;
}

sub _coerce_output {
    my $self = shift;
    my $data = shift;
    unless ($data) {
        return RDF::Trine::Model::StatementFilter->temporary_model;
    }
    if ( blessed $data) {
        if ( $data->isa('RDF::Trine::Model') ) {
            return $data;
        }
        else {
            confess "Can't handle output of type " . ref $data;
        }
    }
    else {
        confess "Output must be a model.";
    }
}

sub _coerce_input {
    my $self      = shift;
    my $data      = shift;
    my %tempfiles = @_;
    if ( ref $data ) {
        if ( ref $data eq 'SCALAR' ) {
            print { $tempfiles{input} } $$data;
        }

        # TODO cleaner class check
        elsif ( blessed $data ) {
            if ( $data->isa('RDF::Trine::Model') ) {
                my $serializer = RDF::Trine::Serializer->new('ntriples');
                my $data_ser   = $serializer->serialize_model_to_string($data);
                print { $tempfiles{input} } $data_ser;
            }
            else {
                confess "Can't handle objects of type " . ref $data;
            }
        }
    }
    else {
        confess "Couldn't open input file $data"
            unless ( -e $data && -r $data );
        copy( $data, $tempfiles{input}->tempfile );
    }
}

sub _coerce_rules {
    my $self      = shift;
    my $data      = shift;
    my %tempfiles = @_;
    if ( ref $data ) {

        # if it's SCALAR ref, use those rules directly
        if ( ref $data eq 'SCALAR' ) {
            print { $tempfiles{rules} } $$data;
        }

        # if it's a GLOB reference, write it to tempfile
        elsif ( ref $data eq 'GLOB' ) {
            while (<$data>) {
                print { $tempfiles{rules} } $_;
            }
        }
    }
    else {

        # might be a predefined ruleset
        my $predefined_ruleset = $self->get_ruleset_filename($data);
        if ($predefined_ruleset) {

            # confess "No such ruleset $opts{ruleset}." unless $dataet_filename;
            my $extract_ok
                = $self->JENA_SOURCES_JAR->extractMemberWithoutPaths( $predefined_ruleset,
                $tempfiles{rules}->filename );
            unless ( AZ_OK == $extract_ok ) {
                confess "Couldn't extract $predefined_ruleset to " . $tempfiles{rules}->filename;
            }
        }

        # it's a filename
        else {
            confess "Rule file '$data' does't exist or isn't readable."
                unless ( -e $data && -r $data );
            copy( $data, $tempfiles{rules}->filename );
        }
    }
}

sub exec_jena_rulemap {
    my $self           = shift;
    my %opts           = @_;
    my $filename_rules = $opts{filename_rules};
    my $filename_input = $opts{filename_input};
    my $input_format   = $opts{input_format} || "N-TRIPLE";
    my $output_format   = $opts{output_format} || "N-TRIPLE";
    my $additions_only = $opts{additions_only};
    # my %tempfiles = ${

    my $OLD_CLASSPATH = $ENV{CLASSPATH};
    $ENV{CLASSPATH} = $self->stringify_classpath;
    my @jena_rulemap_cmd = (
        "java" => "jena.RuleMap",
        "-il"  => $input_format,
        "-ol"  => $output_format,
        $opts{filename_rules},
        $opts{filename_input},
    );

    if ($additions_only) {
        splice @jena_rulemap_cmd, 6, 0, "-d";
    }
    # my ( $stdout, $stderr, $success, $exit_code ) = capture_exec(@jena_rulemap_cmd);
    system( @jena_rulemap_cmd );
    $ENV{CLASSPATH} = $OLD_CLASSPATH;

    return 1;
}

sub _model_difference {
    my $self = shift;
    my ( $model, $model_diff ) = @_;
    my $iter = $model_diff->as_stream;
    while ( my $stmt = $iter->next ) {
        $model->remove_statement($stmt);
    }
    return $model;
}

sub _remove_tautologies {
    my $self = shift;
    my ($model) = @_;

    # ?a owl:equivalentProperty ?a -> []
    $model->add_rule(
        sub {
            my $stmt = shift;
            if (
                $stmt->subject->equal( $stmt->object )
                && $stmt->predicate->equal(
                    RDF::Trine::iri('http://www.w3.org/2002/07/owl#equivalentProperty')
                )
                )
            {
                return 0;
            }
            return 1;
        }
    );
}

sub _remove_schema {
    my $self          = shift;
    my ($model)       = shift;
    my (@namespaces)  = @{ shift() };
    my %namespaces_re = (
        rdf  => qr/^$rdf/,
        rdfs => qr/^$rdfs/,
        owl  => qr/^$owl/,
        xsd  => qr/^$xsd/,
        jena => qr|^\Qurn:x-hp-jena:rubrik/\E|,
        daml => qr|^\Qhttp://www.daml.org/2001/03/daml+oil#\E|,
    );
    if ( $namespaces[0] eq ':all' ) {
        @namespaces = keys %namespaces_re;
    }
    $model->add_rule(
        sub {
            my $stmt = shift;

            # remove all statements about RDF, RDFS, OWL, jena:rubrik, DAML and XSD resources
            for my $ns (@namespaces) {
                if ( $stmt->subject->is_blank ) {
                    if ( $stmt->object->equal( $owl->Thing ) ) {
                        return 0;
                    }
                    return 1;
                }
                return 0 if $stmt->subject->uri_value =~ $namespaces_re{$ns};

                # additionally  purge everything with jena's FB namespace in i
                if ( $ns eq 'jena' ) {
                    for ( 0 .. 2 ) {
                        next unless [ $stmt->nodes ]->[$_]->is_resource;
                        return 0 if [ $stmt->nodes ]->[$_]->uri_value =~ $namespaces_re{$ns};
                    }
                }
            }
            return 1;
        }
    );

    # warn Dumper $model->count_statements;
}

1;
=head1 NAME

RDF::TrineX::RuleEngine::Jena - Wrapper around Jena's rule engine for reasoning over RDF

=head1 SYNOPSIS

    use RDF::Trine::Namespace qw(rdf rdfs);

    my $one_triple = "<test/classA> <${rdfs}domain> <test/ClassB> .";

    my $reasoner = RDF::TrineX::RuleEngine::Jena->new;
    my $model_inferred = $reasoner->apply_rules(
        input => \ $one_triple,
        rules => 'rdfs-fb',
        purge_schemas => ':all',
    );

    print $model_inferred->size;    
    # 7

    my $serializer = RDF::Trine::Serializer->new('turtle' , namespaces => { rdf => $rdf, rdfs => $rdfs });
    print $serializer->serialize_model_to_string( $model_inferred );

    # <test/ClassB> rdfs:subClassOf rdfs:Resource, <test/ClassB> ;
    #     a rdfs:Class .
    # <test/classA> rdfs:domain <test/ClassB> ;
    #     a rdf:Property, rdfs:Resource .

=head1 DESCRIPTION

This module is a convenience wrapper around a call to Jena's C< jena.RuleMap >
command line rule-engine interface. It transparently handles serialization and
creation of temporary files, but it relies on a working Java installation and knowledge
of the location of the Jena framework.

=head2 Finding Jena

When building this module, the Jena framework can be downloaded or a path to an
existing Jena installation can be specified. This path is stored in a shared
file. If you can't or don't want to specify it at build time, you can set the
JENAROOT environment variable to the location of the extracted Jena download.
Finally you can pass the path to it at runtime to the constructor.

=head2 RDF::Trine vs. Jena Format names

    Trine    | Jena
    ---------+----------------------------
    ntriples | N-TRIPLE
    turtle   | TURTLE
    rdxml    | RDF/XML, RDF/XML-ABBREV
    n3       | N3-PP, N3-PLAIN, N3-TRIPLE


=head1 ATTRIBUTES

=over 4

=item JENAROOT

A L<Path::Class::Dir|Path::Class::Dir> object of the Jena directory.

=item JENA_VERSION

The Version of Jena used, determined from the C<< jena-X.X.X-sources.jar >>
file.

=item JENA_SOURCES_JAR

L<Archive::Zip|Archive::Zip> object for the C<< jena-X.X.X-sources.jar >> file.
Contains the predefined rulesets.

=item JENA_CLASSPATH

Array reference holding the paths to all the C<<jar>> files required for Jena
to run.

=back

=head1 METHODS

=head2 new

Returns a new L<RDF::TrineX::RuleEngine::Jena|RDF::TrineX::RuleEngine::Jena>
object. Before 

The optional C<JENAROOT> argument holds the path to the extracted Jena source.
If not set, C<JENAROOT> is determined as described in L</JENAROOT>.


=head2 apply_rules

Applies a set of Jena rules to RDF input and adds the inferred statements to
the output model.

=over 4

=item C<< input => $input_data >> 

B<required>

C<$input_data> is serialized, written to a temporary file and fed to
L</exec_jena_rulemap> as the C<filename_input> argument. Currently, the
following data types are handled:

=over 4

=item * L<RDF::Trine::Model|RDF::Trine::Model>.

    my $model = RDF::Trine::Model->temporary_model;
    RDF::Trine::Parser->new('turtle')->parse_file_into_model('my_file.ttl');
    $reasoner->apply_rules(
        input => $model,
        rules => ...,
    );

=item * String: Treated as the path to a file containing a serialized RDF graph.
    
    $reasoner->apply_rules(
        input => 'my_file.nt',
        rules => ...,
    );

=item * Scalar reference: Treated as a reference to a serialized RDF graph.

    my $input_ttl = <'EOF';
    @prefix rdfs:http://www.w3.org/2000/01/rdf-schema# .
    <Tiny> rdfs:subClassOf <Small> .
    EOF
    $reasoner-apply_rules(
        input => \ $input_ttl,
        input_format => 'TURTLE',
        rules => ...,
    );

=back

=item C<< rules => $rules_data >> 

B<required>

C<< $rules_data >> can be any of the following:

=over 4

=item * String matching one of the L</available_rulesets>: The appropriate rules file is
loaded from L</JENA_SOURCES_JAR>.

    $reasoner->apply_rules(
        input => ...,
        rules => 'rdfs',
    );

=item * Scalar reference: The dereferenced value is treated as a string of rules.

    my $rules = "[dummy: (?a ?b ?c) -> (?a rdfs:label "This is stupid") ]";
    $reasoner->apply_rules(
        input => ...,
        rules => \ $rules,
    );

=item * Any other string: Treat C<$rules_data> as a filename and load rules from there.

    $reasoner->apply_rules(
        input => ...,
        rules => '/path/to/my/ruleset.rules',
    );

=back

=item C<< output => ($model|":fh"|":filename"|":string"|$string) >>

If specified, inferred statements are written to this model, otherwise a
temporary model is created. If you set output to the same value as input,
inferred statements are added to the original model.

=over 4

=item * C<< $model >>: The statements are added to this L<RDF::Trine::Model>.
Setting this to the same model as in C<< input >> will cause all rule-based
statement removals to be ignored since there currently is no way of tracking
which statements I<were> by applying the rules.

=item * C<":fh">: If this special string (case-insensitive) is supplied, a readable filehandle
to the raw output of jena.RuleMap is returned. C<< purge_schemas >> is ignored.

    my $fh = $reasoner->apply_rules(
        input => ...,
        rules => ...,
        output => ':FH',
    );
    while (<$fh>) {
        my ($s, $p, $o ) = $_ =~ m/^\s*<([^>]+>\s+<([^>]+>\s+<([^>]+>\s*.$/;
    }

=item * C<":filename">: If this special string (case-insensitive) is supplied, the filename of the
temporary file containing the raw output of jena.rulemap is returned .
C<purge_schemas> is ignored.

    use File::Slurp;
    my $fname = $reasoner->apply_rules(
        input => ...,
        rules => ...,
        output => ':filename',
    );
    my $contents = read_file $fname;

=item * C<":string">: If this special string (case-insensitive) is supplied,
the complete raw output of jena.RuleMap is returned. C<< purge_schemas >> is
ignored.

    my $serialized = $reasoner->apply_rules(
        input => ...,
        rules => ...,
        output => ':sTRing',
    );

=item * C<$string>: Any other string is treated as a filename to write the raw
output of jena.RuleMap to. C<< purge_schemas >> is ignored.

    my $serialized = $reasoner->apply_rules(
        input => 'data.nt',
        rules => ...,
        output => 'data_inferred.nt',
    );

=back

=item C<< purge_schemas => (\@list_of_schemanames|":all") >>

Jena's rule engine adds lots and lots of schema statements about rdf, rdfs, owl,
xsd plus some internals. You can tell RDF::TrineX::RuleEngine::Jena to purge those
statements by supplying an array ref of schema names to purge_schemas.

Specifying C<:all> removes all schema statements, RDF::TrineX::RuleEngine::Jena knows
about.

    $reasoner->apply_rules(
        input => ...,
        rules => ...,
        purge_schemas => ':all',
    );

is equivalent to

    $reasoner->apply_rules(
        input => ...,
        rules => ...,
        purge_schemas => [qw( rdf rdfs daml xsd owl jena )],
    );

=back

=head2 exec_jena_rulemap

Sets and resets CLASSPATH and runs C<< java jena.RuleMap ... >> using a
L<system|perlfunc/system> call.  This is all this function does, capturing STDIN and
STDERR and parsing/serializing happens in L<< apply_rules >>.

Arguments:

=over 4

=item filename_rules

Filename of the C<< .rules >> file

=item filename_input

File name of the file containing the assertions.

=item input format

The format of the input file, in Jena notation (i.e. 'N-TRIPLE', 'TURTLE', 'RDF/XML'...)

=item output_format

Format of the result printed to STDOUT, again in Jena notation.

=item additions_only

When this flag is set, Jena will only return deduced and schema statements, as
opposed to the original model with added and removed statements when the flag
is not set.

=back

=cut

=head2 _model_difference

Given two models A and B, remove all statements from A that are also in B.

=cut


=head2 _remove_tautologies

Remove all statements of the form C<X owl:equivalentProperty X>.

=cut 

=head2 available_rulesets

Lists the available predefined rulesets shipped with Jena that aren't broken. Currently, these are:

=over 4

=item * B<daml-micro>

=item * B<owl-fb>

=item * B<owl-fb-micro>

=item * B<owl-fb-mini>

=item * B<rdfs>

=item * B<rdfs-b>

=item * B<rdfs-b-tuned>

=item * B<rdfs-fb>

=item * B<rdfs-fb-lp-expt>

=item * B<rdfs-fb-tgc>

=item * B<rdfs-fb-tgc-noresource>

=item * B<rdfs-noresource>

=back

=head2 get_ruleset_filename

Get the filename of a predefined ruleset within L</JENA_SOURCES_JAR>.

=cut



=head1 AUTHOR

Konstantin Baierer <kba@cpan.org>

=head1 SEE ALSO

=over 4

=item L< http://answers.semanticweb.com/questions/1453/reasoning-and-sparql-through-arq-command-line >

=back