The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package CGI::IDS::Whitelist;

our $VERSION = '1.0217';

#------------------------- Notes -----------------------------------------------
# This source code is documented in both POD and ROBODoc format.
# Please find additional POD documentation at the end of this file
# (search for "__END__").
#-------------------------------------------------------------------------------

#****c* IDS::Whitelist
# NAME
#   PerlIDS Whitelist (CGI::IDS::Whitelist)
# DESCRIPTION
#   Whitelist Processor for PerlIDS (CGI::IDS)
# AUTHOR
#   Hinnerk Altenburg <hinnerk@cpan.org>
# CREATION DATE
#   2010-03-29
# COPYRIGHT
#   Copyright (C) 2010-2014 Hinnerk Altenburg
#
#   This file is part of PerlIDS.
#
#   PerlIDS is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Lesser General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   PerlIDS is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public License
#   along with PerlIDS.  If not, see <http://www.gnu.org/licenses/>.

#****

=head1 NAME

CGI::IDS::Whitelist - Whitelist Processor for PerlIDS - Perl Website Intrusion Detection System (XSS, CSRF, SQLI, LFI etc.)

=head1 DESCRIPTION

Whitelist Processor for PerlIDS (L<CGI::IDS|CGI::IDS>). Performs a basic string check and the whitelist check.
See section L<CGI::IDS/Whitelist> for details on setting up a whitelist file. CGI::IDS::Whitelist may also be
used standalone without CGI::IDS to check whether a request has suspicious parameters at all before
handing it over to CGI::IDS. This may be the case if you let worker servers do the more expensive
CGI::IDS job and only want to send over the requests that have suspicious parameters.
See L<SYNOPSIS|CGI::IDS::Whitelist/SYNOPSIS> for an example.

=head1 SYNOPSIS

 use CGI;
 use CGI::IDS::Whitelist;

 $query = new CGI;

 my $whitelist = CGI::IDS::Whitelist->new(
   whitelist_file  => '/home/hinnerk/sandbox/ids/cgi-bin/param_whitelist.xml',
 );

 my @request_keys = keys %$query->Vars;
 foreach my $key (@request_keys) {
   if ( $whitelist->is_suspicious(key => $key, request => $query->Vars ) {
     send_to_ids_worker_server( $query->Vars );
     last;
   }
 }

=head1 METHODS

=cut

#------------------------- Pragmas ---------------------------------------------
use strict;
use warnings;

#------------------------- Libs ------------------------------------------------
use XML::Simple qw(:strict);
use Carp;
use JSON::XS;
use Encode;

#------------------------- Subs ------------------------------------------------

#****m* IDS/new
# NAME
#   Constructor
# DESCRIPTION
#   Creates a Whitelist object.
#   The whitelist will stay loaded during the lifetime of the object.
#   You may call is_suspicious() multiple times, the collecting debug
#   arrays suspicious_keys() and non_suspicious_keys() will only be
#   emptied by an explizit reset() call.
# INPUT
#   HASH
#     whitelist_file  STRING  The path to the whitelist XML file
# OUTPUT
#   Whitelist object, dies (croaks) if a whitelist parsing error occurs.
# EXAMPLE
#   # instantiate object
#   my $whitelist = CGI::IDS::Whitelist->new(
#       whitelist_file  => '/home/hinnerk/sandbox/ids/cgi-bin/param_whitelist.xml',
#   );
#   # instantiate object without a whitelist, just performs a basic string check
#   my $whitelist = CGI::IDS::Whitelist->new();

#****

=head2 new()

Constructor. Can optionally take the path to a whitelist file.
If I<whitelist_file> is not given, just a basic string check will be performed.

The whitelist will stay loaded during the lifetime of the object.
You may call C<is_suspicious()> multiple times, the collecting debug
arrays C<suspicious_keys()> and C<non_suspicious_keys()> will only be
emptied by an explizit C<reset()> call.

For example, the following are valid constructors:

 my $whitelist = CGI::IDS::Whitelist->new(
     whitelist_file  => '/home/hinnerk/sandbox/ids/cgi-bin/param_whitelist.xml',
 );

 my $whitelist = CGI::IDS::Whitelist->new();

The Constructor dies (croaks) if a whitelist parsing error occurs.

=cut

sub new {
    my ($package, %args) = @_;

    # self member variables
    my $self = {
        whitelist_file      => $args{whitelist_file},
        suspicious_keys     => [],
        non_suspicious_keys => [],
    };

    # create object
    bless $self, $package;

    # read & parse XML
    $self->_load_whitelist_from_xml($self->{whitelist_file});

    return $self;
}

#****m* IDS/Whitelist/is_suspicious
# NAME
#   is_suspicious
# DESCRIPTION
#   Performs the whitelist check for a given request parameter.
# INPUT
#   HASHREF
#     + key       The key of the request parameter to be checked
#     + request   HASHREF to the complete request (for whitelist conditions check)
# OUTPUT
#   1 if you should check it with the complete filter set,
#   0 if harmless or sucessfully whitelisted.
# SYNOPSIS
#   $whitelist->is_suspicious( key => 'mykey', request => $request );
#****

=head2 is_suspicious()

 DESCRIPTION
   Performs the whitelist check for a given request parameter.
 INPUT
   HASHREF
     + key       The key of the request parameter to be checked
     + request   HASHREF to the complete request (for whitelist conditions check)
 OUTPUT
   1 if you should check it with the complete filter set,
   0 if harmless or sucessfully whitelisted.
 SYNOPSIS
   $whitelist->is_suspicious( key => 'mykey', request => $request );

=cut

sub is_suspicious {
    my ($self, %args)       = @_;
    my $key                 = $args{key};
    my $request             = $args{request};
    my $request_value       = $args{request}->{$key};
    my $contains_encoding   = 0;

    # skip if value is empty or generally whitelisted
    if ( $request_value ne '' &&
        !(  $self->{whitelist}{$key} &&
            !defined($self->{whitelist}{$key}->{rule}) &&
            !defined($self->{whitelist}{$key}->{conditions}) &&
            !defined($self->{whitelist}{$key}->{encoding})
        )
    ) {
        my $request_value_orig = $request_value;
        $request_value = $self->convert_if_marked_encoded(key => $key, value => $request_value);
        if ($request_value ne $request_value_orig) {
            $contains_encoding = 1;
        }

        $request_value = $self->make_utf_8($request_value);

        # scan only if value is not harmless
        if ( !$self->is_harmless_string($request_value) ) {
            my $attacks = {};

            if (!$self->{whitelist}{$key}) {
                # apply filters to value, not in whitelist
                push (@{$self->{suspicious_keys}}, {key => $key, value => $request_value, reason => 'key'}); # key not whitelisted
                return 1;
            }
            else {
                # check if all conditions match
                my $condition_mismatch = 0;
                foreach my $condition (@{$self->{whitelist}{$key}->{conditions}}) {
                    if (! defined($request->{$condition->{key}}) ||
                        ( defined ($condition->{rule}) && $request->{$condition->{key}} !~ $condition->{rule} )
                    ) {
                        $condition_mismatch = 1;
                    }
                }

                # Apply filters if key is not in whitelisted environment conditions
                # or if the value does not match the whitelist rule if one is set.
                # Filtering is skipped if no rule is set.
                if ( $condition_mismatch ||
                    (defined($self->{whitelist}{$key}->{rule}) &&
                    $request_value !~ $self->{whitelist}{$key}->{rule}) ||
                    $contains_encoding
                ) {
                    # apply filters to value, whitelist rules mismatched
                    my $reason = '';
                    if ($condition_mismatch) {
                        $reason = 'cond'; # condition mismatch
                    }
                    elsif (!$contains_encoding) {
                        $reason = 'rule'; # rule mismatch
                    }
                    else {
                        $reason = 'enc'; # contains encoding
                    }
                    push (@{$self->{suspicious_keys}}, {key => $key, value => $request_value, reason => $reason});
                    return 1;
                }
                else {
                    # skipped, whitelist rule matched
                    push (@{$self->{non_suspicious_keys}}, {key => $key, value => $request_value, reason => 'r&c'}); # rule & conditions matched
                }
            }
        }
        else {
            # skipped, harmless string
            push (@{$self->{non_suspicious_keys}}, {key => $key, value => $request_value, reason => 'harml'}); # harmless
        }
    }
    else {
        # skipped, empty value or key generally whitelisted
        my $reason = $request_value ? 'key' : 'empty';
        push (@{$self->{non_suspicious_keys}}, {key => $key, value => $request_value, reason => $reason});
    }
    return 0;
}

#****m* IDS/Whitelist/convert_if_marked_encoded
# NAME
#   convert_if_marked_encoded
# DESCRIPTION
#   Tries to JSON-decode and flatten a value to a plain string if the key has been marked as JSON in the whitelist.
#   Other encodings may follow in future.
# INPUT
#   HASHREF
#     + key
#     + value
# OUTPUT
#   The JSON-decoded and flattened 'value' if key is marked JSON. Plain keys and values, newline separated.
#   Untouched 'value' otherwise.
# SYNOPSIS
#   $whitelist->convert_if_marked_encoded( key => 'data', value = '{"a":"b","c":["123", 111, "456"]}');
#****

=head2 convert_if_marked_encoded()

 DESCRIPTION
   Tries to JSON-decode and flatten a value to a plain string if the key has been marked as JSON in the whitelist.
   Other encodings may follow in future.
 INPUT
   HASHREF
     + key
     + value
 OUTPUT
   The JSON-decoded and flattened 'value' if key is marked JSON. Plain keys and values, newline separated.
   Untouched 'value' otherwise.
 SYNOPSIS
   $whitelist->convert_if_marked_encoded( key => 'data', value => '{"a":"b","c":["123", 111, "456"]}');

=cut

sub convert_if_marked_encoded {
    my ($self, %args)   = @_;
    my $key             = $args{key};
    my $request_value   = $args{value};

    # If marked as JSON, try to convert from JSON to reduce false positives
    if (defined($self->{whitelist}{$key}) &&
        defined($self->{whitelist}{$key}->{encoding}) &&
        $self->{whitelist}{$key}->{encoding} eq 'json') {

        $request_value = _json_to_string($request_value);
    }
    return $request_value;
}

#****m* IDS/Whitelist/suspicious_keys
# NAME
#   suspicious_keys
# DESCRIPTION
#   Returns the set of filters that are suspicious
#   Keys are listed from the last reset() or Whitelist->new()
# INPUT
#   none
# OUTPUT
#   [ { 'value' => , 'reason' => , 'key' =>  }, { ... } ]
# SYNOPSIS
#   $whitelist->suspicious_keys();
#****

=head2 suspicious_keys()

 DESCRIPTION
   Returns the set of filters that are suspicious
   Keys are listed from the last reset() or Whitelist->new()
 INPUT
   none
 OUTPUT
   [ { 'value' => , 'reason' => , 'key' =>  }, { ... } ]
 SYNOPSIS
   $whitelist->suspicious_keys();

=cut

sub suspicious_keys {
    my ($self) = @_;
    return $self->{suspicious_keys};
}

#****m* IDS/Whitelist/non_suspicious_keys
# NAME
#   non_suspicious_keys
# DESCRIPTION
#   Returns the set of filters that have been checked but are not suspicious
#   Keys are listed from the last reset() or Whitelist->new()
# INPUT
#   none
# OUTPUT
#   [ { 'value' => , 'reason' => , 'key' =>  }, { ... } ]
# SYNOPSIS
#   $whitelist->non_suspicious_keys();
#****

=head2 non_suspicious_keys()

 DESCRIPTION
   Returns the set of filters that have been checked but are not suspicious
   Keys are listed from the last reset() or Whitelist->new()
 INPUT
   none
 OUTPUT
   [ { 'value' => , 'reason' => , 'key' =>  }, { ... } ]
 SYNOPSIS
   $whitelist->non_suspicious_keys();

=cut

sub non_suspicious_keys {
    my ($self) = @_;
    return $self->{non_suspicious_keys};
}

#****m* IDS/Whitelist/reset
# NAME
#   reset
# DESCRIPTION
#   resets the member variables suspicious_keys and non_suspicious_keys to []
# INPUT
#   none
# OUTPUT
#   none
# SYNOPSIS
#   $whitelist->reset();
#****

=head2 reset()

 DESCRIPTION
   resets the member variables suspicious_keys and non_suspicious_keys to []
 INPUT
   none
 OUTPUT
   none
 SYNOPSIS
   $whitelist->reset();

=cut

sub reset {
    my ($self) = @_;
    $self->{suspicious_keys}     = [];
    $self->{non_suspicious_keys} = [];
}

#****f* IDS/Whitelist/is_harmless_string
# NAME
#   is_harmless_string
# DESCRIPTION
#   Performs a basic regexp check for harmless characters
# INPUT
#   + string
# OUTPUT
#   BOOLEAN (pattern match return value)
# SYNOPSIS
#   $whitelist->is_harmless_string( $string );
#****

=head2 is_harmless_string()

 DESCRIPTION
   Performs a basic regexp check for harmless characters
 INPUT
   + string
 OUTPUT
   BOOLEAN (pattern match return value)
 SYNOPSIS
   $whitelist->is_harmless_string( $string );

=cut

sub is_harmless_string {
    my ($self, $string) = @_;

    $string = $self->make_utf_8($string);

    return ( $string !~ m/[^\w\s\/@!?\.]+|(?:\.\/)|(?:@@\w+)/ );
}

#****f* IDS/Whitelist/make_utf_8
# NAME
#   make_utf_8
# DESCRIPTION
#   Encodes string to UTF-8 and strips malformed UTF-8 characters
# INPUT
#   + string
# OUTPUT
#   UTF-8 string
# SYNOPSIS
#   $whitelist->make_utf_8( $string );
#****

=head2 make_utf_8()

 DESCRIPTION
   Encodes string to UTF-8 and strips malformed UTF-8 characters
 INPUT
   + string
 OUTPUT
   UTF-8 string
 SYNOPSIS
   $whitelist->make_utf_8( $string );

=cut

sub make_utf_8 {
    my ($self, $string) = @_;

    # make string UTF-8
    my $utf8_encoded = '';
    eval {
        $utf8_encoded = Encode::encode('UTF-8', $string, Encode::FB_CROAK);
    };
    if ($@) {
        # sanitize malformed UTF-8
        $utf8_encoded = '';
        my @chars = split(//, $string);
        foreach my $char (@chars) {
            my $utf_8_char = eval { Encode::encode('UTF-8', $char, Encode::FB_CROAK) }
                or next;
            $utf8_encoded .= $utf_8_char;
        }
    }
    return $utf8_encoded;
}

#****im* IDS/Whitelist/_load_whitelist_from_xml
# NAME
#   _load_whitelist_from_xml
# DESCRIPTION
#   loads the parameter whitelist XML file
#   croaks if a xml or regexp parsing error occors
# INPUT
#   whitelistfile   path + name of the XML whitelist file
# OUTPUT
#   int             number of loaded rules
# SYNOPSIS
#   $self->_load_whitelist_from_xml('/home/xyz/param_whitelist.xml');
#****

sub _load_whitelist_from_xml {
    my ($self, $whitelistfile) = @_;
    my $whitelistcnt = 0;

    if ($whitelistfile) {
        # read & parse whitelist XML
        my $whitelistxml;
        eval {
            $whitelistxml = XMLin($whitelistfile,
                forcearray  => [ qw(whitelist param conditions condition)],
                keyattr     => [],
            );
        };
        if ($@) {
            croak "Error in _load_whitelist_from_xml while parsing $whitelistfile: $@";
        }

        # convert XML structure into handy data structure
        foreach my $whitelistobj (@{$whitelistxml->{param}}) {
            my @conditionslist = ();
            foreach my $condition (@{$whitelistobj->{conditions}[0]{condition}}) {
                if (defined($condition->{rule})) {
                    # copy for error message
                    my $rule = $condition->{rule};

                    eval {
                        $condition->{rule} = qr/$condition->{rule}/ms;
                    };
                    if ($@) {
                        croak 'Error in whitelist rule of condition "' . $condition->{key} . '" for param "' . $whitelistobj->{key} . '": ' . $rule . ' Message: ' . $@;
                    }
                }
                push(@conditionslist, $condition);
            }
            my %whitelisthash = ();
            if (defined($whitelistobj->{rule})) {
                eval {
                    $whitelisthash{rule} = qr/$whitelistobj->{rule}/ms;
                };
                if ($@) {
                    croak 'Error in whitelist rule for param "' . $whitelistobj->{key} . '": ' . $whitelistobj->{rule} . ' Message: ' . $@;
                }
            }
            if (@conditionslist) {
                $whitelisthash{conditions} = \@conditionslist;
            }
            if ($whitelistobj->{encoding}) {
                $whitelisthash{encoding} = $whitelistobj->{encoding};
            }
            $self->{whitelist}{$whitelistobj->{key}} = \%whitelisthash;
            $whitelistcnt++;
        }
    }
    return $whitelistcnt;
}

#****if* IDS/Whitelist/_json_to_string
# NAME
#   _json_to_string
# DESCRIPTION
#   Tries to decode a string from JSON. Uses _datastructure_to_string().
# INPUT
#   value   the string to convert
# OUTPUT
#   value   converted string if correct JSON, the unchanged input string otherwise
# SYNOPSIS
#   IDS::Whitelist::_json_to_string($value);
#****

sub _json_to_string {
    my ($value) = @_;
    my $json_ds;
    eval {
        $json_ds = JSON::XS::decode_json($value);
    };
    if (!$@) {
        $value = _datastructure_to_string($json_ds)."\n";
    }
    return $value;
}

#****if* IDS/Whitelist/_datastructure_to_string
# NAME
#   _datastructure_to_string
# DESCRIPTION
#   Walks recursively through array or hash and concatenates keys and values to one single string (\n separated)
# INPUT
#   ref     the array/hash to convert
# OUTPUT
#   string  converted string
# SYNOPSIS
#   IDS::Whitelist::_datastructure_to_string($ref);
#****

sub _datastructure_to_string {
    my $in = shift;
    my $out = '';
    if (ref $in eq 'HASH') {
        foreach (keys %$in) {
            $out .= $_."\n";
            $out .= _datastructure_to_string($in->{$_});
        }
    }
    elsif (ref $in eq 'ARRAY') {
        foreach (@$in) {
            $out = _datastructure_to_string($_) . $out;
        }
    }
    else {
            $out .= $in."\n";
    }
    return $out;
}

1;

__END__

=head1 BUGS & SUPPORT

see L<CGI::IDS/BUGS> and L<CGI::IDS/SUPPORT>

=head1 AUTHOR

Hinnerk Altenburg, C<< <hinnerk at cpan.org> >>

=head1 SEE ALSO

L<CGI::IDS>

=head1 COPYRIGHT & LICENSE

Copyright (C) 2014 Hinnerk Altenburg (L<http://www.hinnerk-altenburg.de/>)

This file is part of PerlIDS.

PerlIDS is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

PerlIDS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with PerlIDS.  If not, see <http://www.gnu.org/licenses/>.

=cut