The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package WWW::Adblock::RegexFilter;

use strict;
use warnings;
use 5.006;

our $VERSION = "0.02";

=head1 NAME

WWW::Adblock::RegexFilter - implement a single Adblock filter

=head1 DESCRIPTION

Used by WWW::Adblock to implement a single filter.  Should not be called by an end user.

=head2 Methods

=head3 new

 my $f = WWW::Adblock::Filter->new();

Creates a new object.  Returns undef on failure or itself on success.

=cut

sub new {
    my ( $class, %args ) = @_;

    my $self = bless( {}, $class );

    # Create a blocking rule by default
    $self->{type}           = 'blocking';
    $self->{case_sensitive} = 0;
    $self->{domains}        = undef;
    $self->{regex}          = undef;

    return unless exists $args{text};
    return unless $self->_from_text( $args{text} );

    return $self;
}

=head3

  $f->_from_text("filter text");

Called by new to setup the filter.

=cut

sub _from_text {
    my ( $self, $text ) = @_;

    return 0 unless defined $text;

    if ( $text =~ m/^@@(.+)$/ ) {
        $text = $1;
        $self->{type} = 'whitelist';
    }

    # If this rule has options, parse them out
    if ( $text =~ /\$(~?[\w\-]+(?:=[^,\s]+)?(?:,~?[\w\-]+(?:=[^,\s]+)?)*)$/ ) {
        $text = $`;
        my @options = split( /,/, $1 );

        foreach my $o (@options) {
            my ( $option, $value ) = split( /=/, $o, 2 );
            $option = uc($option);

            # TODO: Add content type matching too.  For URI filtering it
            #       doesn't help but it would be necessary for element
            #       filtering.
            if ( $option eq "MATCH_CASE" ) {
                $self->{case_sensitive} = 1;

            }
            elsif ( $option eq "DOMAIN" && defined $value ) {
                $self->{domains} = [ split( /\|/, $value ) ];

            }

            # Other options that aren't implemented
            # "THIRD_PARTY", "~THIRD_PARTY", "COLLAPSE", "~COLLAPSE"
        }
    }

    $text =~ s/\*+/*/g;        # Remove multiple wildcards
    $text =~ s/^\*+//;         # Remove leading wildcards
    $text =~ s/\*+$//;         # Remove trailing wildcards
    $text =~ s/\^\|$/\^/;      # Remove anchors following separator
    $text =~ s/(\W)/\\$1/g;    # Escape special symbols
    $text =~ s/\\\*/.*/g;      # Replace wildcards with .*

    # Separator placeholders (all ANSI characters but alpha or _%.-)
    $text =~
      s/\\\^/(?:[\x00-\x24\x26-\x2C\x2F\x3A-\x40\x5B-\x5E\x60\x7B-\x80]|\$)/g;

    # Extended anchor
    $text =~ s/^\\\|\\\|/^[\\w\\-]+:\/+(?!\/)(?:[^\/]+\.)?/;

    $text =~ s/^\\\|/^/;       # Start anchor
    $text =~ s/\\\|$/\$/;      # End anchor

    $self->{regex} = qr/$text/;
    return 1;
}

sub matches {
    my ( $self, $uri, $domain ) = @_;

    # TODO: Should support the other options (contentType, thirdParty)

    # If the domain is given and this rule is constrained to domains, check
    # whether we should proceed
    if ( defined $domain && defined $self->{domains} ) {
        if ( !grep /^$domain$/, $self->{domains} ) {
            return 0;
        }
    }

    if ( $uri =~ $self->{regex} ) {

        #print "$uri matched $r (mode " . $self->{type} . ")\n";
        return 1 if $self->{type} eq 'blocking';
        return 2 if $self->{type} eq 'whitelist';
    }

    return 0;
}

1;