The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Copyright (c) 2002-2005 the World Wide Web Consortium :
#       Keio University,
#       European Research Consortium for Informatics and Mathematics
#       Massachusetts Institute of Technology.
# written by Olivier Thereaux <ot@w3.org> for W3C
#
# $Id: Basic.pm,v 1.18 2008/11/18 16:48:56 ot Exp $

package W3C::LogValidator::Basic;
use strict;
use warnings;


require Exporter;
our @ISA = qw(Exporter);
our %EXPORT_TAGS = ( 'all' => [ qw() ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = qw();
our $VERSION = sprintf "%d.%03d",q$Revision: 1.18 $ =~ /(\d+)\.(\d+)/;


###########################
# usual package interface #
###########################
our $verbose = 1;
our %config;

sub new
{
        my $self  = {};
        my $proto = shift;
        my $class = ref($proto) || $proto;
	# mandatory vars for the API
	@{$self->{URIs}} = undef;
	# don't change this
	if (@_) {%config =  %{(shift)};}
	if (exists $config{verbose}) {$verbose = $config{verbose}}
        bless($self, $class);
        return $self;
}

sub uris { 
	my $self = shift;
	if (@_) { @{$self->{URIs}} = @_ }
	return @{$self->{URIs}};
}


sub trim_uris 
{
        my $self = shift;
        my @trimmed_uris;
	my $exclude_regexp = "";
	my @exclude_areas;
	$exclude_regexp = $config{ExcludeAreas};
	if ($exclude_regexp){
		$exclude_regexp =~ s/\//\\\//g ;
		@exclude_areas = split(" ", $exclude_regexp);
	}
	else { print "nothing to exclude\n" if ($verbose >2);}
        my $uri;
        while ($uri = shift)
        {
	    my $acceptable = 1;
	    foreach my $area (@exclude_areas)
	    {
                if ($uri =~ /$area/)
                {	
			my $slasharea = $area;
			$slasharea =~ s/\\\//\//g;
			$slasharea =~ s/\\././g;
			print "Ignoring $uri matching $slasharea \n" if ($verbose > 2) ; 
			$acceptable = 0;		
                }
	    }	
	    push @trimmed_uris,$uri if ($acceptable);
        }
        return @trimmed_uris;
}


#########################################
# Actual subroutine to check the list of uris #
#########################################


sub process_list
{
	my $self = shift;
	my $max_invalid = undef;
	my $max_documents = undef;
	if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}}
	else {$max_documents = 0}

# This basic module does not actually "validates"
# so MaxInvalid is not relevant... Keeping it anyway
	if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}}
	else {$max_invalid = 0}
	my $name = "";
	if (exists $config{ServerName}) {$name = $config{ServerName}}

	print "Now Using the Basic module... \n" if $verbose;
	my %hits;
	my %HTTPcodes;
	my @uris = undef;
	use DB_File; 
	if (defined ($config{tmpfile}))
	{
		my $tmp_file = $config{tmpfile};
		tie (%hits, 'DB_File', "$tmp_file", O_RDONLY) || 
		    die ("Cannot create or open $tmp_file");
		@uris = sort { $hits{$b} <=> $hits{$a} } keys %hits;
	}

	elsif ($self->uris())
	{
		@uris = $self->uris();
		foreach my $uri (@uris) { $hits{$uri} = 0 }
	}

        @uris = $self->trim_uris(@uris);


	if (defined ($config{tmpfile_HTTP_codes}))
	{
		my $tmp_file_HTTP_codes = $config{tmpfile_HTTP_codes};
		tie (%HTTPcodes, 'DB_File', "$tmp_file_HTTP_codes", O_RDONLY) || 
		    die ("Cannot create or open $tmp_file_HTTP_codes");
	}


	my $intro="Here are the <census> most popular documents overall for $name.";
	my @result;
	my @result_head;
	push @result_head, "Rank";
	push @result_head, "Hits";
	push @result_head, "Address";
	my $census = 0;
	while ( (@uris) and  (($census < $max_documents) or (!$max_documents)) )
	{
		my $uri = shift (@uris);
		chomp ($uri);
		my @result_tmp;
		if (!defined $HTTPcodes{$uri})
		{ # if no HTTP code present, assume it's a 200
		    $census++;
		    push @result_tmp, "$census";
		    push @result_tmp, "$hits{$uri}";
		    push @result_tmp, "$uri";
		    push @result, [@result_tmp];
		}
		elsif (($HTTPcodes{$uri} eq "200")  or (!$HTTPcodes{$uri} =~ /\d+/))
		# should perhaps make a subroutine for that instead of DUPing code
		{
		    $census++;
		    push @result_tmp, "$census";
		    push @result_tmp, "$hits{$uri}";
		    push @result_tmp, "$uri";
		    push @result, [@result_tmp];
		}
		elsif ((defined $HTTPcodes{$uri}) and ($verbose > 2)) { 
		    print "$uri returned code $HTTPcodes{$uri}, ignoring \n"; 
		}
	}
	print "Done!\n" if $verbose;
	if ($census eq 1) # let's repect grammar here
                {
                        $intro=~ s/are/is/;
                        $intro=~ s/<census> //;
                        $intro=~ s/document\(s\)/document/;
                }
	else
	{
		$intro=~ s/<census>/$census/;
	}
	if (defined ($config{tmpfile})) { 
		untie %hits;
	}
	my $outro="";
	my %returnhash;
	$returnhash{"name"}="basic";
	$returnhash{"intro"}=$intro;
	$returnhash{"outro"}=$outro;
	@{$returnhash{"thead"}}=@result_head;
	@{$returnhash{"trows"}}=@result;
	return %returnhash;
}

package W3C::LogValidator::Basic;

1;

__END__

=head1 NAME

W3C::LogValidator::Basic - [W3C Log Validator] Sort Web server log entries by popularity (hits)

=head1 SYNOPSIS

  use  W3C::LogValidator::Basic;
  my $b = new W3C::LogValidator::Basic;
  $b->uris('http://www.w3.org/Overview.html', 'http://www.yahoo.com/index.html');
  my $result_string= $b->process_list();

=head1 DESCRIPTION


This module is part of the W3C::LogValidator suite, and simply gives back pages
sorted by popularity. This is an example of simple module for LogValidator.

=head1 API 

=head2 Constructor

=over 2

=item $b = W3C::LogValidator::Basic->new

Constructs a new C<W3C::LogValidator:HTMLBasic> processor.  

You might pass it a configuration hash reference (see L<W3C::LogValidator/config_module> and L<W3C::LogValidator::Config>)
Particularly relevant for this module are the "verbose", "MaxDocuments" and obviously "tmpfile" (see C<process_list>).
Pass the configuration hash ref as follows:

  $b = W3C::LogValidator::HTMLValidator->new(\%config);

=back

=head2 General Methods

=over 4

=item b->uris 

Returns a  list of URIs to be processed (unless the configuration gives the location for the hash of URI/hits berkeley file, see C<process_list> 
If an array is given as a parameter, also sets the list of URIs and returns it.
Note: while this method is useful in other modules of L<W3C::LogValidator>, this basic module is here to sort URIs extracted from Log Files by popularity, this method is hence rather useless for L<W3C::LogValidator::Basic>.

=item b->trim_uris 

Given a list of URIs of documents to process, returns a subset of this list containing the URIs of documents the module supposedly can handle.
For this module, the decision is made based on the setting for ExcludedAreas only


=item b->process_list

Formats the list of URIs sorted by popularity.

Returns a result hash. Keys for this hash are: 

  name (string): the name of the module, i.e "Basic"
  intro (string): introduction to the processing results
  thead (array): headers of the results table
  trows (array of arrays): rows of the results table
  outro (string): conclusion of the processing results

=back

=head1 BUGS

Public bug-tracking interface at http://www.w3.org/Bugs/Public/


=head1 AUTHOR

Olivier Thereaux <ot@w3.org> for W3C

=head1 SEE ALSO

W3C::LogValidator, perl(1).
Up-to-date complete info at http://www.w3.org/QA/Tools/LogValidator/

=cut