The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

package Log::Parallel::ApacheCLF;

use strict;
use warnings;
use Data::Dumper;
require Log::Parallel::Parsers;
require Exporter;
use Time::JulianDay qw(jd_timegm);;
use URI::Escape::XS qw(uri_unescape uri_escape);

our @ISA = qw(Log::Parallel::Parsers::BaseClass Exporter);
our @EXPORT = qw();

__PACKAGE__->register_parser();

our $warn_level = 1;

my $month_rx = qr/(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/;
my $num255_rx = qr/(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)/;
my $ip_rx = qr/$num255_rx\.$num255_rx\.$num255_rx\.$num255_rx/;
my $host_rx = qr/(?:$ip_rx|unknown)/;
my $quoted_rx = qr/(?:[^\\"]|\\.)*/;

my %mon_num = (
	Jan	=> 1,
	Feb	=> 2,
	Mar	=> 3,
	Apr	=> 4,
	May	=> 5,
	Jun	=> 6,
	Jul	=> 7,
	Aug	=> 8,
	Sep	=> 9,
	Oct	=> 10,
	Nov	=> 11,
	Dec	=> 12,
);

sub return_parser
{
	my ($pkg, $fh, %info) = @_;
	my $filesize = $info{filesize} || 2_000_000_000;
	my $span = $info{span};
	my $start_time = $info{time};
	my $post_rx = $info{extra_rx} || qr//;
	my $pre_rx = $info{pre_rx} || qr//;
	my $pre_rx_save_match_count = $info{pre_rx_saved_match_count} || 0;

	my $safewarn = sub {
		my ($level, $err) = @_;
		return unless $level <= $warn_level;
		use bytes;
		$err =~ s/([\200-\377])/sprintf("M-%c",ord($1)&0177)/eg;
		$err =~ s/([\0-\37\177])/sprintf("^%c",ord($1)^64)/eg;
		my ($pkg, $file, $line) = caller;
		warn "$err at $file:$line processing $info{host}:$info{filename}\n";
	};


	my $line_number = 0;

	my $midnight;
	my $day = '';
	my $offset;
	my $zone = '';

	return sub {
		while (<$fh>) {

			$line_number++;

			# - - - [19/Mar/2009:00:00:03 -0700] "HEAD / HTTP/1.0" 401 - "-" "-"
			unless (

				m{
					^
					$pre_rx

					(\S+)					# server hostname, *1
					[ ]
					apache:
					[ ]
					([^]["\@/]+?)				# forward ip, *2
					[ ]
					\S+					# ident user
					[ ]
					(\S+)					# apache auth user, *3
					[ ]
					\[
						(\d\d?/$month_rx/\d\d\d\d)	# apache date, *4
						:
						(\d\d)				# apache hour, *5
						:
						(\d\d)				# apache minute, *6
						:
						(\d\d)				# apache second, *7
						[ ]
						([-+]\d\d\d\d)			# apache timezone, *8
					\]
					[ ]
					"($quoted_rx)"				# request, *9
					[ ]
					(\d+)					# apache result code, *10
					[ ]
					(\S+)					# number of bytes sent, *11
					[ ]
					"($quoted_rx)"				# referrer, *12
					[ ]
					"($quoted_rx)"				# user agent, *13

					$post_rx
				}x

			) {
				warn "Could not parse $line_number $_";
				next;
			}
			
			my @match = ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23);
			my @pre = splice(@match, 0, $pre_rx_save_match_count);
			my ($ip, $auth_user, $date, $hh, $mm, $ss, $tzoff, $request, $status, $sent, $referrer, $user_agent) = splice(@match, 0, 13);

			#print "server_host=$server_host\n";
			#print "\tip=$ip\n";
			#print "\tdate=$date\n";
			#print "\trequest=$request\n";
			#print "\tstatus=$status\n";
			#print "\tsent=$sent\n";

			my $detail = '';

			$site = '' unless $site;

			unless ($day eq $date) {
				$day = $date;
				my ($mday, $month, $year) = split(/\//, $day);
				my $mnum = $mon_num{$month} || die "month = $month ($day)";
				$midnight = jd_timegm(0, 0, 0, $mday, $mnum-1, $year);
			}
			unless ($zone eq $tzoff) {
				$tzoff =~ /^([+-])(\d\d)(\d\d)/ || die;
				$offset = ($1 eq '-' ? -1 : 1) * (60 * $2 + $3);
			}
			my $time = $midnight + $offset + $hh * 3600 + $mm * 60 + $ss;

			$ip		= '' if ! defined($ip) || $ip eq '-';
			$referrer	= '' if ! defined($referrer) || $referrer eq '-';
			$user_agent	= '' if ! defined($user_agent) || $user_agent eq '-';
			$auth_user	= '' if ! defined($auth_user) || $auth_user eq '-';

			return {
				ip		=> $ip,
				auth_user	=> $auth_user,
				server_time	=> $time,
				request		=> $request,
				status		=> $status,
				bytes_sent	=> $sent,
				referrer	=> $referrer,
				user_agent	=> $user_agent,
				@pre ? (pre_match => \@pre) : (),
				(@+ > $pre_rx_save_match_count + 13) ? (post_match => \@match ) : (),
			};
		}
		return undef;
	};
}

1;

__END__

=head1 NAME

 Log::Parallel::ApacheCLF - parse apache common log format

=head1 SYNOPSIS

 use Log::Parallel::ApacheCLF;

 my $parser = Log::Parallel::ApacheCLF->return_parser($fh, %info);

=head1 LOG PROCESSING CONFIG

sources:
  -
    name:               raw apache server logs
    hosts:              host1.domain
    path:               /var/apache_archive/%YYYY%.%MM%.%DD%{,.bz2}
    format:             ApacheCLF
    valid_from:         2009-01-01
    valid_to:           yesterday
jobs:
  -
    name:               server logs
    destination:        server logs
    source:             raw apache server logs
    path:               '%DATADIR%/%YYYY%/%MM%/%DD%/%JOBNAME%.%DURATION%.%BUCKET%.%SOURCE_BKT%'
    valid_from:         2008-01-01
    valid_to:           yesterday
    frequency:          daily
    output_format:      TSV
    use:                Log::Parallel::TSV Log::Parallel::ApacheCLF
    buckets:            20
    hosts:              host10,host11,host12,host13
    bucketizer:         $log->{server_time}

=head1 DESCRIPTION

Parse the apache web server logs in Common Log Format.   The fields from the apache logs are named as follows:

=over

=item ip

The IP address header field.  Sometimes C<->.

=item auth_user

The HTTP authenticated user.

=item server_time

The time, unix time seconds, that the server wrote the log line.

=item request

The HTTP request line.   Eg: C<GET / HTTP/1.0>.

=item status

The HTTP status code.  200, 301, etc.

=item bytes_sent

The number bytes transfered.

=item user_agent

The HTTP UserAgent.

=item refferer

The HTTP Refferrer field.

=back

This module can also be used to parse more extended Apache logs.   Create a new module and
invoke this one to do a bunch of the work.    There are three extra construction arguments
that can be used:

=over

=item pre_rx

A regular expression to match of things that come I<before> the regular
Apache log format on each line.  
If this has saved matches, they'll be returned as an array: C<pre_match>.

=item pre_rx_saved_match_count

If you have a C<pre_rx>, and if that regular expression has saved matches,
you must say how many for Log::Parallel::ApacheCLF to work.  This
is how.

=item post_rx

A regular expression to match of things that come I<after> the regular
Apache log format on each line.

If this has saved matches, they'll be returned as an array: C<post_match>.

=back

=head1 LICENSE

This package may be used and redistributed under the terms of either
the Artistic 2.0 or LGPL 2.1 license.