The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# ABSTRACT: Simple syslog line parser

package Parse::Syslog::Line;

use warnings;
use strict;

use Exporter;
use Const::Fast;
use DateTime::Format::HTTP;
use HTTP::Date;

our $VERSION        = '3.1';

our $DateTimeCreate    = 1;
our $ExtractProgram    = 1;
our $FmtDate;
our $EpochCreate       = 0;
our $PruneRaw          = 0;
our $PruneEmpty        = 0;
our $RegexSet          = 'stable';
our @PruneFields       = ();


my %INT_PRIORITY = (
    'emerg'         => 0,
    'alert'         => 1,
    'crit'          => 2,
    'err'           => 3,
    'warn'          => 4,
    'notice'        => 5,
    'info'          => 6,
    'debug'         => 7,
);

my %INT_FACILITY = (
    #
    # POSIX Facilities
    'kern'          => 0 << 3,
    'user'          => 1 << 3,
    'mail'          => 2 << 3,
    'daemon'        => 3 << 3,
    'auth'          => 4 << 3,
    'syslog'        => 5 << 3,
    'lpr'           => 6 << 3,
    'news'          => 7 << 3,
    'uucp'          => 8 << 3,
    'cron'          => 9 << 3,
    'authpriv'      => 10 << 3,
    'ftp'           => 11 << 3,
    #
    # Local Reserved
    'local0'        => 16 << 3,
    'local1'        => 17 << 3,
    'local2'        => 18 << 3,
    'local3'        => 19 << 3,
    'local4'        => 20 << 3,
    'local5'        => 21 << 3,
    'local6'        => 22 << 3,
    'local7'        => 23 << 3,
    #
    # Apple Additions
    'netinfo'       => 12 << 3,
    'remoteauth'    => 13 << 3,
    'install'       => 14 << 3,
    'ras'           => 15 << 3,
    'launchd'       => 24 << 3,
);

const our %LOG_PRIORITY => (
    %INT_PRIORITY,
    reverse(%INT_PRIORITY),
);

const our %LOG_FACILITY => (
    %INT_FACILITY,
    reverse(%INT_FACILITY),
);

const our %CONV_MASK => (
    priority        => 0x07,
    facility        => 0x03f8,
);


our @ISA = qw(Exporter);
our @EXPORT = qw(parse_syslog_line);
our @EXPORT_OK = qw(
    parse_syslog_line
    preamble_priority preamble_facility
    %LOG_FACILITY %LOG_PRIORITY
);
our @EXPORT_TAGS = (
    constants       => [ qw( %LOG_FACILITY %LOG_PRIORITY ) ],
    preamble        => [ qw(preamble_priority preamble_facility) ],
);

# Regex to Support Matches
 my %RE = (
    IPv4    => qr/(?:[0-9]{1,3}\.){3}[0-9]{1,3}/,
);

# Regex to Extract Data
my %REGEXP = (
    stable => {
        preamble        => qr/^\<(\d+)\>/,
        date            => qr/^([a-zA-Z]{3}\s+[0-9]+\s+[0-9]{1,2}(?:\:[0-9]{2}){1,2})/,
        date_long => qr/^
                (?:[0-9]{4}\s+)?                # Year: Because, Cisco
                ([.*])?                         # Cisco adds a * for no ntp, and a . for configured but out of sync
                [a-zA-Z]{3}\s+[0-9]+            # Date: Jan  1
                (?:\s+[0-9]{4})?                # Year: Because, Cisco
                \s+                             # Date Separator: spaces
                [0-9]{1,2}(?:\:[0-9]{2}){1,2}   # Time: HH:MM or HH:MM:SS
                (?:\.[0-9]{3})?                 # Time: .DDD ms resolution
                (?:\s+[A-Z]{3,4})?              # Timezone, ZZZ or ZZZZ
                (?:\:?)                         # Cisco adds a : after the second timestamp
        /x,
        date_iso8601    => qr/^(
                [0-9]{4}(\-[0-9]{2}){2}     # Date YYYY-MM-DD
                (\s|T)                      # Date Separator T or ' '
                [0-9]{2}(\:[0-9]{2}){1,2}   # Time HH:MM:SS
                ([+\-][0-9]{2}\:[0-9]{2})?  # UTC Offset +DD:MM
        )/x,
        host            => qr/^\s*([^:\s]+)\s+/,
        cisco_hates_you => qr/^\s*[0-9]*:\s+/,
        program_raw     => qr/^\s*([^\[][^:]+):\s*/,
        program_name    => qr/^([^\[\(\ ]+)/,
        program_sub     => qr/\(([^\)]+)\)/,
        program_pid     => qr/\[([^\]]+)\]/,
        program_netapp  => qr/\[([^\]]+)\]:\s*/,
    },
    devel => {
        preamble        => qr/^\<(\d+)\>/,
        date            => qr/^([a-zA-Z]{3}\s+[0-9]+\s+[0-9]{1,2}(?:\:[0-9]{2}){1,2})/,
        date_long => qr/^
                (?:[0-9]{4}\s+)?                # Year: Because, Cisco
                ([.*])?                         # Cisco adds a * for no ntp, and a . for configured but out of sync
                [a-zA-Z]{3}\s+[0-9]+            # Date: Jan  1
                (?:\s+[0-9]{4})?                # Year: Because, Cisco
                \s+                             # Date Separator: spaces
                [0-9]{1,2}(?:\:[0-9]{2}){1,2}   # Time: HH:MM or HH:MM:SS
                (?:\.[0-9]{3})?                 # Time: .DDD ms resolution
                (?:\s+[A-Z]{3,4})?              # Timezone, ZZZ or ZZZZ
                (?:\:?)                         # Cisco adds a : after the second timestamp
        /x,
        date_iso8601    => qr/^(
                [0-9]{4}(?:\-[0-9]{2}){2}     # Date YYYY-MM-DD
                (?:\s|T)                      # Date Separator T or ' '
                [0-9]{2}(\:[0-9]{2}){1,2}   # Time HH:MM:SS
                (?:[+\-][0-9]{2}\:[0-9]{2})?  # UTC Offset +DD:MM
        )/x,
        host            => qr/^\s*([^:\s]+)\s+/,
        cisco_hates_you => qr/^\s*[0-9]*:\s+/,
        program_raw     => qr/^\s*([^\[][^:]+):\s*/,
        program_name    => qr/^([^\[\(\ ]+)/,
        program_sub     => qr/\(([^\)]+)\)/,
        program_pid     => qr/\[([^\]]+)\]/,
        program_netapp  => qr/\[([^\]]+)\]:\s*/,
    },
);


my %_empty_msg = map { $_ => undef } qw(
    preamble priority priority_int facility facility_int
    datetime_raw date_raw date time date_str datetime_str datetime_obj epoch
    host_raw host domain
    program_raw program_name program_pid program_sub
);

sub parse_syslog_line {
    my ($raw_string) = @_;

    # Verify we have a valid RegexSet
    die "Invalid RegexSet '$RegexSet', valid are: ". join(", ", sort keys %REGEXP) unless exists $REGEXP{$RegexSet};

    # Initialize everything to undef
    my %msg =  $PruneEmpty ? () : %_empty_msg;
    $msg{message_raw} = $raw_string unless $PruneRaw;

    #
    # grab the preamble:
    if( $raw_string =~ s/$REGEXP{$RegexSet}->{preamble}//o ) {
        # Cast to integer
        $msg{preamble} = int $1;

        # Extract Integers
        $msg{priority_int} = $msg{preamble} & $CONV_MASK{priority};
        $msg{facility_int} = $msg{preamble} & $CONV_MASK{facility};

        # Lookups
        $msg{priority} = $LOG_PRIORITY{ $msg{priority_int} };
        $msg{facility} = $LOG_FACILITY{ $msg{facility_int} };
    }

    #
    # Handle Date/Time
    if( $raw_string =~ s/$REGEXP{$RegexSet}->{date}//o) {
        $msg{datetime_raw} = $1;
    }
    elsif( $raw_string =~ s/$REGEXP{$RegexSet}->{date_iso8601}//o) {
        $msg{datetime_raw} = $1;
    }
    if( exists $msg{datetime_raw} && length $msg{datetime_raw} ) {
        $msg{date_raw} = $msg{datetime_raw};

        # Only parse the DatetTime if we're configured to do so
        if( $DateTimeCreate ) {
            my $dt = DateTime::Format::HTTP->parse_datetime( $msg{datetime_raw} );
            $msg{date}         = $dt->ymd('-');
            $msg{time}         = $dt->hms;
            $msg{epoch}        = $dt->epoch if $EpochCreate;
            $msg{datetime_str} = $dt->ymd('-') . ' ' . $dt->hms;
            $msg{datetime_obj} = $dt;
        }
        elsif( $FmtDate && ref $FmtDate eq 'CODE' ) {
            @msg{qw(date time epoch datetime_str)} = $FmtDate->($msg{datetime_raw});
        }
        elsif( $EpochCreate ) {
            $msg{epoch}        = HTTP::Date::str2time($msg{datetime_raw});
            $msg{datetime_str} = HTTP::Date::time2iso($msg{epoch});
        }
        $msg{date_str} = $msg{datetime_str} if exists $msg{datetime_str};
    }

    #
    # Host Information:
    if( $raw_string =~ s/$REGEXP{$RegexSet}->{host}//o ) {
        my $hostStr = $1;
        my($ip) = ($hostStr =~ /($RE{IPv4})/o);
        if( defined $ip && length $ip ) {
            $msg{host_raw} = $hostStr;
            $msg{host} = $ip;
        }
        elsif( length $hostStr ) {
            my ($host,$domain) = split /\./, $hostStr, 2;
            $msg{host_raw} = $hostStr;
            $msg{host} = $host;
            $msg{domain} = $domain;
        }
    }
    if( $raw_string =~ s/$REGEXP{$RegexSet}->{cisco_hates_you}//o ) {
        # Yes, Cisco adds a second timestamp to it's messages, because it hates you.
        if( $raw_string =~ s/$REGEXP{$RegexSet}->{date_long}//o ) {
            # Cisco encodes the status of NTP in the second datestamp, so let's pass it back
            if ( my $ntp = $1 ) {
                $msg{ntp} = $ntp eq '.' ? 'out of sync'
                          : $ntp eq '*' ? 'not configured'
                          : 'unknown';
            }
            else {
                $msg{ntp} = 'ok';
            }
        }
    }

    #
    # Parse the Program portion
    if( $ExtractProgram ) {
        if( $raw_string =~ s/$REGEXP{$RegexSet}->{program_raw}//o ) {
            my $progStr = $1;
            chomp($progStr);
            if( defined $progStr && length $progStr) {
                $msg{program_raw} = $progStr;
                if( ($msg{program_name}) = ($progStr =~ /$REGEXP{$RegexSet}->{program_name}/o) ) {
                    if (length $msg{program_name} != length $msg{program_raw} ) {
                        (($msg{program_pid}) = ($progStr =~ /$REGEXP{$RegexSet}->{program_pid}/o))
                            || (($msg{program_sub}) = ($progStr =~ /$REGEXP{$RegexSet}->{program_sub}/o))
                    }
                }
            }
        }
        elsif( $raw_string =~ s/$REGEXP{$RegexSet}->{program_netapp}//o ) {
            # Check for a [host thing.subthing:level]: tag
            #          or [host:thing.subthing:level]: tag, Thanks NetApp.
            my $subStr = $1;
            $msg{program_raw} = qq{[$subStr]};
            my ($host,$program,$level) = split /[: ]+/, $subStr;
            $msg{program_name} = $program;
            if(!exists $msg{priority} && exists $LOG_PRIORITY{$level}) {
                $msg{priority} = $level;
                $msg{priority_int} = $LOG_PRIORITY{$level};
            }
            $raw_string =~ s/^[ :]+//;
        }
    }
    else {
        $raw_string =~ s/^\s+//;
    }

    # The left overs should be the message
    $msg{content} = $raw_string;
    chomp $msg{content};
    $msg{message} = defined $msg{program_raw} ? "$msg{program_raw}: $msg{content}" : $msg{content};

    if( $PruneRaw ) {
        delete $msg{$_} for grep { $_ =~ /_raw$/ } keys %msg;
    }
    if( $PruneEmpty ) {
        delete $msg{$_} for grep { !defined $msg{$_} } keys %msg;
    }
    if( @PruneFields ) {
        no warnings;
        delete $msg{$_} for @PruneFields;
    }

    #
    # Return our hash reference!
    return \%msg;
}


sub preamble_priority {
    my $preamble = int shift;

    my %hash = (
        preamble => $preamble,
    );

    $hash{as_int} = $preamble & $CONV_MASK{priority};
    $hash{as_text} = $LOG_PRIORITY{ $hash{as_int} };

    return \%hash;
}


sub preamble_facility {
    my $preamble = int shift;

    my %hash = (
        preamble => $preamble,
    );

    $hash{as_int} = $preamble & $CONV_MASK{facility};
    $hash{as_text} = $LOG_FACILITY{ $hash{as_int} };

    return \%hash;

}

1; # End of Parse::Syslog::Line

__END__

=pod

=encoding UTF-8

=head1 NAME

Parse::Syslog::Line - Simple syslog line parser

=head1 VERSION

version 3.1

=head1 SYNOPSIS

I wanted a very simple log parser for network based syslog input.
Nothing existed that simply took a line and returned a hash ref all
parsed out.

    use Parse::Syslog::Line qw(parse_syslog_line);

    $Parse::Syslog::Line::DateTimeCreate = 1;

    my $href = parse_syslog_line( $msg );
    #
    # $href = {
    #       preamble        => '13',
    #       priority        => 'notice',
    #       priority_int    => 5,
    #       facility        => 'user',
    #       facility_int    => 8,
    #       date            => 'YYYY-MM-DD',
    #       time            => 'HH::MM:SS',
    #       epoch           => 1361095933,
    #       datetime_str    => 'YYYY-MM-DD HH:MM:SS',
    #       datet_str       => 'YYYY-MM-DD HH:MM:SS',
    #       datetime_obj    => new DateTime(), # If installed
    #       datetime_raw    => 'Feb 17 11:12:13'
    #       date_raw        => 'Feb 17 11:12:13'
    #       date_raw        => 'Feb 17 11:12:13'
    #       host_raw        => 'hostname',  # Hostname as it appeared in the message
    #       host            => 'hostname',  # Hostname without domain
    #       domain          => 'blah.com',  # if provided
    #       program_raw     => 'sshd(blah)[pid]',
    #       program_name    => 'sshd',
    #       program_sub     => 'pam_unix',
    #       program_pid     => 20345,
    #       content         => 'the rest of the message'
    #       message         => 'program[pid]: the rest of the message',
    #       message_raw     => 'The message as it was passed',
    #       ntp             => 'ok',           # Only set for Cisco messages
    # };
    ...

=head1 EXPORT

Exported by default:
       parse_syslog_line( $one_line_of_syslog_message );

Optional Exports:
  :preamble
       preamble_priority
       preamble_facility

  :constants
       %LOG_FACILITY
       %LOG_PRIORITY

=head1 VARIABLES

=head2 ExtractProgram

If this variable is set to 1 (the default), parse_syslog_line() will try it's
best to extract a "program" field from the input.  This is the most expensive
set of regex in the module, so if you don't need that pre-parsed, you can speed
the module up significantly by setting this variable.

Vendors who do proprietary non-sense with their syslog formats are to blame for
this setting.

Usage:

  $Parse::Syslog::Line::ExtractProgram = 0;

=head2 DateTimeCreate

If this variable is set to 1 (the default), a DateTime object will be
returned in the $m->{datetime_obj} field.  Otherwise, this will be skipped.

Usage:

  $Parse::Syslog::Line::DateTimeCreate = 0;

=head2 EpochCreate

If this variable is set to 1, the number of seconds from UNIX epoch
will be returned in the $m->{epoch} field.  If DateTimeCreate is
not set, the parser will use C<HTTP::Date> to perform the parsing

Usage:

  $Parse::Syslog::Line::EpochCreate = 1;

=head2 PruneRaw

This variable defaults to 0, set to 1 to delete all keys in the return hash ending in "_raw"

Usage:

  $Parse::Syslog::Line::PruneRaw = 1;

=head2 PruneEmpty

This variable defaults to 0, set to 1 to delete all keys in the return hash which are undefined.

Usage:

  $Parse::Syslog::Line::PruneEmpty = 1;

=head2 PruneFields

This should be an array of fields you'd like to be removed from the hash reference.

Usage:

  @Parse::Syslog::Line::PruneFields = qw(date_str date_raw facility_int priority_int);

=head2 RegexSet

Allows the use of different regex sets, the default is stable.  This is mostly a developer level
feature to allow easy benchmarking of features against previous release.

Usage:

  $Parse::Syslog::Line::RegexSet = 'devel';

=head1 FUNCTIONS

=head2 parse_syslog_line

Returns a hash reference of syslog message parsed data.

=head2 preamble_priority

Takes the Integer portion of the syslog messsage and returns
a hash reference as such:

    $prioRef = {
        'preamble'  => 13
        'as_text'   => 'notice',
        'as_int'    => 5,
    };

=head2 preamble_facility

Takes the Integer portion of the syslog messsage and returns
a hash reference as such:

    $facRef = {
        'preamble'  => 13
        'as_text'   => 'user',
        'as_int'    => 8,
    };

=head1 DEVELOPMENT

This module is developed with Dist::Zilla.  To build from the repository, use Dist::Zilla:

    dzil authordeps |cpanm
    dzil build
    dzil test

=over 4

=item * Github Page

L<http://github.com/reyjrar/Parse-Syslog-Line>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Parse-Syslog-Line>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Parse-Syslog-Line>

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Parse-Syslog-Line>

=item * Search CPAN

L<http://search.cpan.org/dist/Parse-Syslog-Line>

=back

=head1 ACKNOWLEDGEMENTS

=over 4

=item Mattia Barbon

Contribution of patch to support faster HTTP::Date routines

=item Alexander Hartmaier

Contribution of log samples for Cisco devices and testing

=item Shawn Wilson

Contribution of patch to support custom date parsing function

=back

=head1 AUTHOR

Brad Lhotsky <brad@divisionbyzero.net>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2015 by Brad Lhotsky.

This is free software, licensed under:

  The (three-clause) BSD License

=cut