lib/BibTeX/Parser.pm - metacpan.org

package BibTeX::Parser;
{
  $BibTeX::Parser::VERSION = '1.01';
}
# ABSTRACT: A pure perl BibTeX parser
use warnings;
use strict;

use BibTeX::Parser::Entry;


my $re_namechar = qr/[a-zA-Z0-9\!\$\&\*\+\-\.\/\:\;\<\>\?\[\]\^\_\`\|]/o;
my $re_name     = qr/$re_namechar+/o;


sub new {
    my ( $class, $fh ) = @_;

    return bless {
        fh      => $fh,
        strings => {
            jan => "January",
            feb => "February",
            mar => "March",
            apr => "April",
            may => "May",
            jun => "June",
            jul => "July",
            aug => "August",
            sep => "September",
            oct => "October",
            nov => "November",
            dec => "December",

        },
        line   => -1,
        buffer => "",
    }, $class;
}

sub _slurp_close_bracket;

sub _parse_next {
    my $self = shift;

    while (1) {    # loop until regular entry is finished
        return 0 if $self->{fh}->eof;
        local $_ = $self->{buffer};

        until (/@/m) {
            my $line = $self->{fh}->getline;
            return 0 unless defined $line;
            $_ .= $line;
        }

        my $current_entry = new BibTeX::Parser::Entry;
        if (/@($re_name)/cgo) {
	    my $type = uc $1;
            $current_entry->type( $type );
            my $start_pos = pos($_) - length($type) - 1;

            # read rest of entry (matches braces)
            my $bracelevel = 0;
            $bracelevel += tr/\{/\{/;    #count braces
            $bracelevel -= tr/\}/\}/;
            while ( $bracelevel != 0 ) {
                my $position = pos($_);
                my $line     = $self->{fh}->getline;
				last unless defined $line;
                $bracelevel =
                  $bracelevel + ( $line =~ tr/\{/\{/ ) - ( $line =~ tr/\}/\}/ );
                $_ .= $line;
                pos($_) = $position;
            }

            # Remember text before the entry
            my $pre = substr($_, 0, $start_pos-1);
	    if ($start_pos == 0) {
		$pre = '';
	    }
            $current_entry->pre($pre);


            # Remember raw bibtex code
            my $raw = substr($_, $start_pos);
            $raw =~ s/^\s+//;
            $raw =~ s/\s+$//;
            $current_entry->raw_bibtex($raw);

            my $pos = pos $_;
            tr/\n/ /;
            pos($_) = $pos;

            if ( $type eq "STRING" ) {
                if (/\G\{\s*($re_name)\s*=\s*/cgo) {
                    my $key   = $1;
                    my $value = _parse_string( $self->{strings} );
                    if ( defined $self->{strings}->{$key} ) {
                        warn("Redefining string $key!");
                    }
                    $self->{strings}->{$key} = $value;
                    /\G[\s\n]*\}/cg;
                } else {
                    $current_entry->error("Malformed string!");
					return $current_entry;
                }
            } elsif ( $type eq "COMMENT" or $type eq "PREAMBLE" ) {
                /\G\{./cgo;
                _slurp_close_bracket;
            } else {    # normal entry
                $current_entry->parse_ok(1);

				# parse key
                if (/\G\s*\{(?:\s*($re_name)\s*,[\s\n]*|\s+\r?\s*)/cgo) {
                    $current_entry->key($1);

					# fields
                    while (/\G[\s\n]*($re_name)[\s\n]*=[\s\n]*/cgo) {
                        $current_entry->field(
                                      $1 => _parse_string( $self->{strings} ) );
                        my $idx = index( $_, ',', pos($_) );
                        pos($_) = $idx + 1 if $idx > 0;
                    }

                    return $current_entry;

                } else {

                    $current_entry->error("Malformed entry (key contains illegal characters) at " . substr($_, pos($_) || 0, 20)  . ", ignoring");
                    _slurp_close_bracket;
					return $current_entry;
                }
            }

            $self->{buffer} = substr $_, pos($_);

        } else {
            $current_entry->error("Did not find type at " . substr($_, pos($_) || 0, 20)); 
			return $current_entry;
        }

    }
}


sub next {
    my $self = shift;

    return $self->_parse_next;
}

# slurp everything till the next closing brace. Handles
# nested brackets
sub _slurp_close_bracket {
    my $bracelevel = 0;
  BRACE: {
        /\G[^\}]*\{/cg && do { $bracelevel++; redo BRACE };
        /\G[^\{]*\}/cg
          && do {
            if ( $bracelevel > 0 ) {
                $bracelevel--;
                redo BRACE;
            } else {
                return;
            }
          }
    }
}

# parse bibtex string in $_ and return. A BibTeX string is either enclosed
# in double quotes '"' or matching braces '{}'. The braced form may contain
# nested braces.
sub _parse_string {
    my $strings_ref = shift;

    my $value = "";

  PART: {
        if (/\G(\d+)/cg) {
            $value .= $1;
        } elsif (/\G($re_name)/cgo) {
            warn("Using undefined string $1") unless defined $strings_ref->{$1};
            $value .= $strings_ref->{$1} || "";
        } elsif (/\G"(([^"\\]*(\\.)*[^\\"]*)*)"/cgs)
        {    # quoted string with embeded escapes
            $value .= $1;
        } else {
            my $part = _extract_bracketed( $_ );
            $value .= substr $part, 1, length($part) - 2;    # strip quotes
        }

        if (/\G\s*#\s*/cg) {    # string concatenation by #
            redo PART;
        }
    }
    $value =~ s/[\s\n]+/ /g;
    return $value;
}

sub _extract_bracketed
{
	for($_[0]) # alias to $_
	{
		/\G\s+/cg;
		my $start = pos($_);
		my $depth = 0;
		while(1)
		{
			/\G\\./cg && next;
			/\G\{/cg && (++$depth, next);
			/\G\}/cg && (--$depth > 0 ? next : last);
			/\G([^\\\{\}]+)/cg && next; 
			last; # end of string
		}
		return substr($_, $start, pos($_)-$start);
	}
}

# Split the $string using $pattern as a delimiter with
# each part having balanced braces (so "{$pattern}"
# does NOT split).
# Return empty list if unmatched braces

sub _split_braced_string {
    my $string = shift;
    my $pattern = shift;
    my @tokens;
    return () if  $string eq '';
    my $buffer;
    while (!defined pos $string || pos $string < length $string) {
	if ( $string =~ /\G(.*?)(\{|$pattern)/cgi ) {
	    my $match = $1;
	    if ( $2 =~ /$pattern/i ) {
		$buffer .= $match;
		push @tokens, $buffer;
		$buffer = "";
	    } elsif ( $2 =~ /\{/ ) {
		$buffer .= $match . "{";
		my $numbraces=1;
		while ($numbraces !=0 && pos $string < length $string) {
		    my $symbol = substr($string, pos $string, 1);
		    $buffer .= $symbol;
		    if ($symbol eq '{') {
			$numbraces ++;
		    } elsif ($symbol eq '}') {
			$numbraces --;
		    }
		    pos($string) ++;
		}
		if ($numbraces != 0) {
		    return ();
		}
	    } else {
		$buffer .= $match;
	    }
	} else {
	    $buffer .= substr $string, (pos $string || 0);
	    last;
	}
    }
    push @tokens, $buffer if $buffer;
    return @tokens;
}


1;    # End of BibTeX::Parser


__END__
=pod

=head1 NAME

BibTeX::Parser - A pure perl BibTeX parser


=head1 SYNOPSIS

Parses BibTeX files.

    use BibTeX::Parser;
	use IO::File;

    my $fh     = IO::File->new("filename");

    # Create parser object ...
    my $parser = BibTeX::Parser->new($fh);
    
    # ... and iterate over entries
    while (my $entry = $parser->next ) {
	    if ($entry->parse_ok) {
		    my $type    = $entry->type;
		    my $title   = $entry->field("title");

		    my @authors = $entry->author;
		    # or:
		    my @editors = $entry->editor;
		    
		    foreach my $author (@authors) {
			    print $author->first . " "
			    	. $author->von . " "
				. $author->last . ", "
				. $author->jr;
		    }
	    } else {
		    warn "Error parsing file: " . $entry->error;
	    }
    }



=head1 FUNCTIONS

=head2 new

Creates new parser object. 

Parameters:

	* fh: A filehandle

=head2 next

Returns the next parsed entry or undef.

=head1 NOTES

The fields C<author> and C<editor> are canonized, see
L<BibTeX::Parser::Author>


=head1 SEE ALSO

=over 4

=item

L<BibTeX::Parser::Entry>

=item

L<BibTeX::Parser::Author>

=back

=head1 VERSION

version 1.01


=head1 AUTHOR

Gerhard Gossen <gerhard.gossen@googlemail.com> and
Boris Veytsman <boris@varphi.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2013-2016 by Gerhard Gossen and Boris Veytsman

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)