# You may distribute under the terms of either the GNU General Public License
# or the Artistic License (the same terms as Perl itself)
#
# (C) Paul Evans, 2013-2016 -- leonerd@leonerd.org.uk
package Devel::MAT::Dumpfile;
use strict;
use warnings;
our $VERSION = '0.23';
use Carp;
use IO::Handle; # ->read
use IO::Seekable; # ->tell
use List::Util qw( pairmap );
use Devel::MAT::SV;
use Devel::MAT::Context;
=head1 NAME
C<Devel::MAT::Dumpfile> - load and analyse a heap dump file
=head1 SYNOPSIS
use Devel::MAT::Dumpfile;
my $df = Devel::MAT::Dumpfile->load( "path/to/the/file.pmat" );
TODO
=head1 DESCRIPTION
This module provides a class that loads a heap dump file previously written by
L<Devel::MAT::Dumper>. It provides accessor methods to obtain various
well-known root starting addresses, or to find arbitrary SVs by address. Each
SV is represented by an instance of L<Devel::MAT::SV>.
=cut
my @ROOTS;
my %ROOTDESC;
foreach (
[ main_cv => "+the main code" ],
[ defstash => "+the default stash" ],
[ mainstack => "+the main stack AV" ],
[ beginav => "+the BEGIN list" ],
[ checkav => "+the CHECK list" ],
[ unitcheckav => "+the UNITCHECK list" ],
[ initav => "+the INIT list" ],
[ endav => "+the END list" ],
[ strtab => "+the shared string table HV" ],
[ envgv => "-the ENV GV" ],
[ incgv => "+the INC GV" ],
[ statgv => "+the stat GV" ],
[ statname => "+the statname SV" ],
[ tmpsv => "+the temporary SV" ],
[ defgv => "+the default GV" ],
[ argvgv => "-the ARGV GV" ],
[ argvoutgv => "+the argvout GV" ],
[ argvout_stack => "+the argvout stack AV" ],
[ errgv => "+the *@ GV" ],
[ fdpidav => "+the FD-to-PID mapping AV" ],
[ preambleav => "+the compiler preamble AV" ],
[ modglobalhv => "+the module data globals HV" ],
[ regex_padav => "+the REGEXP pad AV" ],
[ sortstash => "+the sort stash" ],
[ firstgv => "-the *a GV" ],
[ secondgv => "-the *b GV" ],
[ debstash => "-the debugger stash" ],
[ stashcache => "+the stash cache" ],
[ isarev => "+the reverse map of \@ISA dependencies" ],
[ registered_mros => "+the registered MROs HV" ],
[ rs => "+the IRS" ],
[ last_in_gv => "+the last input GV" ],
[ ofsgv => "+the OFS GV" ],
[ defoutgv => "+the default output GV" ],
[ hintgv => "-the hints (%^H) GV" ],
[ patchlevel => "+the patch level" ],
[ apiversion => "+the API version" ],
[ e_script => "+the '-e' script" ],
[ mess_sv => "+the message SV" ],
[ ors_sv => "+the ORS SV" ],
[ encoding => "+the encoding" ],
[ blockhooks => "+the block hooks" ],
[ custom_ops => "+the custom ops HV" ],
[ custom_op_names => "+the custom op names HV" ],
[ custom_op_descs => "+the custom op descriptions HV" ],
map { [ $_ => "+the $_" ] } qw(
Latin1 UpperLatin1 AboveLatin1 NonL1NonFinalFold HasMultiCharFold
utf8_mark utf8_X_regular_begin utf8_X_extend utf8_toupper utf8_totitle
utf8_tolower utf8_tofold utf8_charname_begin utf8_charname_continue
utf8_idstart utf8_idcont utf8_xidstart utf8_perl_idstart utf8_perl_idcont
utf8_xidcont utf8_foldclosures utf8_foldable ),
) {
my ( $name, $desc ) = @$_;
push @ROOTS, $name;
$ROOTDESC{$name} = $desc;
# Autogenerate the accessors
my $name_at = "${name}_at";
my $code = sub { my $self = shift; $self->sv_at( $self->{$name_at} ) };
no strict 'refs';
*$name = $code;
}
=head1 CONSTRUCTOR
=cut
=head2 load
$df = Devel::MAT::Dumpfile->load( $path, %args )
Loads a heap dump file from the given path, and returns a new
C<Devel::MAT::Dumpfile> instance representing it.
Takes the following named arguments:
=over 8
=item progress => CODE
If given, should be a CODE reference to a function that will be called
regularly during the loading process, and given a status message to update the
user.
=back
=cut
sub load
{
my $class = shift;
my ( $path, %args ) = @_;
my $progress = $args{progress};
$progress->( "Loading file $path..." ) if $progress;
open my $fh, "<", $path or croak "Cannot read $path - $!";
my $self = bless { fh => $fh }, $class;
my $filelen = -s $fh;
# Header
$self->_read(4) eq "PMAT" or croak "File magic signature not found";
my $flags = $self->_read_u8;
my $endian = ( $self->{big_endian} = $flags & 0x01 ) ? ">" : "<";
my $u32_fmt = $self->{u32_fmt} = "L$endian";
my $u64_fmt = $self->{u64_fmt} = "Q$endian";
@{$self}{qw( uint_len uint_fmt )} =
( $flags & 0x02 ) ? ( 8, $u64_fmt ) : ( 4, $u32_fmt );
@{$self}{qw( ptr_len ptr_fmt )} =
( $flags & 0x04 ) ? ( 8, $u64_fmt ) : ( 4, $u32_fmt );
@{$self}{qw( nv_len nv_fmt )} =
( $flags & 0x08 ) ? ( 10, "D$endian" ) : ( 8, "d$endian" );
$self->{ithreads} = !!( $flags & 0x10 );
$flags &= ~0x1f;
die sprintf "Cannot read %s - unrecognised flags %x\n", $path, $flags if $flags;
$self->{minus_1} = unpack $self->{uint_fmt}, pack $self->{uint_fmt}, -1;
$self->_read_u8 == 0 or die "Cannot read $path - 'zero' header field is not zero";
$self->_read_u8 == 0 or die "Cannot read $path - format version major unrecognised";
( $self->{format_minor} = $self->_read_u8 ) <= 2 or
die "Cannot read $path - format version minor unrecognised ($self->{format_minor})";
if( $self->{format_minor} < 1 ) {
warn "Loading an earlier format of dumpfile - SV MAGIC annotations may be incorrect\n";
}
$self->{perlver} = $self->_read_u32;
my $n_types = $self->_read_u8;
my @sv_sizes = unpack "(a3)*", my $tmp = $self->_read( $n_types * 3 );
$self->{sv_sizes} = [ map [ unpack "C C C", $_ ], @sv_sizes ];
if( $self->{format_minor} >= 2 ) {
my $n_ctxs = $self->_read_u8;
my @ctx_sizes = unpack "(a3)*", my $tmp = $self->_read( $n_ctxs * 3 );
$self->{ctx_sizes} = [ map [ unpack "C C C", $_ ], @ctx_sizes ];
}
# Roots
foreach (qw( undef yes no )) {
my $addr = $self->{"${_}_at"} = $self->_read_ptr;
my $class = "Devel::MAT::SV::\U$_";
$self->{uc $_} = $class->new( $self, $addr );
}
foreach ( 1 .. $self->_read_u32 ) {
my $root = $self->_read_str;
$self->{"${root}_at"} = $self->_read_ptr;
}
# Stack
my $stacksize = $self->_read_uint;
$self->{stack_at} = [ map { $self->_read_ptr } 1 .. $stacksize ];
# Heap
$self->{heap} = \my %heap;
while( my $sv = $self->_read_sv ) {
$heap{$sv->addr} = $sv;
my $pos = $fh->IO::Seekable::tell; # fully-qualified method for 5.010
$progress->( sprintf "Loading file %d of %d (%.2f%%)",
$pos, $filelen, 100*$pos / $filelen ) if $progress and (keys(%heap) % 5000) == 0;
}
# Contexts
$self->{contexts} = \my @contexts;
while( my $ctx = $self->_read_ctx ) {
push @contexts, $ctx;
}
$self->_fixup( %args ) unless $args{no_fixup};
return $self;
}
sub _fixup
{
my $self = shift;
my %args = @_;
my $progress = $args{progress};
my $heap = $self->{heap};
my $heap_total = scalar keys %$heap;
my %protosub_by_oproot;
my $count = 0;
while( my ( $addr ) = each %$heap ) {
my $sv = $heap->{$addr} or next;
# While dumping we weren't able to determine what ARRAYs were really
# PADLISTs. Now we can fix them up
$sv->_fixup if $sv->can( "_fixup" );
# Also identify the protosub of every oproot
if( $sv->type eq "CODE" and $sv->oproot and $sv->is_clone ) {
$protosub_by_oproot{$sv->oproot} = $sv;
}
$count++;
$progress->( sprintf "Fixing %d of %d (%.2f%%)",
$count, $heap_total, 100*$count / $heap_total ) if $progress and ($count % 5000) == 0;
}
# Now annotate the protosubs for closures
$count = 0;
while( my ( $addr ) = each %$heap ) {
my $sv = $heap->{$addr} or next;
if( $sv->type eq "CODE" and $sv->oproot and $sv->is_cloned ) {
if( my $protosub = $protosub_by_oproot{$sv->oproot} ) {
$sv->_set_protosub_at( $protosub->addr );
}
}
$count++;
$progress->( sprintf "Setting protosubs %d of %d (%.2f%%)",
$count, $heap_total, 100*$count / $heap_total ) if $progress and ($count % 5000) == 0;
}
# Walk the SUB contexts setting their true depth
if( $self->{format_minor} >= 2 ) {
my %prev_depth_by_cvaddr;
foreach my $ctx ( @{ $self->{contexts} } ) {
next unless $ctx->type eq "SUB";
my $cvaddr = $ctx->{cv_at};
$ctx->_set_depth( $prev_depth_by_cvaddr{$cvaddr} // $ctx->cv->depth );
$prev_depth_by_cvaddr{$cvaddr} = $ctx->olddepth;
}
}
return $self;
}
# Nicer interface to IO::Handle
sub _read
{
my $self = shift;
my ( $len ) = @_;
return "" if $len == 0;
defined( $self->{fh}->read( my $buf, $len ) ) or croak "Cannot read - $!";
return $buf;
}
sub _read_u8
{
my $self = shift;
$self->{fh}->read( my $buf, 1 ) or croak "Cannot read - $!";
return unpack "C", $buf;
}
sub _read_u32
{
my $self = shift;
$self->{fh}->read( my $buf, 4 ) or croak "Cannot read - $!";
return unpack $self->{u32_fmt}, $buf;
}
sub _read_u64
{
my $self = shift;
$self->{fh}->read( my $buf, 8 ) or croak "Cannot read - $!";
return unpack $self->{u64_fmt}, $buf;
}
sub _read_uint
{
my $self = shift;
$self->{fh}->read( my $buf, $self->{uint_len} ) or croak "Cannot read - $!";
return unpack $self->{uint_fmt}, $buf;
}
sub _read_ptr
{
my $self = shift;
$self->{fh}->read( my $buf, $self->{ptr_len} ) or croak "Cannot read - $!";
return unpack $self->{ptr_fmt}, $buf;
}
sub _read_ptrs
{
my $self = shift;
my ( $n ) = @_;
$self->{fh}->read( my $buf, $self->{ptr_len} * $n ) or croak "Cannot read - $!";
return unpack "$self->{ptr_fmt}$n", $buf;
}
sub _read_nv
{
my $self = shift;
$self->{fh}->read( my $buf, $self->{nv_len} ) or croak "Cannot read - $!";
return unpack $self->{nv_fmt}, $buf;
}
sub _read_str
{
my $self = shift;
my $len = $self->_read_uint;
return undef if $len == $self->{minus_1};
return $self->_read($len);
}
sub _read_bytesptrsstrs
{
my $self = shift;
my ( $nbytes, $nptrs, $nstrs ) = @_;
return
( $nbytes ? $self->_read( $nbytes ) : "" ),
( $nptrs ? [ $self->_read_ptrs( $nptrs ) ] : undef ),
( $nstrs ? [ map { $self->_read_str } 1 .. $nstrs ] : undef );
}
sub _read_sv
{
my $self = shift;
while(1) {
my $type = $self->_read_u8;
return if !$type;
if( $type == 0x80 ) {
# magic
# file minor format zero magic had a different layout
if( $self->{format_minor} >= 1 ) {
my $sv_addr = $self->_read_ptr;
my $type = chr $self->_read_u8;
my $flags = $self->_read_u8;
my ( $obj, $ptr ) = $self->_read_ptrs(2);
my $sv = $self->sv_at( $sv_addr );
$sv->more_magic( $type => $flags, $obj, $ptr );
}
else {
my ( $sv_addr, $obj ) = $self->_read_ptrs(2);
my $type = chr $self->_read_u8;
my $sv = $self->sv_at( $sv_addr );
# Legacy format didn't have flags, and didn't distinguish obj from ptr
# However, the only objs it ever saved were refcounted ones. Lets just
# pretend all of them are refcounted objects.
$sv->more_magic( $type => 0x01, $obj, 0 );
}
next;
}
my $pos = tell $self->{fh};
# First read the "common" header
my $sv = Devel::MAT::SV->new( $type, $self,
$self->_read_bytesptrsstrs( @{ $self->{sv_sizes}[0] } )
);
# Then the SV header
$sv->load(
$self->_read_bytesptrsstrs( @{ $self->{sv_sizes}[$type] } )
);
return $sv;
}
}
sub _read_ctx
{
my $self = shift;
my $type = $self->_read_u8;
return if !$type;
if( $self->{format_minor} >= 2 ) {
my $ctx = Devel::MAT::Context->new( $type, $self,
$self->_read_bytesptrsstrs( @{ $self->{ctx_sizes}[0] } )
);
$ctx->load(
$self->_read_bytesptrsstrs( @{ $self->{ctx_sizes}[$type] } )
);
return $ctx;
}
else {
return Devel::MAT::Context->load_v0_1( $type, $self );
}
}
=head1 METHODS
=cut
=head2 perlversion
$version = $df->perlversion
Returns the version of perl that the heap dump file was created by, as a
string in the form C<5.14.2>.
=cut
sub perlversion
{
my $self = shift;
my $v = $self->{perlver};
return join ".", $v>>24, ($v>>16) & 0xff, $v&0xffff;
}
=head2 endian
$endian = $df->endian
Returns the endian direction of the perl that the heap dump was created by, as
either C<big> or C<little>.
=cut
sub endian
{
my $self = shift;
return $self->{big_endian} ? "big" : "little";
}
=head2 uint_len
$len = $df->uint_len
Returns the length in bytes of a uint field of the perl that the heap dump was
created by.
=cut
sub uint_len
{
my $self = shift;
return $self->{uint_len};
}
=head2 ptr_len
$len = $df->ptr_len
Returns the length in bytes of a pointer field of the perl that the heap dump
was created by.
=cut
sub ptr_len
{
my $self = shift;
return $self->{ptr_len};
}
=head2 nv_len
$len = $df->nv_len
Returns the length in bytes of a double field of the perl that the heap dump
was created by.
=cut
sub nv_len
{
my $self = shift;
return $self->{nv_len};
}
=head2 ithreads
$ithreads = $df->ithreads
Returns a boolean indicating whether ithread support was enabled in the perl
that the heap dump was created by.
=cut
sub ithreads
{
my $self = shift;
return $self->{ithreads};
}
=head2 roots
%roots = $df->roots
Returns a key/value pair list giving the names and SVs at each of the roots.
=head2 roots_strong
%roots = $df->roots_strong
Returns a key/value pair list giving the names and SVs at each of the roots
that count as strong references.
=cut
sub _roots
{
my $self = shift;
return map { +$ROOTDESC{$_} => $self->sv_at( $self->{"${_}_at"} ) } @ROOTS;
}
sub roots
{
my $self = shift;
return pairmap { substr( $a, 1 ) => $b } $self->_roots;
}
sub roots_strong
{
my $self = shift;
return pairmap { $a =~ m/^\+(.*)/ ? ( $1 => $b ) : () } $self->_roots;
}
=head2 ROOTS
$sv = $df->ROOT
For each of the root names given below, a method exists with that name which
returns the SV at that root:
main_cv
defstash
mainstack
beginav
checkav
unitcheckav
initav
endav
strtabhv
envgv
incgv
statgv
statname
tmpsv
defgv
argvgv
argvoutgv
argvout_stack
fdpidav
preambleav
modglobalhv
regex_padav
sortstash
firstgv
secondgv
debstash
stashcache
isarev
registered_mros
=cut
=head2 heap
@svs = $df->heap
Returns all of the heap-allocated SVs, in no particular order
=cut
sub heap
{
my $self = shift;
return values %{ $self->{heap} };
}
=head2 contexts
@ctxs = $df->contexts
Returns a list of L<Devel::MAT::Context> objects representing the call context
stack in the dumpfile.
=cut
sub contexts
{
my $self = shift;
return @{ $self->{contexts} };
}
=head2 sv_at
$sv = $df->sv_at( $addr )
Returns the SV at the given address, or C<undef> if one does not exist.
(Note that this is unambiguous, as a Perl-level C<undef> is represented by the
immortal C<Devel::MAT::SV::UNDEF> SV).
=cut
sub sv_at
{
my $self = shift;
my ( $addr ) = @_;
return undef if !$addr;
return $self->{UNDEF} if $addr == $self->{undef_at};
return $self->{YES} if $addr == $self->{yes_at};
return $self->{NO} if $addr == $self->{no_at};
return $self->{heap}{$addr};
}
=head1 AUTHOR
Paul Evans <leonerd@leonerd.org.uk>
=cut
0x55AA;