The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w

# Try opening libperl.a with nm, and verifying it has the kind of
# symbols we expect, and no symbols we should avoid.
#
# Fail softly, expect things only on known platforms:
# - linux
# - darwin (OS X), both x86 and ppc
# - freebsd
# and on other platforms, and if things seem odd, just give up (skip_all).
#
# Also, if the rarely-used builds options -DPERL_GLOBAL_STRUCT or
# -DPERL_GLOBAL_STRUCT_PRIVATE are used, verify that they did what
# they were meant to do, hide the global variables (see perlguts for
# the details).
#
# Debugging tip: nm output (this script's input) can be faked by
# giving one command line argument for this script: it should be
# either the filename to read, or "-" for STDIN.  You can also append
# "@style" (where style is a supported nm style, like "gnu" or "darwin")
# to this filename for "cross-parsing".
#
# Some terminology:
# - "text" symbols are code
# - "data" symbols are data (duh), with subdivisions:
#   - "bss": (Block-Started-by-Symbol: originally from IBM assembler...),
#     uninitialized data, which often even doesn't exist in the object
#     file as such, only its size does, which is then created on demand
#     by the loader
#  - "const": initialized read-only data, like string literals
#  - "common": uninitialized data unless initialized...
#    (the full story is too long for here, see "man nm")
#  - "data": initialized read-write data
#    (somewhat confusingly below: "data data", but it makes code simpler)
#  - "undefined": external symbol referred to by an object,
#    most likely a text symbol.  Can be either a symbol defined by
#    a Perl object file but referred to by other Perl object files,
#    or a completely external symbol from libc, or other system libraries.

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
    require "./test.pl";
}

use strict;

use Config;

if ($Config{cc} =~ /g\+\+/) {
    # XXX Could use c++filt, maybe.
    skip_all "on g++";
}

my $libperl_a;

for my $f (qw(../libperl.a libperl.a)) {
  if (-f $f) {
    $libperl_a = $f;
    last;
  }
}

unless (defined $libperl_a) {
  skip_all "no libperl.a";
}

print "# \$^O = $^O\n";
print "# \$Config{cc} = $Config{cc}\n";
print "# libperl = $libperl_a\n";

my $nm;
my $nm_opt = '';
my $nm_style;
my $nm_fh;
my $nm_err_tmp = "libperl$$";

END {
    # this is still executed when we skip_all above, avoid a warning
    unlink $nm_err_tmp if $nm_err_tmp;
}

my $fake_input;
my $fake_style;

if (@ARGV == 1) {
    $fake_input = shift @ARGV;
    print "# Faking nm output from $fake_input\n";
    if ($fake_input =~ s/\@(.+)$//) {
        $fake_style = $1;
        print "# Faking nm style from $fake_style\n";
        if ($fake_style eq 'gnu' ||
            $fake_style eq 'linux' ||
            $fake_style eq 'freebsd') {
            $nm_style = 'gnu'
        } elsif ($fake_style eq 'darwin' || $fake_style eq 'osx') {
            $nm_style = 'darwin'
        } else {
            die "$0: Unknown explicit nm style '$fake_style'\n";
        }
    }
}

unless (defined $nm_style) {
    if ($^O eq 'linux') {
        # The 'gnu' style could be equally well be called 'bsd' style,
        # since the output format of the GNU binutils nm is really BSD.
        $nm_style = 'gnu';
    } elsif ($^O eq 'freebsd') {
        $nm_style = 'gnu';
    } elsif ($^O eq 'darwin') {
        $nm_style = 'darwin';
    }
}

if (defined $nm_style) {
    if ($nm_style eq 'gnu') {
        $nm = '/usr/bin/nm';
    } elsif ($nm_style eq 'darwin') {
        $nm = '/usr/bin/nm';
        # With the -m option we get better information than the BSD-like
        # default: with the default, a lot of symbols get dumped into 'S'
        # or 's', for example one cannot tell the difference between const
        # and non-const data symbols.
        $nm_opt = '-m';
    } else {
        die "$0: Unexpected nm style '$nm_style'\n";
    }
}

unless (defined $nm) {
  skip_all "no nm";
}

unless (defined $nm_style) {
  skip_all "no nm style";
}

print "# nm = $nm\n";
print "# nm_style = $nm_style\n";
print "# nm_opt = $nm_opt\n";

unless (-x $nm) {
    skip_all "no executable nm $nm";
}

if ($nm_style eq 'gnu' && !defined $fake_style) {
    open(my $gnu_verify, "$nm --version|") or
        skip_all "nm failed: $!";
    my $gnu_verified;
    while (<$gnu_verify>) {
        if (/^GNU nm/) {
            $gnu_verified = 1;
            last;
        }
    }
    unless ($gnu_verified) {
        skip_all "no GNU nm";
    }
}

if (defined $fake_input) {
    if ($fake_input eq '-') {
        open($nm_fh, "<&STDIN") or
            skip_all "Duping STDIN failed: $!";
    } else {
        open($nm_fh, "<", $fake_input) or
            skip_all "Opening '$fake_input' failed: $!";
    }
    undef $nm_err_tmp; # In this case there will be no nm errors.
} else {
    open($nm_fh, "$nm $nm_opt $libperl_a 2>$nm_err_tmp |") or
        skip_all "$nm $nm_opt $libperl_a failed: $!";
}

sub is_perlish_symbol {
    $_[0] =~ /^(?:PL_|Perl|PerlIO)/;
}

# XXX Implement "internal test" for this script (option -t?)
# to verify that the parsing does what it's intended to.

sub nm_parse_gnu {
    my $symbols = shift;
    my $line = $_;
    if (m{^(\w+\.o):$}) {
        # object file name
        $symbols->{obj}{$1}++;
        $symbols->{o} = $1;
        return;
    } else {
        die "$0: undefined current object: $line"
            unless defined $symbols->{o};
        # 64-bit systems have 16 hexdigits, 32-bit systems have 8.
        if (s/^[0-9a-f]{8}(?:[0-9a-f]{8})? //) {
            if (/^[Rr] (\w+)$/) {
                # R: read only (const)
                $symbols->{data}{const}{$1}{$symbols->{o}}++;
            } elsif (/^r .+$/) {
                # Skip local const (read only).
            } elsif (/^[Tti] (\w+)(\..+)?$/) {
                $symbols->{text}{$1}{$symbols->{o}}++;
            } elsif (/^C (\w+)$/) {
                $symbols->{data}{common}{$1}{$symbols->{o}}++;
            } elsif (/^[BbSs] (\w+)(\.\d+)?$/) {
                # Bb: uninitialized data (bss)
                # Ss: uninitialized data "for small objects"
                $symbols->{data}{bss}{$1}{$symbols->{o}}++;
            } elsif (/^0{16} D _LIB_VERSION$/) {
                # Skip the _LIB_VERSION (not ours, probably libm)
            } elsif (/^[DdGg] (\w+)$/) {
                # Dd: initialized data
                # Gg: initialized "for small objects"
                $symbols->{data}{data}{$1}{$symbols->{o}}++;
            } elsif (/^. \.?(\w+)$/) {
                # Skip the unknown types.
                print "# Unknown type: $line ($symbols->{o})\n";
            }
            return;
        } elsif (/^ {8}(?: {8})? U _?(\w+)$/) {
            my ($symbol) = $1;
            return if is_perlish_symbol($symbol);
            $symbols->{undef}{$symbol}{$symbols->{o}}++;
            return;
	}
    }
    print "# Unexpected nm output '$line' ($symbols->{o})\n";
}

sub nm_parse_darwin {
    my $symbols = shift;
    my $line = $_;
    if (m{^(?:.+)?libperl\.a\((\w+\.o)\):$}) {
        # object file name
        $symbols->{obj}{$1}++;
        $symbols->{o} = $1;
        return;
    } else {
        die "$0: undefined current object: $line" unless defined $symbols->{o};
        # 64-bit systems have 16 hexdigits, 32-bit systems have 8.
        if (s/^[0-9a-f]{8}(?:[0-9a-f]{8})? //) {
            # String literals can live in different sections
            # depending on the compiler and os release, assumedly
            # also linker flags.
            if (/^\(__TEXT,__(?:const|cstring|literal\d+)\) (?:non-)?external _?(\w+)(\.\w+)?$/) {
                my ($symbol, $suffix) = ($1, $2);
                # Ignore function-local constants like
                # _Perl_av_extend_guts.oom_array_extend
                return if defined $suffix && /__TEXT,__const/;
                # Ignore the cstring unnamed strings.
                return if $symbol =~ /^L\.str\d+$/;
                $symbols->{data}{const}{$symbol}{$symbols->{o}}++;
            } elsif (/^\(__TEXT,__text\) (?:non-)?external _(\w+)$/) {
                $symbols->{text}{$1}{$symbols->{o}}++;
            } elsif (/^\(__DATA,__(const|data|bss|common)\) (?:non-)?external _(\w+)(\.\w+)?$/) {
                my ($dtype, $symbol, $suffix) = ($1, $2, $3);
                # Ignore function-local constants like
                # _Perl_pp_gmtime.dayname
                return if defined $suffix;
                $symbols->{data}{$dtype}{$symbol}{$symbols->{o}}++;
            } elsif (/^\(__DATA,__const\) non-external _\.memset_pattern\d*$/) {
                # Skip this, whatever it is (some inlined leakage from
                # darwin libc?)
            } elsif (/^\(__TEXT,__eh_frame/) {
                # Skip the eh_frame (exception handling) symbols.
                return;
            } elsif (/^\(__\w+,__\w+\) /) {
                # Skip the unknown types.
                print "# Unknown type: $line ($symbols->{o})\n";
            }
            return;
        } elsif (/^ {8}(?: {8})? \(undefined(?: \[lazy bound\])?\) external _?(.+)/) {
            # darwin/ppc marks most undefined text symbols
            # as "[lazy bound]".
            my ($symbol) = $1;
            return if is_perlish_symbol($symbol);
            $symbols->{undef}{$symbol}{$symbols->{o}}++;
            return;
        }
    }
    print "# Unexpected nm output '$line' ($symbols->{o})\n";
}

my $nm_parse;

if ($nm_style eq 'gnu') {
    $nm_parse = \&nm_parse_gnu;
} elsif ($nm_style eq 'darwin') {
    $nm_parse = \&nm_parse_darwin;
}

unless (defined $nm_parse) {
    skip_all "no nm parser ($nm_style $nm_style, \$^O $^O)";
}

my %symbols;

while (<$nm_fh>) {
    next if /^$/;
    chomp;
    $nm_parse->(\%symbols);
}

# use Data::Dumper; print Dumper(\%symbols);

if (keys %symbols == 0) {
    skip_all "no symbols\n";
}

# These should always be true for everyone.

ok($symbols{obj}{'pp.o'}, "has object pp.o");
ok($symbols{text}{'Perl_peep'}, "has text Perl_peep");
ok($symbols{text}{'Perl_pp_uc'}{'pp.o'}, "has text Perl_pp_uc in pp.o");
ok(exists $symbols{data}{const}, "has data const symbols");
ok($symbols{data}{const}{PL_no_mem}{'globals.o'}, "has PL_no_mem");

my $DEBUGGING = $Config{ccflags} =~ /-DDEBUGGING/ ? 1 : 0;

my $GS  = $Config{ccflags} =~ /-DPERL_GLOBAL_STRUCT\b/ ? 1 : 0;
my $GSP = $Config{ccflags} =~ /-DPERL_GLOBAL_STRUCT_PRIVATE/ ? 1 : 0;

print "# GS  = $GS\n";
print "# GSP = $GSP\n";

my %data_symbols;

for my $dtype (sort keys %{$symbols{data}}) {
    for my $symbol (sort keys %{$symbols{data}{$dtype}}) {
        $data_symbols{$symbol}++;
    }
}

# The following tests differ between vanilla vs $GSP or $GS.

if ($GSP) {
    print "# -DPERL_GLOBAL_STRUCT_PRIVATE\n";
    ok(!exists $data_symbols{PL_hash_seed}, "has no PL_hash_seed");
    ok(!exists $data_symbols{PL_ppaddr}, "has no PL_ppaddr");

    ok(! exists $symbols{data}{bss}, "has no data bss symbols");
    ok(! exists $symbols{data}{data} ||
            # clang with ASAN seems to add this symbol to every object file:
            !grep($_ ne '__unnamed_1', keys %{$symbols{data}{data}}),
        "has no data data symbols");
    ok(! exists $symbols{data}{common}, "has no data common symbols");

    # -DPERL_GLOBAL_STRUCT_PRIVATE should NOT have
    # the extra text symbol for accessing the vars
    # (as opposed to "just" -DPERL_GLOBAL_STRUCT)
    ok(! exists $symbols{text}{Perl_GetVars}, "has no Perl_GetVars");
} elsif ($GS) {
    print "# -DPERL_GLOBAL_STRUCT\n";
    ok(!exists $data_symbols{PL_hash_seed}, "has no PL_hash_seed");
    ok(!exists $data_symbols{PL_ppaddr}, "has no PL_ppaddr");

    ok(! exists $symbols{data}{bss}, "has no data bss symbols");

    # These PerlIO data symbols are left visible with
    # -DPERL_GLOBAL_STRUCT (as opposed to -DPERL_GLOBAL_STRUCT_PRIVATE)
    my @PerlIO =
        qw(
           PerlIO_byte
           PerlIO_crlf
           PerlIO_pending
           PerlIO_perlio
           PerlIO_raw
           PerlIO_remove
           PerlIO_stdio
           PerlIO_unix
           PerlIO_utf8
          );

    # PL_magic_vtables is const with -DPERL_GLOBAL_STRUCT_PRIVATE but
    # otherwise not const -- because of SWIG which wants to modify
    # the table.  Evil SWIG, eeevil.

    # my_cxt_index is used with PERL_IMPLICIT_CONTEXT, which
    # -DPERL_GLOBAL_STRUCT has turned on.
    eq_array([sort keys %{$symbols{data}{data}}],
             [sort('PL_VarsPtr',
                   @PerlIO,
                   'PL_magic_vtables',
                   'my_cxt_index')],
             "data data symbols");

    # Only one data common symbol, our "supervariable".
    eq_array([sort keys %{$symbols{data}{common}}],
             ['PL_Vars'],
             "data common symbols");

    ok($symbols{data}{data}{PL_VarsPtr}{'globals.o'}, "has PL_VarsPtr");
    ok($symbols{data}{common}{PL_Vars}{'globals.o'}, "has PL_Vars");

    # -DPERL_GLOBAL_STRUCT has extra text symbol for accessing the vars.
    ok($symbols{text}{Perl_GetVars}{'util.o'}, "has Perl_GetVars");
} else {
    print "# neither -DPERL_GLOBAL_STRUCT nor -DPERL_GLOBAL_STRUCT_PRIVATE\n";

    if ( !$symbols{data}{common} ) {
        # This is likely because Perl was compiled with
        # -Accflags="-fno-common"
        $symbols{data}{common} = $symbols{data}{bss};
    }

    ok($symbols{data}{common}{PL_hash_seed}{'globals.o'}, "has PL_hash_seed");
    ok($symbols{data}{data}{PL_ppaddr}{'globals.o'}, "has PL_ppaddr");

    # None of the GLOBAL_STRUCT* business here.
    ok(! exists $symbols{data}{data}{PL_VarsPtr}, "has no PL_VarsPtr");
    ok(! exists $symbols{data}{common}{PL_Vars}, "has no PL_Vars");
    ok(! exists $symbols{text}{Perl_GetVars}, "has no Perl_GetVars");
}

ok(keys %{$symbols{undef}}, "has undefined symbols");

# memchr, memcmp, memcpy should be used all over the place.
#
# chmod, socket, getenv, sigaction, time are system/library
# calls that should each see at least one use.
my @good = qw(memchr memcmp memcpy
              chmod socket getenv sigaction time);
if ($Config{usedl}) {
    push @good, 'dlopen';
}
for my $good (@good) {
    my @o = exists $symbols{undef}{$good} ?
        sort keys %{ $symbols{undef}{$good} } : ();
    ok(@o, "uses $good (@o)");
}

# gets is horribly unsafe.
#
# fgets should not be used (Perl has its own API), even without perlio.
#
# tmpfile is unsafe.
#
# strcpy, strcat, strncpy, strncpy are unsafe.
#
# sprintf and vsprintf should not be used because
# Perl has its own safer and more portable implementations.
# (One exception: for certain floating point outputs
# the native sprintf is still used, see below.)
#
# XXX: add atoi() to @bad - unsafe and undefined failure modes.
#
my @bad = qw(gets fgets
             tmpfile
             strcpy strcat strncpy strncat tmpfile
             sprintf vsprintf);
for my $bad (@bad) {
    my @o = exists $symbols{undef}{$bad} ?
        sort keys %{ $symbols{undef}{$bad} } : ();
    # While sprintf() is bad in the general case,
    # some platforms implement Gconvert via sprintf, in sv.o.
    if ($bad eq 'sprintf' &&
        $Config{d_Gconvert} =~ /^sprintf/ &&
        @o == 1 && $o[0] eq 'sv.o') {
      SKIP: {
        skip("uses sprintf for Gconvert in sv.o");
      }
    } else {
        is(@o, 0, "uses no $bad (@o)");
    }
}

if (defined $nm_err_tmp) {
    if (open(my $nm_err_fh, $nm_err_tmp)) {
        my $error;
        while (<$nm_err_fh>) {
            # OS X has weird error where nm warns about
            # "no name list" but then outputs fine.
            if (/nm: no name list/ && $^O eq 'darwin') {
                print "# $^O ignoring $nm output: $_";
                next;
            }
            warn "$0: Unexpected $nm error: $_";
            $error++;
        }
        die "$0: Unexpected $nm errors\n" if $error;
    } else {
        warn "Failed to open '$nm_err_tmp': $!\n";
    }
}

done_testing();