The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Copyright (C) 2004-2012, Parrot Foundation.

package Parrot::Headerizer;

=head1 NAME

Parrot::Headerizer - Parrot header generation functionality

=head1 SYNOPSIS

    use Parrot::Headerizer;

    $headerizer = Parrot::Headerizer->new( {
        macro_match => $macro_match, # optional
    } );

    $headerizer->get_sources(@ofiles);
    $headerizer->process_sources();
    $headerizer->print_final_message();
    $headerizer->print_warnings();

    @function_decls = $headerizer->extract_function_declarations($buf);
    $escaped_decl = $headerizer->generate_documentation_signature($function_decl);

=head1 DESCRIPTION

C<Parrot::Headerizer> knows how to extract all kinds of information out
of C-language files.  Its methods are used in F<tools/dev/headerizer.pl> and
F<t/codingstd/c_function_docs.t>.

=head1 METHODS

=cut

use strict;
use warnings;
use Data::Dumper;$Data::Dumper::Indent=1;
use Scalar::Util qw( reftype );
use lib qw( lib );
use Parrot::Config;
use Parrot::Headerizer::Functions qw(
    read_file
    write_file
    qualify_sourcefile
    replace_pod_item
    no_both_PARROT_EXPORT_and_PARROT_INLINE
    validate_prototype_args
    no_both_static_and_PARROT_EXPORT
    handle_split_declaration
    clean_args_for_declarations
    handle_modified_args
    add_newline_if_multiline
    add_asserts_to_declarations
    func_modifies
    add_headerizer_markers
);

=head2 C<new()>

=over 4

=item * Purpose

Constructor of headerizer object.  The object is initialized with a list of
valid C<PARROT_XXX> macros.

=item * Arguments

    $headerizer = Parrot::Headerizer->new();

No mandatory arguments, but one special use-case takes a hash reference.

    $headerizer = Parrot::Headerizer->new( {
        macro_match => $macro_match, # optional
    } );

Currently, the only meaningful element in that hash reference is
C<macro_match>.  See C<process_sources()> below for discussion of how that is
used.

=item * Return Value

Parrot::Headerizer object.

=back

=cut

sub new {
    my ($class, $args) = @_;
    if (defined $args) {
        die 'Argument to Parrot::Headerizer must be hashref'
            unless reftype($args) eq 'HASH';
    }
    else {
        $args = {};
    }
    $args->{macro_match} = undef unless defined $args->{macro_match};
    $args->{warnings} = {};
    $args->{message} = '';
    $args->{valid_macros} = { map { ( $_, 1 ) } qw(
        PARROT_EXPORT
        PARROT_INLINE
        PARROT_NOINLINE

        PARROT_CAN_RETURN_NULL
        PARROT_CANNOT_RETURN_NULL

        PARROT_IGNORABLE_RESULT
        PARROT_WARN_UNUSED_RESULT

        PARROT_PURE_FUNCTION
        PARROT_CONST_FUNCTION

        PARROT_DOES_NOT_RETURN
        PARROT_DOES_NOT_RETURN_WHEN_FALSE

        PARROT_MALLOC
        PARROT_OBSERVER

        PARROT_HOT
        PARROT_COLD

        PARROT_API

        PARROT_NO_ADDRESS_SAFETY_ANALYSIS
        )
    };
    return bless $args, $class;
}

=head2 C<get_sources()>

=over 4

=item * Purpose

Identify the source code files which need to have header information
extracted.  The header information is extracted and stored inside the
headerizer object in appropriate ways.

=item * Arguments

    $headerizer->get_sources(@ofiles);

List of names of C object (C<.o>) files.

=item * Return Value

No defined return value.

=back

=cut

sub get_sources {
    my $self = shift;
    my @ofiles = @_;
    my %sourcefiles;
    my %sourcefiles_with_statics;
    my %api;
    # Walk the object files and find corresponding source (either .c or .pmc)
    for my $ofile (@ofiles) {

        # Skip files in the src/ops/ subdirectory.
        next if $ofile =~ m/^\Qsrc$PConfig{slash}ops\E/ || # if run by hand...
                $ofile =~ m{^src/ops};                     # ... or by makefile

        $ofile =~ s{\\}{/}g; # Normalize path separators

        my $is_yacc = ($ofile =~ /\.y$/);
        if ( !$is_yacc ) {
            my $sfile = $ofile;
            $sfile    =~ s/\Q$PConfig{o}\E$/.s/;
            next if -f $sfile;
        }

        my ($sourcefile, $source_code, $hfile) =
            qualify_sourcefile( {
                ofile           => $ofile,
                PConfig         => \%PConfig,
                is_yacc         => $is_yacc,
            } );

        my @decls;
        if ( $self->{macro_match} ) {
            @decls = $self->extract_function_declarations( $source_code );
        }
        else {
            @decls =
            $self->extract_function_declarations_and_update_source( $sourcefile );
        }

        for my $decl (@decls) {
            my $components =
                $self->function_components_from_declaration( $sourcefile, $decl );
            push( @{ $sourcefiles{$hfile}->{$sourcefile} }, $components )
                unless $hfile eq 'none';
            push( @{ $sourcefiles_with_statics{$sourcefile} }, $components )
                if $components->{is_static};
            if ( $self->{macro_match} ) {
                if ( grep { $_ eq $self->{macro_match} } @{$components->{macros}} ) {
                    push( @{ $api{$sourcefile} }, $components );
                }
            }
        }
    }    # for @cfiles
    $self->{sourcefiles} = \%sourcefiles;
    $self->{sourcefiles_with_statics} = \%sourcefiles_with_statics;
    $self->{api} = \%api;
}

=head2 C<extract_function_declarations()>

=over 4

=item * Purpose

Extracts the function declarations from the text argument, and returns an
array of strings containing the function declarations.

=item * Arguments

    @function_decls = $headerizer->extract_function_declarations($text)

String holding the slurped-in content of a source code file.

=item * Return Value

List of strings holding function declarations.

=item * Comment

Called within C<get_sources()>, but also called on its own within
F<t/codingstd/c_function_docs.t>.

=back

=cut

sub extract_function_declarations {
    my $self = shift;
    my $text = shift;

    # Only check the YACC C code if we find what looks like YACC file
    $text =~ s/%\{(.*)%\}.*/$1/sm;

    # Drop all text after HEADERIZER STOP
    $text =~ s{/\*\s*HEADERIZER STOP.+}{}s;

    # Drop begin/end PMC HEADER sections
    $text =~ s{BEGIN_PMC_HEADER_PREAMBLE}{}sx;
    $text =~ s{END_PMC_HEADER_PREAMBLE}{}sx;

    # Strip blocks of comments
    $text =~ s{^/\*.*?\*/}{}mxsg;

    # Strip # compiler directives
    $text =~ s{^#(\\\n|.)*}{}mg;

    # Strip code blocks
    $text =~ s/^{.+?^}//msg;

    # Split on paragraphs
    my @funcs = split /\n{2,}/, $text;

    # If it doesn't start in the left column, it's not a func
    @funcs = grep { /^\S/ } @funcs;

    # Typedefs, enums and externs are no good
    @funcs = grep { !/^(?:typedef|enum|extern)\b/ } @funcs;

    # Structs are OK if they're not alone on the line
    @funcs = grep { !/^struct.+;\n/ } @funcs;

    # Structs are OK if they're not being defined
    @funcs = grep { !/^(?:static\s+)?struct.+{\n/ } @funcs;

    # Ignore magic function name YY_DECL
    @funcs = grep { !/YY_DECL/ } @funcs;

    # Ignore anything with magic words HEADERIZER SKIP
    @funcs = grep { !m{/\*\s*HEADERIZER SKIP\s*\*/} } @funcs;

    # pmclass declarations in PMC files are no good
    @funcs = grep { !m{^pmclass } } @funcs;

    # Variables are of no use to us
    @funcs = grep { !/=/ } @funcs;

    # Get rid of any blocks at the end
    s/\s*{.*//s for @funcs;

    # Toast anything non-whitespace
    @funcs = grep { /\S/ } @funcs;

    # If it's got a semicolon, it's not a function header
    @funcs = grep { !/;/ } @funcs;

    # remove any remaining }'s
    @funcs = grep {! /^}/} @funcs;

    chomp @funcs;

    return @funcs;
}

=head2 C<extract_function_declaration_and_update_source()>

=over 4

=item * Purpose

Extract all the function declarations from a source code file and update the
comment blocks within it.

=item * Arguments

    @function_declarations =
        $headerizer->extract_function_declaration_and_update_source($cfile_name);

String holding source code filename.

=item * Return Value

List of strings holding function declarations.

=item * Comment

Called within C<get_sources()>.  Wraps around
C<extract_function_declarations()> but differs from that method by generating
signatures, correcting POD, etc.

=back

=cut

sub extract_function_declarations_and_update_source {
    my $self = shift;
    my $cfile_name = shift;

    open( my $fhin, '<', $cfile_name ) or die "Can't open $cfile_name: $!";
    my $text = join( '', <$fhin> );
    close $fhin;

    my @func_declarations = $self->extract_function_declarations( $text );
    for my $decl ( @func_declarations ) {
        my $specs = $self->function_components_from_declaration( $cfile_name, $decl );
        my $name = $specs->{name};

        my $heading = $self->generate_documentation_signature($decl);
        $text = replace_pod_item( {
            text        => $text,
            name        => $name,
            heading     => $heading,
            cfile_name  => $cfile_name,
        } );
    }
    open( my $fhout, '>', $cfile_name ) or die "Can't create $cfile_name: $!";
    print {$fhout} $text;
    close $fhout;

    return @func_declarations;
}

=head2 C<function_components_from_declaration($file, $proto)>

=over 4

=item * Purpose

Creates a data structure in which information about a particular function can
be looked up.

=item * Arguments

List of two strings, the filename and the function declaration.

=item * Return Value

Returns a reference to a hash of these function components:

    file
    name
    args
    macros
    is_static
    is_inline
    is_api
    is_ignorable
    return_type

=item * Comment

Currently called within both
C<extract_function_declarations()> and
C<extract_function_declarations_and_update_source()>.

=back

=cut

sub function_components_from_declaration {
    my $self  = shift;
    my $file  = shift;
    my $proto = shift;

    my @lines = split( /\n/, $proto );
    chomp @lines;

    my @macros;
    my $parrot_api;
    my $parrot_inline;

    while ( @lines && ( $lines[0] =~ /^PARROT_/ ) ) {
        my $macro = shift @lines;
        if ( $macro eq 'PARROT_EXPORT' ) {
            $parrot_api = 1;
        }
        elsif ( $macro eq 'PARROT_INLINE' ) {
            $parrot_inline = 1;
        }
        push( @macros, $macro );
    }

    my $return_type = shift @lines;
    my $args = join( ' ', @lines );

    $args =~ s/\s+/ /g;
    $args =~ s{([^(]+)\s*\((.+)\);?}{$2}
        or die qq{Couldn't handle "$proto" in $file\n};

    my $name = $1;
    $args = $2;

    no_both_PARROT_EXPORT_and_PARROT_INLINE( {
        file            => $file,
        name            => $name,
        parrot_inline   => $parrot_inline,
        parrot_api      => $parrot_api,
    } );

    my @args = validate_prototype_args( $args, $proto );

    my $is_static;
    ($return_type, $is_static) = no_both_static_and_PARROT_EXPORT( {
        file            => $file,
        name            => $name,
        return_type     => $return_type,
        parrot_api      => $parrot_api,
    } );

    my $is_ignorable = 0;
    my %macros;
    for my $macro (@macros) {
        $macros{$macro} = 1;
        if (not $self->valid_macro($macro)) {
            $self->squawk( $file, $name, "Invalid macro $macro" );
        }
        if ( $macro eq 'PARROT_IGNORABLE_RESULT' ) {
            $is_ignorable = 1;
        }
    }
    $self->check_pointer_return_type( {
        return_type     => $return_type,
        macros          => \%macros,
        name            => $name,
        file            => $file,
    } );

    return {
        file         => $file,
        name         => $name,
        args         => \@args,
        macros       => \@macros,
        is_static    => $is_static,
        is_inline    => $parrot_inline,
        is_api       => $parrot_api,
        is_ignorable => $is_ignorable,
        return_type  => $return_type,
    };
}

=head2 C<check_pointer_return_type()>

=over 4

=item * Purpose

Performs some validation in the case where a function's return value is a
pointer.

=item * Arguments

    $headerizer->check_pointer_return_type( {
        return_type     => $return_type,
        macros          => \%macros,
        name            => $name,
        file            => $file,
    } );

Reference to a hash with the four elements listed above.

=item * Return Value

No defined return value.

=back

=cut

sub check_pointer_return_type {
    my ($self, $args) = @_;
    if ( $args->{return_type} =~ /\*/ ) {
        if ( !$args->{macros}->{PARROT_CAN_RETURN_NULL} &&
             !$args->{macros}->{PARROT_CANNOT_RETURN_NULL} ) {
            if ( $args->{name} !~ /^yy/ ) { # Don't complain about lexer-created functions
                $self->squawk( $args->{file}, $args->{name},
                    'Returns a pointer, but no PARROT_CAN(NOT)_RETURN_NULL macro found.' );
            }
        }
        elsif ( $args->{macros}->{PARROT_CAN_RETURN_NULL} &&
                $args->{macros}->{PARROT_CANNOT_RETURN_NULL} ) {
            $self->squawk( $args->{file}, $args->{name},
                q{Can't have both PARROT_CAN_RETURN_NULL and PARROT_CANNOT_RETURN_NULL together.} );
        }
    }
}

=head2 C<generate_documentation_signature()>

=over 4

=item * Purpose

Given an extracted function signature, return a modified
version suitable for inclusion in POD documentation.

=item * Arguments

    $heading = $headerizer->generate_documentation_signature($decl);

String holding a function declaration.

=item * Return Value

String holding a function header, split over multiple lines as needed.

=back

=cut

sub generate_documentation_signature {
    my $self = shift;
    my $function_decl = shift;

    # strip out any PARROT_* function modifiers
    foreach my $key ($self->valid_macros) {
        $function_decl =~ s/^$key$//m;
    }

    $function_decl =~ s/^\s+//g;
    $function_decl =~ s/\s+/ /g;

    # strip out any ARG* modifiers
    $function_decl =~ s/ARG(?:IN|IN_NULLOK|OUT|OUT_NULLOK|MOD|MOD_NULLOK|FREE|FREE_NOTNULL)\((.*?)\)/$1/g;

    # strip out the SHIM modifier
    $function_decl =~ s/SHIM\((.*?)\)/$1/g;

    # strip out the NULL modifiers
    $function_decl =~ s/(?:NULLOK|NOTNULL)\((.*?)\)/$1/g;

    # SHIM_INTERP is still a PARROT_INTERP
    $function_decl =~ s/SHIM_INTERP/PARROT_INTERP/g;

    # wrap with POD
    $function_decl = "=item C<$function_decl>";

    # Wrap long lines.
    my $line_len = 80;
    if (length($function_decl)<= $line_len) {
        return $function_decl;
    }
    else {
        return handle_split_declaration(
            $function_decl,
            $line_len,
        );
    }
}

=head2 C<valid_macro()>

=over 4

=item * Purpose

Tests the validity of a given macro.

=item * Arguments

    $headerizer->valid_macro( $macro )

String holding a macro.

=item * Return Value

Boolean: true value for valid macro; false value for invalid macro.

=back

=cut

sub valid_macro {
    my $self = shift;
    my $macro = shift;

    return exists $self->{valid_macros}{$macro};
}

=head2 C<valid_macros()>

=over 4

=item * Purpose

Identify all valid macros whose names are of the form C<PARROT_XXX>.

=item * Arguments

    @marcros = $headerizer->valid_macros();

None.

=item * Return Value

List of all the valid C<PARROT_XXX> macros.

=back

=cut

sub valid_macros {
    my $self = shift;

    my @macros = sort keys %{$self->{valid_macros}};

    return @macros;
}

=head2 C<squawk()>

=over 4

=item * Purpose

Builds a data structure with headerizer-specific ways of complaining if
something went wrong.

=item * Arguments

    $headerizer->squawk($file, $func, $error);

List of 3 arguments:  the file containing the error; the function containing
the error; the text of the error message.

=item * Return Value

Undefined value.

=item * Comment

C<squawk()> does not print any warnings or errors itself.  Use
C<print_warnings()> to report those.

=back

=cut

sub squawk {
    my $self  = shift;
    my $file  = shift;
    my $func  = shift;
    my $error = shift;

    push( @{ $self->{warnings}{$file}{$func} }, $error );

    return;
}

=head2 C<process_sources()>

=over 4

=item * Purpose

Once the source files needing headerization have been identified, this method
serves as a wrapper around that headerization.  Both C<.h> and C<.c> files are
handled.

=item * Arguments

None.

=item * Return Value

None.

=item * Comment

If a hash reference with an element named C<macro_match> was passed to
C<new()>, C<process_sources()> merely prints to C<STDOUT> a list of files and
functions using the macro named as the value of that element.  No
headerization or revision of headers is performed.

=back

=cut

sub process_sources {
    my ($self) = @_;
    my %sourcefiles = %{$self->{sourcefiles}};
    my %sourcefiles_with_statics = %{$self->{sourcefiles_with_statics}};
    my %api = %{$self->{api}};
    if ( $self->{macro_match} ) {
        my $nfuncs = 0;
        for my $cfile ( sort keys %api ) {
            my @funcs = sort { $a->{name} cmp $b->{name} } @{$api{$cfile}};
            print "$cfile\n";
            for my $func ( @funcs ) {
                print "    $func->{name}\n";
                ++$nfuncs;
            }
        }
        my $s = $nfuncs == 1 ? '' : 's';
        $self->{message} = "$nfuncs $self->{macro_match} function$s";
    }
    # Normal headerization and updating
    else {
        # Update all the .h files
        for my $hfile ( sort keys %sourcefiles ) {
            my $sourcefiles = $sourcefiles{$hfile};

            my $header = read_file($hfile);

            for my $cfile ( sort keys %{$sourcefiles} ) {
                my @funcs = @{ $sourcefiles->{$cfile} };
                @funcs = grep { not $_->{is_static} } @funcs;    # skip statics
                $header = $self->replace_headerized_declarations(
                    $header, $cfile, $hfile, @funcs );
            }

            write_file( $hfile, $header );
        }

        # Update all the .c files in place
        for my $cfile ( sort keys %sourcefiles_with_statics ) {
            my @funcs = @{ $sourcefiles_with_statics{$cfile} };
            @funcs = grep { $_->{is_static} } @funcs;

            my $source = read_file($cfile);
            $source = $self->replace_headerized_declarations( $source, 'static', $cfile, @funcs );

            write_file( $cfile, $source );
        }
        $self->{message} = "Headerization complete.";
    }
}

=head2 C<replace_headerized_declarations()>

=over 4

=item * Purpose


=item * Arguments

=item * Return Value

=back

=cut

sub replace_headerized_declarations {
    my $self = shift;
    my $source_code = shift;
    my $sourcefile  = shift;
    my $hfile       = shift;
    my @funcs       = @_;

    # Allow a way to not headerize statics
    if ( $source_code =~ m{/\*\s*HEADERIZER NONE:\s*$sourcefile\s*\*/} ) {
        return $source_code;
    }

    @funcs = sort {
        ( ( $b->{is_api} || 0 ) <=> ( $a->{is_api} || 0 ) )
        || ( ( lc($a->{name}) || '') cmp ( lc($b->{name}) || '') )
    } @funcs;
    my @function_decls = $self->make_function_decls(@funcs);

    my $markers_args = {
        function_decls  => \@function_decls,
        sourcefile      => $sourcefile,
        hfile           => $hfile,
        code            => $source_code,
    };

    return add_headerizer_markers( $markers_args );
}

=head2 C<make_function_decls()>

=over 4

=item * Purpose

Composes proper function declarations.

=item * Arguments

    @function_decls = $self->make_function_decls(@funcs);

List of functions.

=item * Return Value

List of function declarations.

=item * Comment

Called within C<replace_headerized_declarations()>.

=back

=cut

sub make_function_decls {
    my $self = shift;
    my @funcs = @_;

    my @decls;
    foreach my $func (@funcs) {
        my $alt_void = ' ';

        # Splint can't handle /*@alt void@*/ on pointers, although this page
        # http://www.mail-archive.com/lclint-interest@virginia.edu/msg00139.html
        # seems to say that we can.
        if ( $func->{is_ignorable} && ($func->{return_type} !~ /\*/) ) {
            $alt_void = " /*\@alt void@*/\n";
        }

        my $decl = sprintf( "%s%s%s(" => (
            $func->{return_type},
            $alt_void,
            $func->{name}
        ) );
        $decl = "static $decl" if $func->{is_static};

        my @args    = @{ $func->{args} };
        my @attrs   = $self->attrs_from_args( $func, @args );

        my @modified_args = clean_args_for_declarations($func, \@args);

        my $multiline;
        ($decl, $multiline) = handle_modified_args($decl, \@modified_args);

        my $attrs = join( "", map { "\n\t\t$_" } @attrs );
        if ($attrs) {
            $decl .= $attrs;
            $multiline = 1;
        }
        my @macros = @{ $func->{macros} };
        $multiline = 1 if @macros;

        $decl = add_newline_if_multiline($decl, $multiline);
        $decl = join( "\n", @macros, $decl );
        $decl =~ s/\t/    /g;
        push( @decls, $decl );
    }

    @decls = add_asserts_to_declarations( \@funcs, \@decls );

    return @decls;
}

=head2 C<attrs_from_args()>

=over 4

=item * Purpose

Adds to headers strings of the form C<__attribute__nonnull__(1)>.

=item * Arguments

    @attrs   = $headerizer->attrs_from_args( $func, @args );

List whose first element is a hash reference holding characteristics about a
given function, followed by list of arguments.

=item * Return Value

List.

=item * Comment

Called within C<make_function_decls()>.

=back

=cut

sub attrs_from_args {
    my $self = shift;
    my $func = shift;
    my @args = @_;

    my @attrs = ();
    my @mods  = ();

    my $name = $func->{name};
    my $file = $func->{file};
    my $n = 0;
    for my $arg (@args) {
        ++$n;
        @mods = func_modifies($arg, \@mods);
        if ( $arg =~ m{(ARGIN|ARGOUT|ARGMOD|ARGFREE_NOTNULL|NOTNULL)\(} || $arg eq 'PARROT_INTERP' ) {
            push( @attrs, "__attribute__nonnull__($n)" );
        }
        if ( ( $arg =~ m{\*} ) && ( $arg !~ /\b(SHIM|((ARGIN|ARGOUT|ARGMOD)(_NULLOK)?)|ARGFREE(_NOTNULL)?)\b/ ) ) {
            if ( $name !~ /^yy/ ) { # Don't complain about the lexer auto-generated funcs
                $self->squawk( $file, $name, qq{"$arg" isn't protected with an ARGIN, ARGOUT or ARGMOD (or a _NULLOK variant), or ARGFREE} );
            }
        }
        if ( ($arg =~ /\bconst\b/) && ($arg =~ /\*/) && ($arg !~ /\*\*/) && ($arg =~ /\b(ARG(MOD|OUT))\b/) ) {
            $self->squawk( $file, $name, qq{"$arg" is const, but that $1 conflicts with const} );
        }
    }

    return (@attrs,@mods);
}

=head2 C<print_final_message()>

=over 4

=item * Purpose

Prints a concluding message whose content reflects either normal headerization
or macro matching.

=item * Arguments

None.

=item * Return Value

Implicitly returns true value upon success.

=back

=cut

sub print_final_message {
    my $self = shift;
    if ($self->{message} ne '') {
        print "$self->{message}\n";
    }
}

=head2 C<print_headerizer_warnings()>

=over 4

=item * Purpose

Print all warnings accumulated in the course of the headerization process.

=item * Arguments

None.

=item * Return Value

Implicitly returns true value upon success.

=back

=cut

sub print_warnings {
    my $self = shift;
    my %warnings = %{$self->{warnings}};
    if ( keys %warnings ) {
        my $nwarnings     = 0;
        my $nwarningfuncs = 0;
        my $nwarningfiles = 0;
        for my $file ( sort keys %warnings ) {
            ++$nwarningfiles;
            print "$file\n";
            my $funcs = $warnings{$file};
            for my $func ( sort keys %{$funcs} ) {
                ++$nwarningfuncs;
                for my $error ( @{ $funcs->{$func} } ) {
                    print "    $func: $error\n";
                    ++$nwarnings;
                }
            }
        }

        print "$nwarnings warnings in $nwarningfuncs funcs in $nwarningfiles C files\n";
    }
}

1;

# Local Variables:
#   mode: cperl
#   cperl-indent-level: 4
#   fill-column: 100
# End:
# vim: expandtab shiftwidth=4: