The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w
#===============================================================================
#
#         FILE:  csourceparser.pl
#
#        USAGE:  ./csourceparser.pl [Option] ... File ...
#
#
#  DESCRIPTION:	 Parse and extract specified elements from source-code
#  				 written in the C language
#
#      OPTIONS:  ---
# REQUIREMENTS:  Perl Version >= 5.8.0, Parse::RecDescent, Getopt::Long, Pod::Usage
#         BUGS:  ---
#        NOTES:  ---
#       AUTHOR:  <hendrik.sirges(at)fh-swf.de>
#      COMPANY:  FH-SWF
#      VERSION:  0.1.0
#      CREATED:  07/10/05 12:34:53 CEST
#     REVISION:  ---
#===============================================================================

require 5.008000;
use strict;

use Parse::RecDescent;
use Getopt::Long;
use Pod::Usage;

# use Data::Dumper 'Dumper';

#$::RD_HINT 		= 1;					# Print hints on errors
#$::RD_ERRORS		= 1;					# Print errors
#$::RD_WARN 		= 1;
#$::RD_TRACE 		= 1;					# Print tracecode to STDERR
#$::RD_AUTOSTUB 	= 1;

###############################################################
# Grammar used to find and remove comments from C source code #
###############################################################
my $decomment_grammar = <<'END_OF_DECOMMENT';

program	: <skip:''> { @{$thisparser}{qw(code)} = () }
	  part(s)
	  {@{$thisparser}{code};}

part	: comment 		{ $thisparser->{code}		.= " "; }
        | C_code  		{ $thisparser->{code}     	.= $item[1]; }
        | string  		{ $thisparser->{code}     	.= qq("$item[1]"); }

C_code  : m{(
	      [^"/]+	# one or more non-delimiters
	      (			# then (optionally)...
	       /		# a potential comment delimiter
	       [^*/]	# which is not an actual delimiter
	      )?		#
	    )+			# all repeated once or more
	   }x

string	: m{"		# a leading delimiter
	    ((			# zero or more...
	      \\.		# escaped anything
	      |			# or
	      [^"]		# anything but a delimiter
	     )*
	    )
	    "}x
		{ $return =  $1 }


comment	: m{\s*				# optional whitespace
	    //					# comment delimiter
	    [^\n]*				# anything except a newline
	    \n					# then a newline
	    }x

	| m{\s*					# optional whitespace
	    /\*					# comment opener
	    (?:[^*]+|\*(?!/))*	# anything except */
	    \*/		      		# comment closer
        ([ \t]*)?       	# trailing blanks or tabs
	    }x
END_OF_DECOMMENT

######################################################################################
# Grammar used to parse C source code (without comments and preprocessor directives) #
# ####################################################################################
my $Cgrammar = <<'END_OF_C_GRAMMAR';

	translation_unit:
		external_declaration(s)
	|	<error>

	external_declaration:
		function_definition
	|	declaration
	|	<resync>
		{
			if ($::opt_SKIPPEDLINES || (defined $::opt_VERBOSE and $::opt_VERBOSE >= 1 ))
			{
				print "Skipping line $thisline\n"	# Try next line if possible...
			}
		 }

	function_definition:
		declaration_specifiers(?) declarator declaration_list(?) compound_statement
		{
			if($::opt_FUNCTIONS)
			{
				$::functions_output .= ::flatten_list($item[1]);
				$::functions_output .= ::flatten_list($item[2]);
				$::functions_output .= ::flatten_list($item[3]) . ";\n";
			}
		}

	declaration:
		declaration_specifiers init_declarator_list(?) ';'
		{
            if($::opt_DECLARATIONS)
            {
				$::declarations_output .= ::flatten_list($item[1]);
				$::declarations_output .= ::flatten_list($item[2]);
				$::declarations_output .= ::flatten_list($item[3]) . "\n";
            }
        }

	declaration_list:
		declaration(s)

	declaration_specifiers:
		type_qualifier 			declaration_specifiers(?)
	|	storage_class_specifier declaration_specifiers(?)
	|	type_specifier  		declaration_specifiers(?)

	storage_class_specifier:
		  'auto'
		| 'register'
		| 'static'
		| 'extern'
		| 'typedef'

	type_specifier:
		  'int'
		| 'double'
		| 'void'
		| 'char'
		| 'long'
		| 'float'
		| 'signed'
		| 'unsigned'
		| 'short'
		| struct_or_union_specifier
		| enum_specifier
		| typedef_name ...typedef_name_lookahead { [$item[1] ] }

	typedef_name_lookahead:
		declarator
#	|	pointer
#	|	',' ...parameter_type_list
#	|	')'

	type_qualifier:
		  'const'
		| 'volatile'

	struct_or_union_specifier:
		  struct_or_union IDENTIFIER(?) '{' struct_declaration_list(?) '}'
          {
		  	if($::opt_STRUCTS){
				$::structs_output .= ::flatten_list($item[1]) . " ";
				$::structs_output .= ::flatten_list($item[2]);
				$::structs_output .= ::flatten_list($item[3]) . "\n";
				$::structs_output .= ::flatten_list_beautified($item[4]);
				$::structs_output .= ::flatten_list($item[5]) . ";\n\n";
			}
          }
		| struct_or_union IDENTIFIER

	struct_or_union:
		  'struct'
		| 'union'


	struct_declaration_list:
		struct_declaration(s)

	init_declarator_list:
		init_declarator(s /(,)/)

	init_declarator:
		declarator '=' initializer
	|	declarator

	struct_declaration:
		specifier_qualifier_list struct_declarator_list ';'

	specifier_qualifier_list:
		type_specifier specifier_qualifier_list(?)
	|	type_qualifier specifier_qualifier_list(?)

	struct_declarator_list:
		struct_declarator(s /(,)/)

	struct_declarator:
		declarator(?) ':' constant_expression
	|	declarator

	enum_specifier:
		'enum' IDENTIFIER(?) '{' enumerator_list '}'
          {
		  	if($::opt_STRUCTS){
				$::structs_output .= ::flatten_list($item[1]) . " ";
				$::structs_output .= ::flatten_list($item[2]);
				$::structs_output .= ::flatten_list($item[3]) . "\n";
				$::structs_output .= ::flatten_list_beautified($item[4]);
				$::structs_output .= ::flatten_list($item[5]) . ";\n\n";
			}
          }
	|	'enum' IDENTIFIER

	enumerator_list:
		enumerator(s /(,)/)

	enumerator:
		IDENTIFIER ('=' constant_expression)(?)

	declarator:
		pointer(?) direct_declarator

	function_signature:
		'[' constant_expression(?) ']'
	|	'(' parameter_type_list ')'
	|	'(' identifier_list(?) ')'

	direct_declarator:
		IDENTIFIER function_signature(s?)
	|	'(' declarator ')' function_signature(s?)


	pointer:
	  '*' type_qualifier_list(?) pointer(?)

	type_qualifier_list:
		type_qualifier(s)

	parameter_type_list:
		parameter_list (',' '...')(?)

	parameter_list:
		parameter_declaration(s /(,)/)

	parameter_declaration:
		declaration_specifiers declarator
	|	declaration_specifiers abstract_declarator(?)

	identifier_list:
		IDENTIFIER(s /(,)/)

	initializer:
		assignment_expression
	|	'{' initializer_list (',')(?) '}'

	initializer_list:
		initializer(s /(,)/)

	type_name:
		specifier_qualifier_list abstract_declarator(?)

	abstract_declarator:
		pointer(?) direct_abstract_declarator
	|	pointer

	abstract_type:
		'[' constant_expression(?) ']'
	|	'(' parameter_type_list(?) ')'

	direct_abstract_declarator:
		'(' abstract_declarator ')' abstract_type(s?)
	|	abstract_type(s)

	typedef_name:
		IDENTIFIER

	statement:
		selection_statement
	|	expression_statement
	|	iteration_statement
	|	compound_statement
	|	jump_statement
	|	labeled_statement


	labeled_statement:
		'case' constant_expression ':' statement
	|	IDENTIFIER ':' statement
	|	'default' ':' statement

	expression_statement:
		expression(?) ';'

	compound_statement:
		'{' declaration_list(?) statement_list(?) '}'

	statement_list:
		statement(s)

	selection_statement:
		'if'      '(' expression  ')' statement ('else' statement)(?)
	|	'switch'  '(' expression  ')' statement

	iteration_statement:
		'for'   '(' expression(?) ';' expression(?) ';' expression(?) ')' statement
	|	'while' '(' expression ')' statement
	|	'do' statement 'while' '(' expression ')'

	jump_statement:
		'return' expression(?) ';'
	|	'break' ';'
	|	'continue' ';'
	|	'goto' IDENTIFIER ';'

	expression:
		assignment_expression(s /(,)/)

	assignment_expression:
		unary_expression ASSIGNMENT_OPERATOR assignment_expression
	|	conditional_expression


	conditional_expression:
		 logical_OR_expression  ('?' expression ':' conditional_expression)(?)

	constant_expression:
		conditional_expression

	logical_OR_expression:
		logical_AND_expression(s /(\|\|)/)

	logical_AND_expression:
		inclusive_OR_expression(s /(&&)/)

	inclusive_OR_expression:
		exclusive_OR_expression(s /(\|)/)

	exclusive_OR_expression:
		AND_expression(s /(\^)/)

	AND_expression:
		equality_expression(s /(&)/)

	equality_expression:
		relational_expression(s /(==|!=)/)

	relational_expression:
		shift_expression(s /(<=|>=|<|>)/)

	shift_expression:
		additive_expression(s /(<<|>>)/)

	additive_expression:
		multiplicative_expression(s /(\+|-)/)

	multiplicative_expression:
		cast_expression(s /(\*|\/|%)/)

	cast_expression:
		unary_expression
	|	'(' type_name ')' cast_expression

	unary_expression:
		postfix_expression
	|	'++'  unary_expression
	|	'--'  unary_expression
	|	'sizeof' '(' type_name ')'
	|   UNARY_OPERATOR cast_expression
	|	'sizeof'  	unary_expression


	postfix_expression:
		primary_expression postfix_expression_token(s?)


	postfix_expression_token:
    	  '[' expression ']'
		| '(' argument_expression_list(?)')'
		| '.'  IDENTIFIER
		| '->' IDENTIFIER
		| '++'
		| '--'


	primary_expression:
		IDENTIFIER
	|	constant
	|	STRING
	|	'(' expression ')'

	argument_expression_list:
		assignment_expression(s /(,)/)

	constant:
		CHARACTER_CONSTANT
	|	FLOATING_CONSTANT
	|	INTEGER_CONSTANT
	|	ENUMERATION_CONSTANT


###	TERMINALS


	INTEGER_CONSTANT:
		/(?:0[xX][\da-fA-F]+) 					# Hexadecimal
		 |(?:0[0-7]*)		  					# Octal or Zero
		 |(?:[1-9]\d*)		  					# Decimal
		 [uUlL]?			  					# Suffix
		 /x

	CHARACTER_CONSTANT:
		/'([^\\'"]							# None of these
		 |\\['\\ntvbrfa'"]  				# or a backslash followed by one of those
		 |\\[0-7]{1,3}|\\x\d+)'			# or an octal or hex constant
		/x

	FLOATING_CONSTANT:
		/(?:\d+|(?=\.\d+))					# No leading digits only if '.moreDigits' follows
		 (?:\.|(?=[eE]))						# There may be no floating point only if an exponent is present
		 \d*									# Zero or more floating digits
		 ([eE][+-]?\d+)?						# expontent
		 [lLfF]?								# Suffix
		/x

	ENUMERATION_CONSTANT:
		INTEGER_CONSTANT

	STRING:
		/"(([^\\'"])							# None of these
		|(\\[\\ntvbrfa'"])					# or a backslash followed by one of those
		|(\\[0-7]{1,3})|(\\x\d+))*"/x		# or an octal or hex

	IDENTIFIER:
		/(?!(auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto		# LOOKAHEAD FOR KEYWORDS
			|if|int|long|register|return|signed|sizeof|short|static|struct|switch|typedef			# NONE OF THE KEYWORDS
			|union|unsigned|void|volatile|while)[^a-zA-Z_])											# SHOULD FULLY MATCH!
			(([a-zA-Z]\w*)|(_\w+))/x																# Check for valid identifier

	ASSIGNMENT_OPERATOR:
		'=' | '*=' | '/=' | '%=' | '+=' | '-=' | '<<=' | '>>=' | '&=' | '^=' | '|='

	UNARY_OPERATOR:
		'&' | '*' | '+' | '-' | '~' | '!'

END_OF_C_GRAMMAR


#===  FUNCTION  ================================================================
#         NAME:  flatten_list
#  DESCRIPTION:  Extracts values from a recursive list. Double whitespaces will
#  				 be reduced
# PARAMETER  1:  Array Reference
#===============================================================================

sub flatten_list {
    ( my $tokens = join ' ', map { ref($_) ? flatten_list(@$_) : ($_) } @_ ) =~ s/\s+/ /g;
    $tokens;
}

#===  FUNCTION  ================================================================
#         NAME:  flatten_list_beautified
#  DESCRIPTION:  Like flatten_list but inserts a newline after each semicolon
# PARAMETER  1:  Array Reference
#===============================================================================

sub flatten_list_beautified {
    ( my $tokens = join ' ', map { ref($_) ? flatten_list(@$_) : ($_) } @_ ) =~ s/\s+/ /g;
    $tokens =~ s/;/;\n/g;
	$tokens =~ s/^\s*/\t/mg;
    $tokens;
}

#--------------------------------------------------------------------------#
#  Parsing variables													   #
#--------------------------------------------------------------------------#

my $decommentParser;          # Parser using decomment grammar
my $Cparser;                  # Parser using C grammar
my $C_source;                 # C source code
my $decommented_C_source;     # C source code without comments
my $preprocessed_C_source;    # C source code without preprocessor directives
    # preprocessor directives are just removed, not evalued

#--------------------------------------------------------------------------#
#  Command line options													   #
#--------------------------------------------------------------------------#

our $opt_HELP         = '';
our $opt_SKIPPEDLINES = '';
our $opt_ERRORS       = '';
our $opt_TRACE        = '';
our $opt_CODE         = '';
our $opt_VERBOSE      = 0;
our $opt_PRECOMPILE   = '';

our $opt_FUNCTIONS    = '';
our $opt_DECLARATIONS = '';
our $opt_STRUCTS	  = '';

Getopt::Long::Configure("bundling");    # Enables option bundling

GetOptions(                             # Parse command line options
    'help|h'         => \$opt_HELP,            # --help -h
    'skippedlines|s' => \$opt_SKIPPEDLINES,    # --skippedlines -s
    'errors|e'       => \$opt_ERRORS,          # --errors -e
    'trace|t'        => \$opt_TRACE,           # --trace -t
    'code|c'         => \$opt_CODE,            # --code -c
    'verbose|v+'     => \$opt_VERBOSE,         # --verbose -v
    'functions|f'    => \$opt_FUNCTIONS,       # --functions -f
    'declarations|d' => \$opt_DECLARATIONS,    # --declarations -d
    'precompile|p'   => \$opt_PRECOMPILE,      # --precompile -p
	'structs|u'		 =>	\$opt_STRUCTS		   # --structs -u
);

# variables for parser output
our $functions_output    = '';
our $declarations_output = '';
our $structs_output 	 = '';

$opt_HELP and pod2usage( -verbose => 2 );

# Set error reporting
if ($opt_ERRORS) {
    $::RD_HINT   = 1;                          # Print hints on errors
    $::RD_ERRORS = 1;                          # Print errors
    open( Parse::RecDescent::ERROR, ">errfile" )
      or die "Can't open errfile: $!";
}
# Die if no input files present
@ARGV or pod2usage( -message => "Error: More arguments required.", -verbose => 0 );

{
    local $/;
    $C_source = <>;
}

# Set trace level
if ( $opt_TRACE || ( $opt_VERBOSE >= 3 ) ) {
    $::RD_TRACE = 1;
}

# Generate precompiled parser modules
if($opt_PRECOMPILE) {
	print("\nCreating precompiled parsers...  \n");
	Parse::RecDescent->Precompile( $decomment_grammar,"CSourceParser::DecommentGrammar" );
	Parse::RecDescent->Precompile( $Cgrammar, "CSourceParser::Cgrammar" );
	print("Done\n\n");
}


#--------------------------------------------------------------------------#
#  Parse Level 1 (removes comments)										   #
#--------------------------------------------------------------------------#

if ( -e "DecommentGrammar.pm" ) {
    require DecommentGrammar;
    $decommentParser = new CSourceParser::DecommentGrammar
      or die "Malformed Decomment grammar!\n";
}
else {
    $decommentParser = new Parse::RecDescent($decomment_grammar)
      or die "Malformed Decomment grammar!\n";
}

defined( $decommented_C_source = $decommentParser->program($C_source) )
  or die "Malformed C code found at parse level 1!\n";

#--------------------------------------------------------------------------#
#  Parse Level 2 (Removes preprocessor directives)						   #
#--------------------------------------------------------------------------#

open( PREPROCESS, "<", \$decommented_C_source )    # open string as filehandle
  or die "Can't open input string for parse level 2: $!";
$preprocessed_C_source = "";
my $skip_line = 0;

foreach (<PREPROCESS>) {

    # Match preprocessor directives ...
    if (
        m{\s*																				# Optional whitespace
		\#																					# Preprocessor opener
		\s*																					# Optional whitespace
		(?:(define|include|undef|ifdef|ifndef|if|endif|else|elif|line|error|pragma)\s)	 	# Keyword followed by one or more whitespace
		.*																					# anything (optinal)
		}x || $skip_line
      )
    {
        $skip_line = /.*\\[\n]/;    # Ignore this line AND NEXT LINE ALSO
        $preprocessed_C_source .= " ";    # if this line ends with backslash
    }
    else {
        $preprocessed_C_source .= $_;     # OK, parse this line
    }
}

if ( $opt_CODE || ( $opt_VERBOSE >= 1 ) ) {  # print sourcecode with linenumbers
    my $i = 1;
    foreach ( split( /\n/, $preprocessed_C_source ) ) {
        print "$i\t$_\n";
        $i++;
    }
}

if ( $opt_VERBOSE >= 2 ) {
    $::RD_TRACE = 1;
}

#--------------------------------------------------------------------------#
#  Parse Level 3 (parses C code)										   #
#--------------------------------------------------------------------------#

$::RD_AUTOACTION =
  q {  [ @item[1..$#item] ] };    # set default auto-action for grammar rules

if ( -e "CGrammar.pm" ) {
    require CGrammar;
    $Cparser = new CSourceParser::CGrammar or die "Malformed C grammar!\n";
}
else {
    $Cparser = new Parse::RecDescent($Cgrammar) or die "Malformed C grammar!\n";
}

defined( $Cparser->translation_unit($preprocessed_C_source) )
  or die "Malformed C code found at parse level 3!\n";

print "\nDefined Functions:\n\n$functions_output\n\n"
  if defined $functions_output
  and $opt_FUNCTIONS;
print "\nDeclarations:\n\n$declarations_output\n\n"
  if defined $declarations_output
  and $opt_DECLARATIONS;
print "\nStructures:\n\n$structs_output\n\n"
  if defined $structs_output
  and $opt_STRUCTS;

__END__


#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#  Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


=head1 NAME

csourceparser.pl - extract components from sourcecode written in the C programming language


=head1 VERSION

This documentation refers to csourceparser.pl version 0.1.0


=head1 SYNOPSIS

B<./csourceparser.pl  [OPTION]  ...   FILE ...>

B<Examples:>

Print signatures of functions defined in a C-file:

./csourceparser.pl -f myprog.c

Print all declarations in a C-file:

./csourceparser.pl -d myprog.c

Print all structures in a C-file:

./csourceparser.pl -u myheader.h

=head1 REQUIRED ARGUMENTS

One or more C-Sourcefiles to parse.

=head1 OPTIONS

=over 4

=item B<-c, --code>

Show parsed source code with line numbers

=item B<-d, --declarations>

Prints (global) declarations in the sourcefile to stdout

=item B<-e, --errors>

Print error messages generated by Parse::RecDescent to the file 'errfile'

=item B<-f, --functions>

Prints the signatures of functions defined in the source file to stdout (the function bodys are ommited)

=item B<-h, --help>

Print this help

=item B<-p, --precompile>

Generate precompiled parsers Cgrammar.pm and DecommentGrammar.pm in the current working directory. Precompiled parsers will speed up parsing.
If these files are available in the current working directory they will be used automatically. Every time the --precompile
option is set the precompiled parsers are generateted newly so this option should only be used only once.
Also, don't forget to recreate the precompiled parseres if you modify the grammar.

=item B<-s, --skippedlines>

Show which lines had been skipped during parse due to parser errors or unrecognized tokens in the C source code

=item B<-t, --trace>

Print full tracecode generated by Parse::RecDescent. Note: this can be B<a lot>

=item B<-u, --structs>

Print all structs and unions defined in the C sourcefile

=item B<-v, --verbose>

Each use encreases verbosity level by one.

=over 4

=item Level 1:

Print parsed sourcecode and skipped lines (same as I<-sc>)

=item Level 2:

Print tracecode of parse level 3 (C code without preprocessor directives and comments)

=item Level 3:

Print full tracecode.

=back

=back

=head1 DIAGNOSTICS

If you don't get the output you expect try to set the -sc options to see what happens.
When the parser can't handle the input it will silently get skipped (for empty lines this is a normal behaviour).

If the parser doesn't behave as you expect take a look at the tracecode, e.g. ./csourceparser -t file.c 2> trace for full tracecode or
./csourceparser -vv file.c 2> trace to see only parser level 3 (C-parser) trace code. Depending on the size of the input file(s) this could take
some time and may occupy some hd-space.

Please refer to the Parse::RecDescent documentation if you get errors after modifying the grammar.

=head1 DEPENDENCIES

=over 4

=item *

Perl >= v5.00800

=item *

Parse::RecDescent

=item *

Getopt::Long

=item *

Pod::Usage

=head1 BUGS AND LIMITATIONS

Macros in C-Files could not be parsed at this time. Any declarations and definitions containing macros my cause errors.
Perhaps this feature could be implemented in the  future by using a "real" preprocessor like m4.

Please report problems to Hendrik Sirges  (hendrik.sirges at fh-swf.de)
Patches are welcome.

=head1 AUTHOR

 Hendrik Sirges  <hendrik.sirges[at]fh-swf.de>

=head1 LICENCE AND COPYRIGHT

This program is Copyright 2005 by Hendrik Sirges.  This program is free software; you can redistribute
it and/or modify it under the terms of the Perl Artistic License or the GNU General Public
License as published by the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
Public License for more details.

If you do not have a copy of the GNU General Public License write to the Free Software Foundation,
Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

=cut