The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
=head1 NAME

Python::Bytecode::SAX - Process Python bytecode, generating SAX events.

=head1 SYNOPSIS

  use Python::Bytecode::SAX;
  use XML::SAX::Writer;
  my $handler = XML::SAX::Writer->new( Output => 'foo.xml' );
  my $parser = Python::Bytecode::SAX->new( Handler => $handler, SAX => 2 );
  $parser->parse_file('foo.pyc');

Or

  use Python::Bytecode::SAX;
  use XML::Handler::YAWriter;
  my $handler = XML::Handler::YAWriter->new(
      AsFile => 'foo.xml',
      Pretty  => {
          CompactAttrIndent  => 1,
          PrettyWhiteIndent  => 1,
          PrettyWhiteNewline => 1,
          CatchEmptyElement  => 1
      }
  );
  my $parser = Python::Bytecode::SAX->new( Handler => $handler, SAX => 1 );
  $parser->parse_file('foo.pyc');

=head1 DESCRIPTION

This module reads and decodes a Python bytecode file, generating SAX1 or SAX2
events (SAX1 is the default) for what it finds.

Until more documentation is written, you can examine the two XML files generated
in the F<t/> directory by C<make test> to get a feel for the overal structure and
the element and attribute names.

=head1 HISTORY

Based on the L<Python::Bytecode> module by Simon Cozens E<lt>simon@cpan.orgE<gt>,
which is based on the F<dis.py> file in the Python Standard Library.

=head1 SEE ALSO

L<Python::Bytecode> by Simon Cozens E<lt>simon@cpan.orgE<gt>.

=head1 AUTHOR

Gregor N. Purdy E<lt>gregor@focusresearch.comE<gt>

=head1 COPYRIGHT

Copyright (C) 2003 Gregor N. Purdy. All rights reserved.

=head1 LICENSE

This program is free software. Its use is subject to the same license as Perl.

=cut

package Python::Bytecode::SAX;
use 5.6.0;

our $VERSION = '0.1';

use strict;
use Data::Dumper;
use Carp;

use overload '""' => sub { my $obj = shift; 
    "<Code object ".$obj->{name}.", file ".$obj->{filename}." line ".$obj->{lineno}." at ".sprintf('0x%x>',0+$obj);
    }, "0+" => sub { $_[0] }, fallback => 1;

my $no_main_yet;


#
# new()
#

sub new
{
    my $class = shift;
    my %opts = @_;

    return bless { %opts }, $class;
}


#
# _start_doc()
#

sub _start_doc
{
    my $self = shift;

    $self->{Handler}->start_document();
}


#
# _end_doc()
#

sub _end_doc
{
    my $self = shift;

    $self->_flush;
    $self->{Handler}->end_document();
}


#
# _start_elem()
#

sub _start_elem
{
    my $self = shift;
    my $name = shift;

    $self->_flush;
    $self->{_elem}  = $name;
    $self->{_attrs} = { };
}


#
# _attr()
#

sub _attr
{
   my $self  = shift;
   my $attr  = shift;
   my $value = shift;

   $self->{_attrs}{$attr} = $value;
}


#
# _end_elem()
#

sub _end_elem
{
    my $self = shift;
    my $name = shift;

    $self->_flush;
    
    my $SAX = $self->{SAX} || 1;

    if ($SAX == 1) {
        $self->{Handler}->end_element({ Name => $name });
    }
    else {
        $self->{Handler}->end_element({ Name => $name });
    }
}


#
# _chars()
#

sub _chars
{
    my $self = shift;
    my $data = shift;

    $self->_flush;

    my $SAX = $self->{SAX} || 1;

    if ($SAX == 1) {
        $self->{Handler}->characters({ Data => $data });
    }
    else {
        $self->{Handler}->characters({ Data => $data });
    }
}


#
# _flush
#

sub _flush
{
    my $self = shift;

    return unless $self->{_elem};

    my $SAX = $self->{SAX} || 1;

    if ($SAX == 1) {
        $self->{Handler}->start_element({ Name => $self->{_elem}, Attributes => $self->{_attrs} });
    }
    else {
        my $attrs = { };
        foreach my $attr (keys %{$self->{_attrs}}) {
          $attrs->{$attr}{Name} = $attr;
          $attrs->{$attr}{Value} = $self->{_attrs}{$attr};
        }
        $self->{Handler}->start_element({ Name => $self->{_elem}, Attributes => $attrs });
    }

    $self->{_elem}  = undef;
    $self->{_attrs} = undef;
}


#
# parse_string()
#
# Open a filehandle for reading the string.
#

sub parse_string
{
  my $self = shift;
  my $string = shift;

  my $fh;
  open($fh, "<", \$string) or die;
  $self->{fh} = $fh;

  return $self->_parse;
}


#
# parse_file()
#
# Open a filehandle for reading the file.
#

sub parse_file
{
    my $self = shift;
    my $file = shift;

    my $fh;
    open($fh, "<", $file) or die;
    $self->{fh} = $fh;

    return $self->_parse;
}


#
# _parse()
#

sub _parse
{
    my $self = shift;

    my $magic   = $self->r_long();
    my $magic2  = $self->r_long(); # Second magic number
    my $version = $Python::Bytecode::versions{$magic};

    my $data    = _get_data_by_magic($magic);
    $self->_init($data);

    $self->_start_doc();

    $self->_start_elem('file');
    $self->_attr(magic   => $magic);
    $self->_attr(magic2  => $magic2);
    $self->_attr(version => $version);

    $self->r_object(emit => 1);

    $self->_end_elem('file');

    $self->_end_doc();

    return $self;
}


#
# _get_data_by_magic()
#

sub _get_data_by_magic
{
    require Python::Bytecode::v21;
    require Python::Bytecode::v22;
    my $magic = shift;
    unless (exists $Python::Bytecode::data{$magic}) {
        require Carp;
        Carp::croak("Unrecognised magic number $magic; Only know Python versions "
        . join ", ", map { "$_ ($Python::Bytecode::versions{$_})" } keys %Python::Bytecode::versions
        );
    }
    return $Python::Bytecode::data{$magic};
}


#
# r_byte()
#

sub r_byte
{ 
    my $self = shift;
    return ord getc $self->{fh};
}


#
# r_long()
#

sub r_long
{
    my $self = shift;
    my $x = $self->r_byte;
    $x |= $self->r_byte << 8;
    $x |= $self->r_byte << 16;
    $x |= $self->r_byte << 24;
    return $x;
}


#
# r_short()
#

sub r_short
{
    my $self = shift;
    my $x = $self->r_byte;
    $x |= $self->r_byte << 8;
    $x |= -($x & 0x8000);
    return $x;
}


#
# r_string()
#

sub r_string
{
    my $self = shift;
    my $length = $self->r_long; 
    my $buf; 
    if ( exists $self->{stuff}) {
        $buf = join "", splice ( @{$self->{stuff}},0,$length,() );
    } else {
        read $self->{fh}, $buf, $length; 
    }
    return $buf;
}


#
# r_object()
#

sub r_object
{
    my $self = shift;
    my %options = @_;

    confess "Don't know whether or not to emit!" unless exists $options{emit};

    my $type = chr $self->r_byte();

    if ($type eq "c") {
        my $code = $self->r_code(emit => $options{emit}, $options{index});
        return $code;
    }
    elsif ($type eq 's') {
      if ($options{emit}) {
          $self->_start_elem('literal');
          $self->_attr(type  => 'String');
          $self->_attr(value => $self->r_string);
          $self->_attr(index => $options{index}) if defined $options{index};
          $self->_end_elem('literal');
      }
      else {
          return $self->r_string();
      }
    }
    elsif ($type eq 'i') {
      if ($options{emit}) {
          $self->_start_elem('literal');
          $self->_attr(type  => 'Long');
          $self->_attr(value => $self->r_long);
          $self->_attr(index => $options{index}) if defined $options{index};
          $self->_end_elem('literal');
      }
      else {
          return $self->r_long();
      }
    }
    elsif ($type eq 'N') {
      if ($options{emit}) {
          $self->_start_elem('literal');
          $self->_attr(type  => 'None');
          $self->_attr(index => $options{index}) if defined $options{index};
          $self->_end_elem('literal');
      }
      else {
          return undef;
      }
    }
    elsif ($type eq "(") {
        $self->r_tuple(name => $options{name}, index => $options{index});
    }
    else {
        die "Oops! I didn't implement ".ord $type;
    }
}


#
# r_tuple()
#

sub r_tuple
{
    my $self = shift;
    my %options = @_;

    my $n = $self->r_long;
   
    $options{name} = 'tuple' unless $options{name};

    $self->_start_elem($options{name});
    $self->_attr(index => $options{index}) if defined $options{index};
    $self->_attr('count' => $n);
    
    for my $i (0..$n - 1) {
      $self->r_object(index => $i, emit => 1);
    }

    $self->_end_elem($options{name});
}


#
# r_code()
#

sub r_code {
    my $self = shift;

    $self->_start_elem('outer_code');
    $self->_attr('argcount'  => $self->r_short);
    $self->_attr('nlocals'   => $self->r_short);
    $self->_attr('stacksize' => $self->r_short);
    $self->_attr('flags'     => $self->r_short);

    $self->_start_elem('code');
    $self->disassemble($self->r_object(emit => 0));
    $self->_end_elem('code');

    $self->r_object(name => 'constants', emit => 1);
    $self->r_object(name => 'names',     emit => 1);
    $self->r_object(name => 'varnames',  emit => 1);
    $self->r_object(name => 'freevars',  emit => 1);
    $self->r_object(name => 'cellvars',  emit => 1);

    $self->_start_elem('filename');
    $self->r_object(emit => 1);
    $self->_end_elem('filename');

    $self->_start_elem('name');
    $self->r_object(emit => 1);
    $self->_end_elem('name');

    $self->_start_elem('lineno');
    $self->_chars($self->r_short);
    $self->_end_elem('lineno');

    $self->_start_elem('lnotab'); # Maps address to line #?
    $self->r_object(emit => 1); 
    $self->_end_elem('lnotab');

    $self->_end_elem('outer_code');
}


###############################################################################
###############################################################################
##
## DATA TABLES
##
###############################################################################
###############################################################################


$Parrot::Bytecode::DATA = <<EOF;

# This'll amuse you. It's actually lifted directly from dis.py :)
# Instruction opcodes for compiled code

def_op('STOP_CODE', 0)
def_op('POP_TOP', 1)
def_op('ROT_TWO', 2)
def_op('ROT_THREE', 3)
def_op('DUP_TOP', 4)
def_op('ROT_FOUR', 5)

def_op('UNARY_POSITIVE', 10)
def_op('UNARY_NEGATIVE', 11)
def_op('UNARY_NOT', 12)
def_op('UNARY_CONVERT', 13)

def_op('UNARY_INVERT', 15)

def_op('BINARY_POWER', 19)

def_op('BINARY_MULTIPLY', 20)
def_op('BINARY_DIVIDE', 21)
def_op('BINARY_MODULO', 22)
def_op('BINARY_ADD', 23)
def_op('BINARY_SUBTRACT', 24)
def_op('BINARY_SUBSCR', 25)

def_op('SLICE+0', 30)
def_op('SLICE+1', 31)
def_op('SLICE+2', 32)
def_op('SLICE+3', 33)

def_op('STORE_SLICE+0', 40)
def_op('STORE_SLICE+1', 41)
def_op('STORE_SLICE+2', 42)
def_op('STORE_SLICE+3', 43)

def_op('DELETE_SLICE+0', 50)
def_op('DELETE_SLICE+1', 51)
def_op('DELETE_SLICE+2', 52)
def_op('DELETE_SLICE+3', 53)

def_op('INPLACE_ADD', 55)
def_op('INPLACE_SUBTRACT', 56)
def_op('INPLACE_MULTIPLY', 57)
def_op('INPLACE_DIVIDE', 58)
def_op('INPLACE_MODULO', 59)
def_op('STORE_SUBSCR', 60)
def_op('DELETE_SUBSCR', 61)

def_op('BINARY_LSHIFT', 62)
def_op('BINARY_RSHIFT', 63)
def_op('BINARY_AND', 64)
def_op('BINARY_XOR', 65)
def_op('BINARY_OR', 66)
def_op('INPLACE_POWER', 67)

def_op('PRINT_EXPR', 70)
def_op('PRINT_ITEM', 71)
def_op('PRINT_NEWLINE', 72)
def_op('PRINT_ITEM_TO', 73)
def_op('PRINT_NEWLINE_TO', 74)
def_op('INPLACE_LSHIFT', 75)
def_op('INPLACE_RSHIFT', 76)
def_op('INPLACE_AND', 77)
def_op('INPLACE_XOR', 78)
def_op('INPLACE_OR', 79)
def_op('BREAK_LOOP', 80)

def_op('LOAD_LOCALS', 82)
def_op('RETURN_VALUE', 83)
def_op('IMPORT_STAR', 84)
def_op('EXEC_STMT', 85)

def_op('POP_BLOCK', 87)
def_op('END_FINALLY', 88)
def_op('BUILD_CLASS', 89)

HAVE_ARGUMENT = 90      # Opcodes from here have an argument:

name_op('STORE_NAME', 90)   # Index in name list
name_op('DELETE_NAME', 91)  # ""
def_op('UNPACK_SEQUENCE', 92)   # Number of tuple items

name_op('STORE_ATTR', 95)   # Index in name list
name_op('DELETE_ATTR', 96)  # ""
name_op('STORE_GLOBAL', 97) # ""
name_op('DELETE_GLOBAL', 98)    # ""
def_op('DUP_TOPX', 99)      # number of items to duplicate
def_op('LOAD_CONST', 100)   # Index in const list
hasconst.append(100)
name_op('LOAD_NAME', 101)   # Index in name list
def_op('BUILD_TUPLE', 102)  # Number of tuple items
def_op('BUILD_LIST', 103)   # Number of list items
def_op('BUILD_MAP', 104)    # Always zero for now
name_op('LOAD_ATTR', 105)   # Index in name list
def_op('COMPARE_OP', 106)   # Comparison operator
hascompare.append(106)
name_op('IMPORT_NAME', 107) # Index in name list
name_op('IMPORT_FROM', 108) # Index in name list

jrel_op('JUMP_FORWARD', 110)    # Number of bytes to skip
jrel_op('JUMP_IF_FALSE', 111)   # ""
jrel_op('JUMP_IF_TRUE', 112)    # ""
jabs_op('JUMP_ABSOLUTE', 113)   # Target byte offset from beginning of code
jrel_op('FOR_LOOP', 114)    # Number of bytes to skip

name_op('LOAD_GLOBAL', 116) # Index in name list

jrel_op('SETUP_LOOP', 120)  # Distance to target address
jrel_op('SETUP_EXCEPT', 121)    # ""
jrel_op('SETUP_FINALLY', 122)   # ""

def_op('LOAD_FAST', 124)    # Local variable number
haslocal.append(124)
def_op('STORE_FAST', 125)   # Local variable number
haslocal.append(125)
def_op('DELETE_FAST', 126)  # Local variable number
haslocal.append(126)

def_op('SET_LINENO', 127)   # Current line number
SET_LINENO = 127

def_op('RAISE_VARARGS', 130)    # Number of raise arguments (1, 2, or 3)
def_op('CALL_FUNCTION', 131)    # #args + (#kwargs << 8)
def_op('MAKE_FUNCTION', 132)    # Number of args with default values
def_op('BUILD_SLICE', 133)      # Number of items

def_op('CALL_FUNCTION_VAR', 140)     # #args + (#kwargs << 8)
def_op('CALL_FUNCTION_KW', 141)      # #args + (#kwargs << 8)
def_op('CALL_FUNCTION_VAR_KW', 142)  # #args + (#kwargs << 8)

def_op('EXTENDED_ARG', 143)
EXTENDED_ARG = 143

EOF


#
# _init()
#

# Set up op code data structures

sub _init
{
    my $self = shift;
    my $data = shift;
    my @opnames;
    my %c; # Natty constants.
    my %has;
    for (split /\n/, $data) { # This ought to come predigested, but I am lazy
        next if /^#/ or not /\S/;
        if    (/^def_op\('([^']+)', (\d+)\)/) { $opnames[$2]=$1; } 
        elsif (/^(jrel|jabs|name)_op\('([^']+)', (\d+)\)/) { $opnames[$3]=$2; $has{$1}{$3}++ } 
        elsif (/(\w+)\s*=\s*(\d+)/) { $c{$1}=$2; }
        elsif (/^has(\w+)\.append\((\d+)\)/) { $has{$1}{$2}++ }
    }
    $self->{opnames} = \@opnames;
    $self->{has} = \%has;
    $self->{c} = \%c;
}


#
# disassemble()
#

my @cmp_op   = ('<', '<=', '==', '!=', '>', '>=', 'in', 'not in', 'is', 'is not', 'exception match', 'BAD');

sub disassemble
{
    my $self = shift;
    my $code = shift;

confess "NO CODE!" unless $code;

    my @code = map { ord } split //, $code;

    my $offset = 0;
    my $extarg = 0;
    my @dis;

    while (@code) {
        my $c = shift @code;

        my $opname = $self->opname($c);

        $self->_start_elem('op');
        $self->_attr('offset' => $offset);
        $self->_attr('name'   => $opname);
        $self->_attr('code'   => $c);
        
        $offset++;
        my $arg;

        if ($c>=$self->{c}{HAVE_ARGUMENT}) {
            $arg = shift @code; 
            $arg += (256 * shift (@code)) + $extarg;
            $extarg = 0;
            $extarg = $arg * 65535 if ($c == $self->{c}{EXTENDED_ARG});
            $offset+=2;

            $self->_attr('value' => $arg);

            if ($self->{has}{const}{$c}) {
                $self->_attr('value-type'    => 'const-index');
            }
            elsif ($self->{has}{"local"}{$c}) {
                $self->_attr('value-type'    => 'local-index');
            }
            elsif ($self->{has}{name}{$c}) {
                $self->_attr('value-type'    => 'name-index');
            }
            elsif ($self->{has}{compare}{$c}) {
                $self->_attr('value-type'    => 'compare-op');
                $self->_attr('value-compare' => $cmp_op[$arg]);
            }
            elsif ($self->{has}{jrel}{$c}) {
                $self->_attr('value-type'    => 'jump-offset');
                $self->_attr('value-jump'    => 'rel');
                $self->_attr('value-target'  => $offset + $arg );
            }
            elsif ($self->{has}{jabs}{$c}) {
                $self->_attr('value-type'    => 'jump-target');
                $self->_attr('value-jump'    => 'rel');
                $self->_attr('value-target'  => $arg );
            }
            else {
                $self->_attr('value-type'    => 'literal');
            }
        }

        $self->_end_elem('op');
    }
}


#
# opname()
#

sub opname {
    $_[0]->{opnames}[$_[1]];
}


1;