The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package VCP::Source::revml ;

=head1 NAME

VCP::Source::revml - Outputs versioned files to a revml file

=head1 SYNOPSIS

## revml output class:

   vcp foo.revml                     [dest_spec]
   vcp foo.revml --uncompress        [dest_spec]
   vcp foo.revml --dtd <revml.dtd>   [dest_spec]
   vcp foo.revml --version <version> [dest_spec]
   vcp revml:foo.revml:/foo/bar/...  [dest_spec]

Where <source> is a filename for input; or missing or '-' for STDIN.

=head1 DESCRIPTION

This source driver allows L<vcp|vcp> to read a RevML file.

For now, all revisions are fully reconstituted in the working
directory in order to make sure that all of the patches apply cleanly.
This can require a huge amount of disk space, but it works (optimizing
this is on the TODO).

Using the --uncompress option uncompresses gzipped input.
If the input file ends in '.gz', the uncompress flag is implied.

=cut

=for DEVELOPER_USE_ONLY
To use an alternate DTD:
   vcp revml[:<source>] --dtd <dtd>

=cut

use VCP::Logger qw( pr );

BEGIN {
   ## Beginning vcpers might try running a command like "vcp" just
   ## to see what happens.  Since RevML is not required for most
   ## vcp uses and XML::Parser requires a C compiler, vcp is often
   ## distributed without XML::Parser, so this message is to help
   ## steer hapless new users to the help system.

   ## The exit(1) is to avoid untidy and scary 
   ## "compilation failed in BEGIN" messages
   pr( <<TOHERE ), exit 1 unless eval "require XML::Parser";
XML::Parser is not installed on this system and it is required to
process RevML files.  If this is not what you want to do, try

    vcp help

TOHERE
}

use strict ;

use Carp ;
use Fcntl ;
use File::Spec;
use Digest::MD5 ;
use MIME::Base64 ;
use Regexp::Shellish qw( compile_shellish );
use RevML::Doctype ;
use Symbol ;
use UNIVERSAL qw( isa ) ;
use XML::Parser ;
use Time::Local qw( timegm ) ;
use VCP::Debug ':debug' ;
use VCP::Patch ;
use VCP::Rev ;
use VCP::Utils qw( shell_quote empty );

use vars qw( $VERSION $debug ) ;

$VERSION = 0.1 ;

$debug = 0 ;

use base 'VCP::Source' ;

use fields (
   'DOCTYPE',
   'HEADER',            ## The $header is held here until the first <rev> is read
   'IN_FH',             ## The handle of the input revml file
   'WORK_NAME',         ## The name of the working file (diff or content)
   'WORK_FH',           ## The filehandle of working file
   'REV',               ## The VCP::Rev containing all of this rev's meta info
   'STACK',             ## A stack of currently open elements
   'UNDECODED_CONTENT', ## Base64 content waiting to be decoded.
   'FILESPEC_RE',       ## A perl5 re compiled from $self->repo_filespec
   'UNCOMPRESS',        ## un-compress gzipped input
) ;


#=item new
#
#Creates a new instance.  The only parameter is '-dtd', which overrides
#the default DTD found by searching for modules matching RevML::DTD:v*.pm.
#
#Attempts to open the input file if one is specified.
#
#=cut

sub new {
   my $class = shift ;
   $class = ref $class || $class ;

   my VCP::Source::revml $self = $class->SUPER::new( @_ ) ;

   my ( $spec, $args ) = @_ ;

   $self->parse_repo_spec( $spec ) ;

   $self->repo_id(
      join ":",
         "revml",
         defined $self->repo_server   ? $self->repo_server   : "",
         defined $self->repo_filespec ? $self->repo_filespec : "",
   );

   $self->parse_options(
      $args,
      'dtd|version' => sub {
         $self->{DOCTYPE} = RevML::Doctype->new( shift @$args ) ;
      },
      'uncompress'  => \$self->{UNCOMPRESS},
   );

   $self->{DOCTYPE} = RevML::Doctype->new
      unless $self->{DOCTYPE} ;

   my @errors ;

   ## This supports a brain-dead compatability mode where you can
   ## "just" say revml:filename.revml or even just filename.revml
   ## on the command line.  The parse routines will stick that
   ## in repo_filespec and not set the server.  If, however, the
   ## server is set, then the filespec is a pattern we need to use
   ## to select files.
   my $file = $self->repo_server;
   if ( empty $file ) {
      $self->repo_server( $self->repo_filespec );
      $self->repo_filespec( undef );
      $file = $self->repo_server;
   }

   $file = $self->repo_server(
      File::Spec->rel2abs( $file )
   ) unless $file eq "-";

   $file = "-" if empty $file;

   # always un-compress if filename ends in ".gz"
   my $gzip;
   if ( $^O =~ /Win32/ ) {
      $self->{UNCOMPRESS} = 1 if $file =~ /\.gz$/i ;
      $gzip = "gzip.exe";
   }
   else {
      $self->{UNCOMPRESS} = 1 if $file =~ /\.gz$/ ;
      $gzip = "gzip";
   }

   my $fs = $self->repo_filespec;
   $self->{FILESPEC_RE} = ( ! empty $fs )
      ? compile_shellish( $fs )
      : qr{^};

   if ( $file eq '-' ) {
      if( $self->{UNCOMPRESS} ) {
         open( $self->{IN_FH}, "gzip --decompress --stdout - |" )
            or die "$!: gzip --decompress --stdout - |";
      }
      else {
         $self->{IN_FH}   = \*STDIN ;
      }
      ## TODO: Check IN_FH for writability when it's set to STDIN
      ## don't you mean readability?
   }
   else {
      require Symbol ;
      $self->{IN_FH} = Symbol::gensym ;

      if( $self->{UNCOMPRESS} ) {
         my $in_name = shell_quote $file;

         open( $self->{IN_FH}, "gzip --decompress --stdout $in_name |" )
            or die "$!: gzip --decompress --stdout $in_name |";
      }
      else {
         open( $self->{IN_FH}, "<$file" ) or die "$!: $file\n";
      }
   }

   $self->{WORK_FH} = Symbol::gensym ;

   die join( '', @errors ) if @errors ;

   return $self ;
}


sub handle_header {
   my VCP::Source::revml $self = shift ;

   ( $self->{HEADER} ) = @_;

   ## Save this off until we get our first rev from the input
   $self->revs( VCP::Revs->new ) ;
   $self->parse_revml_file ;
   $self->dest->handle_header( $self->{HEADER} );
}


sub get_rev {
   my VCP::Source::revml $self = shift ;
   my VCP::Rev $r ;
   ( $r ) = @_ ;

   die "can't check out ", $r->as_string, "\n"
      unless defined $r->is_base_rev || $r->action eq "add" || $r->action eq "edit";

   return $r->work_path;
}


sub parse_revml_file {
   my VCP::Source::revml $self = shift ;

   my @stack ;
   $self->{STACK} = \@stack ;

   my $char_handler = sub {
      my $expat = shift ;
      my $pelt = $stack[-1] ; ## parent element
      my $tag = $pelt->{NAME} ;
      $pelt->{TEXT} .= $_[0] if exists $pelt->{TEXT} && defined $pelt->{TEXT};
      my $sub = "${tag}_characters" ;
      $self->$sub( @_ ) if $self->can( $sub ) ;
   } ;

   my $p = XML::Parser->new(
      Handlers => {
         Start => sub {
	    my $expat = shift ;
	    my $tag = shift ;

	    if ( $tag eq "char" ) {
	       while ( @_ ) {
	          my ( $attr, $value ) = ( shift, shift ) ;
#print STDERR $value, "=" ;
		  if ( $attr eq "code" ) {
		     if ( $value =~ s{^0x}{} ) {
			$value = chr( hex( $value ) ) ;
		     }
		     else {
			$value = chr( $value ) ;
		     }
#print STDERR ord $value, "\n" ;
		     $char_handler->( $expat, $value ) ;
		  }
	       }
	       return ;
	    }

            ## TODO: suss out "container" elements from the doctype.
	    push @stack, {
	       NAME => $tag,
	       @_,
	       ( $self->can( "${tag}_characters" )
                  || 0 <= index "revml,rev,branch,branches,", $tag . ","
               ) 
                  ? ()
                  : ( TEXT => "" ),
	    } ;

	    my $sub = "start_$tag" ;
	    $self->$sub( @_ ) if $self->can( $sub ) ;
	 },

	 End => sub {
	    my $expat = shift ;
	    my $tag = shift ;
	    return if $tag eq "char" ;

#print STDERR "</$tag>\n" ;
	    die "Unexpected </$tag>, expected </$stack[-1]>\n"
	       unless $tag eq $stack[-1]->{NAME} ;
	    my $sub = "end_$tag" ;
	    $self->$sub( @_ ) if $self->can( $sub ) ;
	    my $elt = pop @stack ;

	    if ( @stack ) {
               if (
                  exists $elt->{TEXT}
                  && defined $elt->{TEXT}
               ) {
                  ## Save all the meta fields for start_content() or start_diff()
                  if ( $tag eq 'label' ) {
                     push @{$stack[-1]->{labels}}, $elt->{TEXT} ;
                  }
                  elsif ( $stack[-1]->{NAME} eq "revml" ) {
                     die "Header field $tag after first rev\n"
                        if $self->revs->get;
                     ## ASSume none of these occur after first rev.
                     $self->{HEADER}->{$tag} = $elt->{TEXT} ;
                  }
                  else {
                     $stack[-1]->{$tag} = $elt->{TEXT} ;
                  }
               }
               else {
                  ## It's a node with child nodes.
                  delete $elt->{NAME};

                  if ( $tag eq "branch" ) {
                     require VCP::Branches;
                     require VCP::Branch;

                     $stack[-1]->{branches} ||= VCP::Branches->new;
                     $stack[-1]->{branches}->add( 
                         VCP::Branch->new( %$elt )
                     );
                  }
                  elsif ( $tag eq "branches" ) {
                     $self->{HEADER}->{branches} = $elt->{branches};
                  }
                  elsif ( $stack[-1]->{NAME} eq "revml" && $tag ne "rev" ) {
                     die "Header field $tag after first rev\n"
                        if $self->revs->get;
                     ## ASSume none of these occur after first rev.
                     $self->{HEADER}->{$tag} = $elt;
                  }
                  else {
                     $stack[-1]->{$tag} = $elt;
                  }
               }
	    }
	 },

	 Char => $char_handler,
      },
   ) ;
   $p->parse( $self->{IN_FH} ) ;
}


sub start_rev {
   my VCP::Source::revml $self = shift ;

   ## Make sure no older rev is lying around to confuse us.
   $self->{REV} = undef ;
}

## RevML is contstrained so that the diff and content tags are after all of
## the meta info for a revision.  And we really don't want to hold
## the entire content of a file in memory, in case it's large.  So we
## intercept start_content and start_diff and initialize the REV
## member as well as opening a place to catch all of the data that gets
## extracted from the file.
sub init_rev_meta {
   my VCP::Source::revml $self = shift ;
   my ( $is_placeholder ) = @_;

   my $rev_elt = $self->{STACK}->[-2] ;
   my VCP::Rev $r = VCP::Rev->new() ;
   ## All revml tag naes are lc, all internal data member names are uc
#require Data::Dumper ; print Data::Dumper::Dumper( $self->{STACK} ) ;

   for my $key ( grep /^[a-z_0-9]+$/, keys %$rev_elt ) {
      if ( $key eq 'labels' ) {
         $r->set_labels( $rev_elt->{labels} );
      }
      else {
         ## We know that all kids *in use today* of <rev> are pure PCDATA
	 ## Later, we'll need sub-attributes.
	 ## TODO: Flatten the element tree by preficing attribute names
	 ## with '@'?.
         $r->$key( $rev_elt->{$key} ) ;
      }
   }
#require Data::Dumper ; print Data::Dumper::Dumper( $r ) ;

   if ( $is_placeholder ) {
      $r->action( "placeholder" );
   }
   else {
      $r->work_path(
         $self->work_path( $r->name, $r->branch_id || "-", $r->rev_id )
      ) ;

      $self->mkpdir( $r->work_path ) ;
   }

   $self->{REV} = $r ;
   return ;
}


sub start_delete {
   my VCP::Source::revml $self = shift ;

   $self->init_rev_meta ;
   $self->{REV}->set_action( "delete" ) ;
   ## Clear the work_path so that VCP::Rev doesn't try to delete it.
   $self->{REV}->set_work_path( undef ) ;
   1;  ## prevent void context warning
}


sub start_move {
   my VCP::Source::revml $self = shift ;

   $self->init_rev_meta ;
   $self->{REV}->set_action( "move" ) ;
   ## Clear the work_path so that VCP::Rev doesn't try to delete it.
   $self->{REV}->set_work_path( undef ) ;
   die "<move> unsupported" ;
}


sub start_content {
   my VCP::Source::revml $self = shift ;

   $self->init_rev_meta ;
#require Data::Dumper ; print Data::Dumper::Dumper( $self->{REV} ) ;
   $self->{REV}->action( "edit" ) ;
   $self->{WORK_NAME} = $self->{REV}->work_path ;
   $self->{UNDECODED_CONTENT} = "" ;
   sysopen $self->{WORK_FH}, $self->{WORK_NAME}, O_WRONLY | O_CREAT | O_TRUNC
      or die "$!: $self->{WORK_NAME}" ;
   ## The binmode here is to make sure we don't convert \n to \r\n and
   ## to allow ^Z out the door (^Z is EOF on windows, and they take those
   ## things rather more seriously there than on Unix).
   binmode $self->{WORK_FH};
}


sub content_characters {
   my VCP::Source::revml $self = shift ;
   if ( $self->{STACK}->[-1]->{encoding} eq "base64" ) {
      $self->{UNDECODED_CONTENT} .= shift ;
      if ( $self->{UNDECODED_CONTENT} =~ s{(.*\n)}{} ) {
	 syswrite( $self->{WORK_FH}, decode_base64( $1 ) )
	    or die "$! writing $self->{WORK_NAME}" ;
      }
   }
   elsif ( $self->{STACK}->[-1]->{encoding} eq "none" ) {
# print STDERR map( sprintf( " %02x=$_", ord ), $_[0] =~ m/(.)/gs ), "\n" ;
      syswrite $self->{WORK_FH}, $_[0]
         or die "$! writing $self->{WORK_NAME}" ;
   }
   else {
      die "unknown encoding '$self->{STACK}->[-1]->{encoding}'\n";
   }
   return ;
}

sub end_content {
   my VCP::Source::revml $self = shift ;
   
   if ( length $self->{UNDECODED_CONTENT} ) {
      syswrite( $self->{WORK_FH}, decode_base64( $self->{UNDECODED_CONTENT} ) )
	 or die "$! writing $self->{WORK_NAME}" ;
   }
   close $self->{WORK_FH} or die "$! closing $self->{WORK_NAME}" ;
}

sub start_delta {
   my VCP::Source::revml $self = shift ;

   $self->init_rev_meta ;
   my $r = $self->{REV} ;
   $r->action( 'edit' ) ;
   $self->{WORK_NAME} = $self->work_path( $r->name, $r->branch_id || "-", 'delta' ) ;
   sysopen $self->{WORK_FH}, $self->{WORK_NAME}, O_WRONLY | O_CREAT | O_TRUNC
      or die "$!: $self->{WORK_NAME}" ;
   ## See comment in start_content :)
   binmode $self->{WORK_FH};
}


## TODO: Could keep deltas in memory if they're small.
*delta_characters = \&content_characters ;
## grumble...name used once warning...grumble
*delta_characters = \&content_characters ;

sub end_delta {
   my VCP::Source::revml $self = shift ;

   close $self->{WORK_FH} or die "$! closing $self->{WORK_NAME}" ;

#print STDERR `hexdump -cx $self->{WORK_NAME}` ;

   my VCP::Rev $r = $self->{REV} ;
   my $abs_name = "$self->{HEADER}->{rev_root}/" . $r->name;
   if ( $abs_name !~ $self->{FILESPEC_RE} ) {
      return;
   }

   my VCP::Rev $bv_r = $self->revs->get( $r->previous_id ) ;
   $bv_r = $bv_r->previous
       while ! defined $bv_r->work_path;

   die "No original content to patch for ", $r->as_string
      unless defined $bv_r;

   if ( -s $self->{WORK_NAME} ) {
      ## source fn, result fn, patch fn
      vcp_patch( $bv_r->work_path, $r->work_path, $self->{WORK_NAME} );
      unless ( $ENV{VCPNODELETE} ) {
         unlink $self->{WORK_NAME}
            or pr "$! unlinking $self->{WORK_NAME}\n" ;
      }
   }
   else {
      ## TODO: Don't assume working link()
      debug "linking ", $bv_r->work_path, ", ", $r->work_path
         if debugging ;

      link $bv_r->work_path, $r->work_path
         or die "$! linking ", $bv_r->work_path, ", ", $r->work_path
   }
}


## Convert ISO8601 UTC time to local time since the epoch
sub end_time {
   my VCP::Source::revml $self = shift ;

   my $timestr = $self->{STACK}->[-1]->{TEXT};
   ## TODO: Get parser context here & give file, line, and column. filename
   ## and rev, while we're scheduling more work for the future.
   confess "Malformed time value $timestr\n"
      unless $timestr =~ /^\d\d\d\d(\D\d\d){5}/ ;
   confess "Non-UTC time value $timestr\n" unless substr $timestr, -1 eq 'Z' ;
   my @f = split( /\D/, $timestr ) ;
   --$f[1] ; # Month of year needs to be 0..11
   $self->{STACK}->[-1]->{TEXT} = timegm( reverse @f ) ;
}

# double assign => avoid used once warning
*end_mod_time = *end_mod_time = \&end_time ;


## TODO: Verify that we should be using a Base64 encoded MD5 digest,
## according to <delta>'s attributes.  Oh, and same goes for <content>'s
## encoding.

## TODO: workaround backfilling if the destination is revml, since
## it can't put the original content in place.  We'll need to flag
## some kind of special pass-through mode for that.

sub end_digest {
   my VCP::Source::revml $self = shift ;

   $self->init_rev_meta unless defined $self->{REV} ;

   my $r = $self->{REV} ;
   my $abs_name = "$self->{HEADER}->{rev_root}/" . $r->name;
   if ( $abs_name !~ $self->{FILESPEC_RE} ) {
      return;
   }


   my $original_digest = $self->{STACK}->[-1]->{TEXT};

   if ( $r->is_base_rev ) {
      ## Don't bother checking the digest if the destination returns
      ## FALSE, meaning that a backfill is not possible with that destination.
      ## VCP::Dest::revml does this.
      if ( $self->{HEADER} ) {
         $self->dest->handle_header( $self->{HEADER} );
         $self->{HEADER} = undef;
      }

      return unless $self->dest->backfill( $r ) ;
   }

   my $work_path = $r->work_path ;

   my $d = Digest::MD5->new() ;
   sysopen F, $work_path, O_RDONLY
      or die "$! opening '$work_path' for digestion\n" ;
   ## See comment for binmode in start_content :)
   binmode F;
   $d->addfile( \*F ) ;
   close F ;
   my $reconstituted_digest = $d->b64digest ;

   ## TODO: provide an option to turn this in to a warning
   ## TODO: make this abort writing anything to the dest, but continue
   ## processing, so as to deliver as many error messages as possible.
   unless ( $original_digest eq $reconstituted_digest ) {
      my $reject_file_name = $r->name ;
      $reject_file_name =~ s{[^A-Za-z0-9 -.]+}{-}g ;
      $reject_file_name =~ s{^-+}{}g ;
      my $reject_file_path = File::Spec->catfile(
         File::Spec->tmpdir,
	 $reject_file_name
      ) ;

      link $work_path, $reject_file_path 
         or die "digest check failed for ", $r->as_string, "\n",
	 "   failed to leave copy in '$reject_file_path': $!\n" ;

      die "digest check failed for ", $r->as_string, "\n",
	 "   copy left in '$reject_file_path'\n",
         "   got      digest: $reconstituted_digest\n",
         "   expected digest: $original_digest\n";
   }
}


sub end_placeholder {
   my VCP::Source::revml $self = shift ;
   $self->init_rev_meta( "is_placeholder" );
}


## Having this and no sub rev_characters causes the parser to accumulate
## content.
sub end_rev {
   my VCP::Source::revml $self = shift ;

   my $abs_name = "$self->{HEADER}->{rev_root}/" . $self->{REV}->name;
   if ( $abs_name !~ $self->{FILESPEC_RE} ) {
      return;
   }

   $self->revs->add( $self->{REV} );

   ## This is backwards from the other sources, because RevML files are
   ## in oldest-first order, while other VCP sources deal with SCMs that
   ## report file logs in newest-first order.
   if ( defined $self->{REV}->previous_id ) {
      $self->{REV}->previous( 
         $self->revs->get( $self->{REV}->previous_id )
      ) if defined $self->{REV}->previous_id;
   }

   ## Release this rev.
   $self->{REV} = undef ;
}


=head1 AUTHOR

Barrie Slaymaker <barries@slaysys.com>

=head1 COPYRIGHT

Copyright (c) 2000, 2001, 2002 Perforce Software, Inc.
All rights reserved.

See L<VCP::License|VCP::License> (C<vcp help license>) for the terms of use.

=cut

1 ;