The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl

use warnings;
use strict;
use lib qw(./lib);

use YAML;
use SVN::Analysis;

use Git::Repository;
use Getopt::Long;

my $last_log_time = $^T;

my ($map_file_name, $db_file_name, $git_base, $new_base, $help_me, $verbose);
my $getopt_okay = GetOptions(
	'map=s',    \$map_file_name,
	'db=s',     \$db_file_name,
	'from=s',   \$git_base,
	'into=s',   \$new_base,
	'help',     \$help_me,
	'verbose',  \$verbose,
);

if ($help_me or !$getopt_okay) {
	die(
		"$0 usage:\n",
		"  --map=FILENAME           location of project map file\n",
		"  --db=FILENAME            location of snanalyze sqlite3 database\n",
		"  --from=GIT_BASE_DIR      location of monolithic git checkout\n",
		"  --into=NEW_PARENT_DIR    directory to be created for projects\n",
		"  --verbose                explain what's going on\n",
		"  --help                   you're soaking in it.\n",
	);
}

die "$0: Destination directory $new_base shouldn't pre-exist.\n" if (
	-e $new_base
);

die "$0: Source git directory $git_base must pre-exist.\n" unless (
	-e $git_base
);

my $analysis = SVN::Analysis->new(
	db_file_name  => $db_file_name,
	verbose       => $verbose,
);

# Acquire the projects.
my $projects = YAML::LoadFile($map_file_name);

unless (scalar keys %$projects) {
	print "No projects in $map_file_name.  I guess we're done.\n";
}

# Set up the target projects.

mkdir $new_base or die "mkdir $new_base failed: $!";

my @project_names = sort keys %$projects;

foreach (@project_names) {
	my $clone_path = "$new_base/$_";

	chdir $git_base or die $!;
	mkdir $clone_path or die $!;

	_log("Cloning $git_base into $clone_path.");

	my $project = $projects->{$_};

	system("pax", "-rw", "-pe", "-X", ".", $clone_path) and die $!;

	$project->{git} = Git::Repository->new(work_tree => $clone_path) or die $!;

	# http://lists.zerezo.com/git/msg654051.html
	# "in some cases git-filter-branch refuses to run unless git-status
	# is invoked first".  This seems to be one of those times.
	$project->{git}->run("status");

	$project->{git_base_dir} = $clone_path;
}

# Find the last outside contributions to each project.
# O(N^2) unfortunately.

my %copy_into;

foreach my $src_name (@project_names) {
	foreach my $dst_name (@project_names) {
		next if $src_name eq $dst_name;

		my $copy = $analysis->get_last_copy_into_tree(
			$projects->{$src_name}{svndir},
			$projects->{$dst_name}{svndir},
		);

		# There was no copy
		next unless $copy;

		$copy_into{$dst_name}{$src_name} = $copy;

		# Tag the source so we can filter-branch by tag.
		# Tags are revised during filter-branch, but revs just break.

		$projects->{$dst_name}{git}->run(
			'tag', 'svn-r-' . $copy->src_rev(),
			$analysis->get_other_rev_from_svn($copy->src_rev())
		);
	}
}

# Repositories that don't exist in %copy_into are stand-alone.
# They have no outside contributions.
# Everything else can be removed from their repositories.

foreach my $this_name (@project_names) {
	next if exists $copy_into{$this_name};

	_log("Isolating stand-alone project $this_name...");

	my $this_project = $projects->{$this_name};
	$this_project->{git}->run(
		'filter-branch',
		'--subdirectory-filter', $this_project->{gitdir},
		'--tag-name-filter', 'cat',
		'--prune-empty',
		'--',
		'--all',
	);

	fix_tags($this_project);
	purge_project($this_project);
}

# Repositories that do have contributions are a bit more complex.
# Contributing projects may be removed after the last contribution.
# Projects that don't contribute may be removed entirely.

foreach my $this_name (@project_names) {
	next unless exists $copy_into{$this_name};
	my $inbound = $copy_into{$this_name};

	my $this_project = $projects->{$this_name};

	foreach my $other_name (@project_names) {
		next if $this_name eq $other_name;

		my $other_project = $projects->{$other_name};

		my $prune_range;
		if (exists $inbound->{$other_name}) {
			$prune_range = 'svn-r-' . $inbound->{$other_name}->src_rev() . '..';

			_log("Truncating $other_name ($prune_range) from $this_name...");
		}
		else {
			_log("Removing $other_name entirely from $this_name...");
			$prune_range = '--all';

			# TODO - Optimization.  The removed subproject probably touches
			# a smaller rev range than --all.  We can either list that range
			# or maybe derive the revision subset from git itself.
		}

		$this_project->{git}->run(
			'filter-branch',
			'--tag-name-filter', 'cat',
			'--index-filter',
			"git rm -r --cached --ignore-unmatch $other_project->{gitdir}",
			'--prune-empty',
			'--',
			$prune_range
		);

		purge_project($this_project);
	}

	fix_tags($this_project);
	purge_project($this_project);
}

exit;

#################

sub fix_tags {
	my $project = shift;

	_log("Fixing tags...");

	my $git = $project->{git};

	# Build the repository graph.  So far this is easier than using git
	# tools to find the parents and children on demand.

	my (%parents, %children);
	foreach my $rev ($git->run('rev-list', '--parents', '--all')) {
		my ($self, @parents) = ($rev =~ m/(\S+)/g);

		push @{$children{$_}}, $self foreach @parents;
		$parents{$self} = \@parents;
	}

	# Check each branch.

	BRANCH: foreach my $branch ($git->run('branch')) {
		$branch =~ s/\s*\*\s+//;
		$branch =~ s/^\s+//;
		$branch =~ s/\s+$//;

		# Get parents.  We do the lookup via rev-list so we can get the
		# branch's SHA1 as well.

		my $parents = $git->run('rev-list', '--no-walk', '--parents', $branch);
		my @parents = ($parents =~ m/(\S+)/g);

		my $this = shift(@parents);

		# We only care if there is one parent.
		# We don't want to mangle merges.

		next BRANCH unless @parents == 1;

		# Single revisions off HEAD are interesting...
		my @branch_revs = $git->run('rev-list', $branch, '^HEAD');
		if (@branch_revs == 1 and $branch_revs[0] eq $this) {

			# ... but they're only interesting if they only delete things.
			OP: foreach my $op (
				$git->run("log", "--pretty=%b", "--name-status", "--no-walk", $branch)
			) {
				next OP     if      $op =~ /^\s*$/;
				next BRANCH unless  $op =~ /^D\s/;
			}

			# Tag the parent commit with the branch commit's info.
			convert_branch_to_tag($git, $branch, $parents[0]);
			next BRANCH;
		}

		# Get children.
		my $children = $children{$this};

		# We only care if there is one child.
		next unless $children and @$children == 1;

		# If the branch is exactly between its parent and child, and there's
		# no actual branching going on, then convert it to a tag.

		my $parent = $parents[0];
		my $child  = $children->[0];

		my $parent_children = $children{$parent};
		next BRANCH unless (
			$parent_children and
			@$parent_children == 1 and
			$parent_children->[0] eq $this
		);

		my $child_parents = $parents{$child};
		next BRANCH unless (
			$child_parents and
			@$child_parents == 1 and
			$child_parents->[0] eq $this
		);

		convert_branch_to_tag($git, $branch, $this);
	}
}

# Convert a branch to an annotated tag.
# The tag's header information will be identical to the branch's.
# The branch will be deleted after tagging.

sub convert_branch_to_tag {
	my ($git, $branch, $rev_to_tag) = @_;

	# Temporarily remove the tag, if it exists.
	$git->run("tag", "-d", $branch);

	# git cat-file parsing brought to you by
	# http://github.com/book/git-gadgets/blob/master/git-copytag
	my %info;

	# get tag content
	@info{qw( header body )} = split /\n\n/, $git->run(
		'cat-file', 'commit', $branch
	), 2;

	# parse header
	%info = ( %info, map { ( split / /, $_, 2 ) } split /\n/, $info{header} );
	$info{author} =~ /^(.*?)\s+<([^>]*)>\s+(.*?)\s*$/;
	@info{qw( author_name author_email author_date )} = ( $1, $2, $3 );
	$info{committer} =~ /^(.*?)\s+<([^>]*)>\s+(.*?)\s*$/;
	@info{qw( committer_name committer_email committer_date )} = (
		$1, $2, $3
	);

	my %env;
	if (exists $info{committer}) {
		$env{GIT_COMMITTER_EMAIL} = $info{author_email};
		$env{GIT_COMMITTER_NAME}  = $info{author_name};
		$env{GIT_COMMITTER_DATE}  = $info{author_date};
	}

	if (exists $info{author}) {
		$env{GIT_AUTHOR_EMAIL}    = $info{author_email};
		$env{GIT_AUTHOR_NAME}     = $info{author_name};
		$env{GIT_AUTHOR_DATE}     = $info{author_date};
	}

	my $annotation = $info{body};
	$annotation =~ s/'/\\'/g;

	$git->run(
		"tag", "-a",
		"-m", $annotation,
		$branch, $rev_to_tag,
		{ env => \%env },

	);

	$git->run("branch", "-D", $branch);
}

sub purge_project {
	my $project = shift;

	_log("Purging backups from $project->{git_base_dir}...");

	system(
		"/bin/rm", "-r", "-f",
		"$project->{git_base_dir}/.git/refs/original",
		"$project->{git_base_dir}/.git/refs/logs/",
	) and die $!;

	_log("Expiring reflog entries from $project->{git_base_dir}...");
	$project->{git}->run("reflog", "expire", "--all");

	_log("Garbage collecting $project->{git_base_dir}...");
	$project->{git}->run("gc", "--aggressive", "--prune");

	_log("Checking $project->{git_base_dir}...");
	$project->{git}->run("fsck", "--full", "--strict");
}

sub _log {
	return unless $verbose;

	my $message = join "", @_;
	my $now = time();

	printf "%-6d %-6d %s\n", ($now - $^T), ($now - $last_log_time), $message;
	$last_log_time = $now;
}

__END__

=head1 NAME

snerp-projector - break a Git repository into projects

=head1 SYNOPSIS

	snerp-projector \
		--map paths-to-projects.txt \
		--db snanalyze-index.sqlite3 \
		--from monolithic-git-working-dir \
		--into new-parent-dir

=head1 DESCRIPTION

snerp-projector identifies and breaks up a monolithic Git repository
into separate, smaller repositories for each subproject.

The original monolithic git repository (--from) is cloned for each
anticipated subrpoject (listed in --map).  The subproject clones are
placed into a new parent directory (--into).

Subprojects are described in a YAML file, which contains a hash of
hashes:

	project:
		gitdir: project/dir
		svndir: trunk/project/dir

For example:

	poe-loop-gtk:
		gitdir: polo-gtk
		svndir: trunk/polo-gtk

Needing both gitdir and svndir isn't ideal, but there hasn't been a
better idea yet.

=head2 Repository Requirements

snerp-projector makes some assumptions about the repository.  These
may change in the future, faster if someone can contribute patches.

=over

=item * Each subproject lives entirely under a single directory.

=item * The complete history for each subproject is worth preserving.

=back

=head2 Subproject Types

Subprojects vary in structure.  Each structure has its own
requirements, which are outlined here.

=head3 Notes Applicable to All Project Types

In all cases, the original repository is cloned once for each
subproject listed in the --map file.

The extraction of each subproject will modify most or all of the
cloned repositories.  Unrelated commits will be removed from each
subproject's repository.  As much of the subproject as possible will
be removed from all other repositories.

=head3 Isolated Projects

Some projects are created as new directories and contain all new
files.  No other project contributes to them.  No files are copied in
from elsewhere in the repository.

These projects are easiest to split into their own repositories:

	git filter-branch \
		-f \
		--subdirectory-filter project/path \
		--tag-name-filter cat \
		--prune-empty \
		-- \
		--all

=head3 Spin-Off Projects

Spin-off projects include files copied or moved in from other
projects.  These pre-project files and commits contribute to the
spin-off and its history, so they must be preserved.  However, they
may be discarded after the final contribution to the spin-off project.

Projects that don't contribute to spin-offs are removed entirely:

	git filter-branch \
		--index-filter 'git rm -r --cached --ignore-unmatch source/path' \
		master

Projects that do contribute are discontinued after the final
contribution.  Everything afterwards is removed.  For example, if
poe's last contribution to poe-extras was change df174613:

	git filter-branch \
		--index-filter 'git rm -r --cached --ignore-unmatch extras' \
		df174613..

That's a literal "..".

=head1 SEE ALSO

L<App::SnerpVortex> - Main documentation for Snerp Vortex.

snanalyze - Analyze a Subversion dump, and produce an index database
for other tools to process.

mnerp - Convert a Subversion repository to a flat filesystem or Git.
Uses the snanalyze index, with help from the snassign tools, to
intelligently branch and tag as it goes.

=head1 AUTHORS AND LICENSE

Snerp Vortex is Copyright 2010 by Rocco Caputo and contributors.

It is released under the same terms as Perl itself.

=cut