lib/GraphViz2/Marpa/PathUtils.pm

package GraphViz2::Marpa::PathUtils;

use parent 'GraphViz2::Marpa';
use strict;
use warnings;
use warnings qw(FATAL utf8);    # Fatalize encoding glitches.
use open     qw(:std :utf8);    # Undeclared streams in UTF-8.

use File::Basename; # For basename().

use GraphViz2::Marpa::Renderer::Graphviz;

use Moo;

use Set::Tiny;

use Sort::Key 'nkeysort';

use Types::Standard qw/ArrayRef Bool HashRef Int Str/;

has allow_cycles =>
(
	default  => sub{return 0},
	is       => 'rw',
	isa      => Bool,
	required => 0,
);

has cluster_sets =>
(
	default  => sub{return {} },
	is       => 'rw',
	isa      => HashRef,
	required => 0,
);

has cluster_trees =>
(
	default  => sub{return {} },
	is       => 'rw',
	isa      => HashRef,
	required => 0,
);

has fixed_path_set =>
(
	default  => sub{return []},
	is       => 'rw',
	isa      => ArrayRef,
	required => 0,
);

has path_length =>
(
	default  => sub{return 0},
	is       => 'rw',
	isa      => Int,
	required => 0,
);

has report_clusters =>
(
	default  => sub{return 0},
	is       => 'rw',
	isa      => Bool,
	required => 0,
);

has report_paths =>
(
	default  => sub{return 0},
	is       => 'rw',
	isa      => Bool,
	required => 0,
);

has start_node =>
(
	default  => sub{return ''},
	is       => 'rw',
	isa      => Str,
	required => 0,
);

our $VERSION = '2.00';

# -----------------------------------------------

sub _coelesce_cluster_membership
{
	my($self, $count, $node, $clusters, $subgraph) = @_;

	# Some nodes may be heads but never tails, so they won't exist in this hash.

	return if (! defined $$clusters{$node});

	for my $member ($$clusters{$node} -> members)
	{
		next if ($$subgraph{$count} -> member($member) );

		$$subgraph{$count} -> insert($member);
		$self -> _coelesce_cluster_membership($count, $member, $clusters, $subgraph);
	}

} # End of _coelesce_cluster_membership.

# -----------------------------------------------

sub _coelesce_cluster_sets
{
	my($self, $clusters) = @_;

	# We take a copy of the keys so that we can delete hash keys
	# in arbitrary order while processing the sets which are the values.

	my(@nodes) = sort keys %$clusters;
	my($count) = 0;

	my(%subgraphs);

	for my $node (@nodes)
	{
		$count++;

		$subgraphs{$count} = Set::Tiny -> new;

		$subgraphs{$count} -> insert($node);
		$self -> _coelesce_cluster_membership($count, $node, $clusters, \%subgraphs);
	}

	return \%subgraphs;

} # End of _coelesce_cluster_sets.

# -----------------------------------------------

sub _find_cluster_containing_start_node
{
	my($self) = @_;
	my($sets) = $self -> cluster_sets;

	my(%new_trees);

	for my $id (keys %$sets)
	{
		next if (! $$sets{$id} -> member($self -> start_node) );

		$new_trees{$id} = $self -> _find_clusters_trees([$$sets{$id} -> members]);
	}

	# There should be just 1 tree containing the start node.

	my(@ids) = keys %new_trees;

	die "Error: @{[$#ids + 1]} trees containg the start node. There should be exactly 1\n" if ($#ids != 0);

	return $new_trees{$ids[0]};

} # End of _find_cluster_containing_start_node.

# -----------------------------------------------

sub _find_cluster_mothers_with_edges
{
	my($self) = @_;

	my($name, $node_id);
	my(%seen);
	my($uid);

	$self -> tree -> walk_down
	({
		callback => sub
		{
			my($node) = @_;
			$name     = $node -> name;

			# Ignore non-edges.

			return 1 if ($name ne 'edge_id'); # Keep walking.

			$node_id = $self -> decode_node($node -> mother);
			$uid     = $$node_id{uid};

			# Ignore if this mother has been seen.

			return 1 if ($seen{$uid}); # Keep walking.

			# Save mother's details.

			$seen{$uid} = $node -> mother;

			return 1; # Keep walking.
		},
		_depth => 0,
	});

	return \%seen;

} # End of _find_cluster_mothers_with_edges.

# -----------------------------------------------

sub _find_cluster_reachable_nodes
{
	my($self, $uids) = @_;

	my(%clusters);
	my(@daughters);
	my($last_node_id);
	my(%node, $name, $next_node_id);

	# This loop is outwardly the same as the one in _find_clusters_trees().

	for my $mother_uid (sort keys %$uids)
	{
		@daughters = $$uids{$mother_uid} -> daughters;

		for my $index (0 .. $#daughters)
		{
			$name = $daughters[$index] -> name;

			# Ignore non-edges, since we are checking for connectivity.

			next if ($name ne 'edge_id');

			# These offsets are only valid if the DOT file is valid.

			$last_node_id = $self -> decode_node($daughters[$index - 1]);
			$next_node_id = $self -> decode_node($daughters[$index + 1]);
			$node{tail}   =
			{
				id   => $$last_node_id{id},
				name => $$last_node_id{name},
			};
			$node{head} =
			{
				id   => $$next_node_id{id},
				name => $$next_node_id{name},
			};

			# Cases to handle (see data/path.set.01.in.gv):
			# o These edge combinations:
			# o a -> b.
			# o c -> { d e }.
			# o { f g } -> h.
			# o { i j } -> { k l }.

			if ($node{tail}{id} eq 'node_id')
			{
				if ($node{head}{id} eq 'node_id')
				{
					# Case: a -> b.
					# So now we add 'b' to the set of all nodes reachable from 'a'.
					# The point is that, if 'a' appears anywhere else in the graph, then all
					# nodes connected (via edges) to that other copy of 'a' are also connected to 'b'.
					# Likewise for 'a'.
					# The reason for doing both 'a' and 'b' is that either one may appear elsewhere
					# in the graph, but we don't know which one, if either, does.

					for my $type (qw/tail head/)
					{
						$name            = $node{$type}{name};
						$clusters{$name} ||= Set::Tiny -> new;

						$clusters{$name} -> insert($type eq 'tail' ? $node{head}{name} : $node{tail}{name});
					}
				}
				else
				{
					# Case: c -> { d e }.
					# From 'c', every node in the subgraph can be reached.
					# Start at $index + 1, which is the '{'.

					$self -> _find_cluster_reachable_subgraph_1(\%clusters, \%node, $daughters[$index + 1]);
				}
			}
			else
			{
				if ($node{head}{id} eq 'node_id')
				{
					# Case: { f g } -> h.
					# Every node in the subgraph can reach 'h'.
					# We use $index - 2 ('{') since we know $index - 1 is '}'.

					$self -> _find_cluster_reachable_subgraph_2(\%clusters, \%node, $daughters[$index - 2]);
				}
				else
				{
					# Case: { i j } -> { k l }.
					# Every node in the 1st subgraph can reach every node in the 2nd.
					# Start at $index + 1 in order to skip the '{'.
					# We use $index - 2 ('{') since we know $index - 1 is '}'.

					$self -> _find_cluster_reachable_subgraph_3(\%clusters, $daughters[$index - 2], $daughters[$index + 1]);
				}
			}
		}
	}

	return \%clusters;

} # End of _find_cluster_reachable_nodes.

# -----------------------------------------------

sub _find_cluster_reachable_subgraph_1
{
	my($self, $clusters, $node, $head_subgraph) = @_;
	my($real_tail) = $$node{tail}{name};
	my(@daughters) = $head_subgraph -> daughters;

	my($node_id);
	my($real_head);

	for my $i (0 .. $#daughters)
	{
		$node_id = $self -> decode_node($daughters[$i]);

		# Ignore non-nodes within the head subgraph.

		next if ($$node_id{id} ne 'node_id');

		# Stockpile all nodes within the head subgraph.

		$real_head             = $$node_id{name};
		$$clusters{$real_tail} ||= Set::Tiny -> new;

		$$clusters{$real_tail} -> insert($real_head);

		$$clusters{$real_head} ||= Set::Tiny -> new;

		$$clusters{$real_head} -> insert($real_tail);
	}

} # End of _find_cluster_reachable_subgraph_1.

# -----------------------------------------------

sub _find_cluster_reachable_subgraph_2
{
	my($self, $clusters, $node, $tail_subgraph) = @_;
	my($real_head) = $$node{head}{name};
	my(@daughters) = $tail_subgraph -> daughters;

	my($node_id);
	my($real_tail);

	for my $i (0 .. $#daughters)
	{
		$node_id = $self -> decode_node($daughters[$i]);

		# Ignore non-nodes within tail subgraph.

		next if ($$node_id{id} ne 'node_id');

		# Stockpile all nodes within the tail subgraph.

		$real_tail             = $$node_id{name};
		$$clusters{$real_head} ||= Set::Tiny -> new;

		$$clusters{$real_head} -> insert($real_tail);

		$$clusters{$real_tail} ||= Set::Tiny -> new;

		$$clusters{$real_tail} -> insert($real_head);
	}

} # End of _find_cluster_reachable_subgraph_2.

# -----------------------------------------------

sub _find_cluster_reachable_subgraph_3
{
	my($self, $clusters, $tail_subgraph, $head_subgraph) = @_;
	my(@tail_daughters) = $tail_subgraph -> daughters;
	my(@head_daughters) = $head_subgraph -> daughters;

	my($node_id);
	my($real_tail, $real_head);

	for my $i (0 .. $#tail_daughters)
	{
		$node_id = $self -> decode_node($tail_daughters[$i]);

		# Ignore non-nodes within tail subgraph.

		next if ($$node_id{id} ne 'node_id');

		# Stockpile all nodes within the tail subgraph.

		$real_tail = $$node_id{name};

		for my $j (0 .. $#head_daughters)
		{
			$node_id = $self -> decode_node($head_daughters[$j]);

			# Ignore non-nodes within head subgraph.

			next if ($$node_id{id} ne 'node_id');

			$real_head             = $$node_id{name};
			$$clusters{$real_head} ||= Set::Tiny -> new;

			$$clusters{$real_head} -> insert($real_tail);

			$$clusters{$real_tail} ||= Set::Tiny -> new;

			$$clusters{$real_tail} -> insert($real_head);
		}
	}

} # End of _find_cluster_reachable_subgraph_3.

# -----------------------------------------------

sub _find_cluster_standalone_nodes
{
	my($self, $subgraphs) = @_;

	# We need to find the # of subgraphs, since stand-alone clusters
	# will be numbered starting 1 after the highest set # so far.

	my(@keys)  = sort keys %$subgraphs;
	my($count) = $#keys < 0 ? 0 : $keys[$#keys];

	# Get the names of all members of all subgraphs.
	# Any nodes found by walk_down(), which are not in this collection,
	# are standalone nodes.

	my(@members);
	my(%seen);

	for my $id (keys %$subgraphs)
	{
		@members        = $$subgraphs{$id} -> members;
		@seen{@members} = (1) x @members;
	}

	my($node_id);
	my($value);

	$self -> tree -> walk_down
	({
		callback => sub
		{
			my($node) = @_;
			$node_id  = $self -> decode_node($node);

			# Ignore non-nodes and nodes which have been seen.

			return 1 if ( ($$node_id{id} ne 'node_id') || defined $seen{$$node_id{name} }); # Keep walking.

			$count++;

			$$subgraphs{$count} = Set::Tiny -> new;

			$$subgraphs{$count} -> insert($$node_id{name});

			return 1; # Keep walking.
		},
		_depth => 0,
	});

	$self -> cluster_sets($subgraphs);

} # End of _find_cluster_standalone_nodes.

# -----------------------------------------------
# Called by the user.

sub find_clusters
{
	my($self)          = @_;
	my($subgraph_sets) = $self -> _preprocess;

	# Find nodes which do not participate in paths,
	# i.e. there are no edges leading to them or from them.
	# Such nodes are stand-alone clusters.

	$self -> _find_cluster_standalone_nodes($subgraph_sets);
	$self -> report_cluster_members if ($self -> report_clusters);
	$self -> _generate_tree_per_cluster;
	$self -> output_clusters if ($self -> output_file);

	# Return 0 for success and 1 for failure.

	return 0;

} # End of find_clusters.

# -----------------------------------------------

sub _find_clusters_trees
{
	my($self, $members) = @_;

	# We use a copy of the tree so the user of this module can still get access to the whole tree
	# after each cluster cuts out the unwanted nodes (i.e. those nodes in all other clusters).
	# Further, this means the copy has all the node and edge attributes for the wanted nodes.
	#
	# We will excise all nodes which are not members, and also excise any subgraphs consisting
	# entirely of unwanted nodes. Also, we'll excise all edges involving the unwanted nodes.
	# Note: The Tree::DAG_Node docs warn against editing the tree during a walk_down(), so we
	# stockpile mothers whose daughters need to be processed after walk_down() returns.

	my($cluster) = $self -> tree -> copy_tree;

	my(%wanted);

	@wanted{@$members} = (1) x @$members;

	my($name_id);
	my(%seen);

	$cluster -> walk_down
	({
		callback => sub
		{
			my($node) = @_;
			$name_id  = $self -> decode_node($node);

			# Check for unwanted nodes.

			if ( ($$name_id{id} eq 'node_id') && ! $wanted{$$name_id{name} })
			{
				$seen{$$name_id{uid} } = $node -> mother;
			}

			return 1; # Keep walking.
		},
		_depth => 0,
	});

	# This loop is outwardly the same as the one in _find_reachable_nodes().

	my(@daughters);
	my($limit);
	my($mother);

	for my $unwanted_uid (keys %seen)
	{
		$mother    = $seen{$unwanted_uid};
		@daughters = $mother -> daughters;
		$limit     = $#daughters;

		# Cases to handle (see data/path.set.01.in.gv):
		# o Delete the node, and edges in these combinations:
		# o a -> b.
		# o c -> { d e }.
		# o { f g } -> h.
		# o { i j } -> { k l } (Impossible, given the tree node is a 'node_id').

		for my $i (0 .. $limit)
		{
			$name_id = $self -> decode_node($daughters[$i]);

			next if ($unwanted_uid != $$name_id{uid});

			$daughters[$i] -> unlink_from_mother;

			for my $offset ($i - 1, $i + 1)
			{
				next if ( ($offset < 0) || ($offset > $limit) );

				$name_id = $self -> decode_node($daughters[$offset]);

				$daughters[$offset] -> unlink_from_mother if ($$name_id{id} eq 'edge_id');
			}
		}
	}

	return $cluster;

} # End of _find_clusters_trees.

# -----------------------------------------------
# Find N candidates for the next node along the path.

sub _find_fixed_length_candidates
{
	my($self, $tree, $prolog, $solution, $stack) = @_;
	my($current_name_id) = $self -> decode_node($$solution[$#$solution]);

	# Add the node's parent, if it's not the root.
	# Then add the node's children.

	my(@daughters);
	my($index);
	my($name_id, @neighbours);

	$tree -> walk_down
	({
		callback =>
		sub
		{
			my($node) = @_;
			$name_id  = $self -> decode_node($node);

			# We only want neighbours of the current node.

			return 1 if ($$name_id{name} ne $$current_name_id{name}); # Keep walking.

			# Now find its neighbours. These are sisters separated by an edge.
			# But beware of subgraphs.

			@daughters = $node -> mother -> daughters;
			$index     = $node -> my_daughter_index;

			$self -> _find_fixed_length_edges($prolog, \@daughters, $index, \@neighbours);

			return 1; # Keep walking.
		},
		_depth => 0,
	});

	# Elements:
	# 0 .. N - 1: The neighbours.
	# N:          The count of neighbours.

	push @$stack, @neighbours, $#neighbours + 1;

} # End of _find_fixed_length_candidates.

# -----------------------------------------------

sub _find_fixed_length_edges
{
	my($self, $prolog, $daughters, $index, $neighbours) = @_;

	# TODO: [$i + 2] could be a subgraph.
	# And what if the start node is inside a subgraph?

	my($node_id);

	# $index points to the daughter holding 'a'. Handle:
	# o a -> b and a - b.
	# o But not subgraphs.

	if ( ($index + 2) <= $#$daughters)
	{
		$node_id = $self -> decode_node($$daughters[$index + 1]);

		if ($$node_id{id} eq 'edge_id')
		{
			$node_id = $self -> decode_node($$daughters[$index + 2]);

			if ($$node_id{id} eq 'node_id')
			{
				# Handle a -> b and a - b.

				push @$neighbours, $$daughters[$index + 2];
			}
		}
	}

	# $index points to the daughter holding 'a'. Handle:
	# o b - a.

	return if ($$prolog{digraph} eq 'digraph');

	if ( ($index - 2) >= 0)
	{
		$node_id = $self -> decode_node($$daughters[$index - 1]);

		if ($$node_id{id} eq 'edge_id')
		{
			$node_id = $self -> decode_node($$daughters[$index - 2]);

			if ($$node_id{id} eq 'node_id')
			{
				# Handle b - a.

				push @$neighbours, $$daughters[$index - 2];
			}
		}
	}

} # End of _find_fixed_length_edges.

# -----------------------------------------------
# Find all paths starting from any copy of the target start_node.
# See References in the docs for the book in which I found this algorithm.

sub _find_fixed_length_path_set
{
	my($self, $tree, $prolog, $start) = @_;

	my(@all_solutions);
	my($count, $candidate);
	my(@one_solution);
	my(@stack);

	# Push the first copy of the start node, and its count (1), onto the stack.

	push @stack, $$start[0], 1;

	# Process these N candidates 1-by-1.
	# The top-of-stack is a candidate count.

	while ($#stack >= 0)
	{
		while ($stack[$#stack] > 0)
		{
			($count, $candidate) = (pop @stack, pop @stack);

			push @stack, $count - 1;
			push @one_solution, $candidate;

			# Does this candidate suit the solution so far?

			if ($#one_solution == $self -> path_length)
			{
				# Yes. Save this solution.

				push @all_solutions, [@one_solution];

				# Discard this candidate, and try another.

				pop @one_solution;
			}
			else
			{
				# No. The solution is still too short.
				# Push N more candidates onto the stack.

				$self -> _find_fixed_length_candidates($tree, $prolog, \@one_solution, \@stack);
			}
		}

		# Pop the candidate count (0) off the stack.

		pop @stack;

		# Remaining candidates, if any, must be contending for the 2nd last slot.
		# So, pop off the node in the last slot, since we've finished
		# processing all candidates for that slot.
		# Then, backtrack to test the next set of candidates for what,
		# after this pop, will be the new last slot.

		pop @one_solution;
	}

	$self -> fixed_path_set([@all_solutions]);

} # End of _find_fixed_length_path_set.

# -----------------------------------------------
# Find all paths starting from any copy of the target start_node.

sub _find_fixed_length_paths
{
	my($self, $tree, $prolog) = @_;

	# Phase 1: Find all copies of the start node.

	my(@daughters);
	my($index);
	my($node_id);
	my(@start);
	my($value);

	$tree -> walk_down
	({
		callback =>
		sub
		{
			my($node) = @_;
			$node_id  = $self -> decode_node($node);

			# Skip the special tree nodes.

			return 1 if ($$node_id{id} =~ /(?:graph|prolog|root)/); # Keep walking.

			# Skip the tree nodes with names other than the start node.

			return 1 if ($$node_id{name} ne $self -> start_node); # Keep walking.

			# Skip the tree nodes which are not on a path.

			$index     = $node -> my_daughter_index;
			@daughters = $node -> mother -> daughters;

			if ($$prolog{digraph} eq 'graph')
			{
				if ( ($index > 0) && ($daughters[$index - 1] -> name eq 'edge_id') )
				{
					push @start, $node;
				}
				elsif ( ($index < $#daughters) && ($daughters[$index + 1] -> name eq 'edge_id') )
				{
					push @start, $node;
				}
			}
			else
			{
				if ( ($index < $#daughters) && ($daughters[$index + 1] -> name eq 'edge_id') )
				{
					push @start, $node;
				}
			}

			return 1; # Keep walking.
		},
		_depth => 0,
	});

	# Give up if the given node was not found.

	die 'Error: Start node (', $self -> start_node, ") not found\n" if ($#start < 0);

	# Phase 2: Process each copy of the start node.

	$self -> _find_fixed_length_path_set($tree, $prolog, \@start);

} # End of _find_fixed_length_paths.

# -----------------------------------------------
# Called by the user.

sub find_fixed_length_paths
{
	my($self) = @_;

	die "Error: No start node specified\n" if (length($self -> start_node) == 0);
	die "Error: Path length must be > 0\n" if ($self -> path_length <= 0);

	$self -> cluster_sets($self -> _preprocess);

	my($tree)   = $self -> _find_cluster_containing_start_node;
	my($prolog) = $self -> decode_tree($self -> tree);

	$self -> _find_fixed_length_paths($tree, $prolog);
	$self -> _winnow_fixed_length_paths;

	my($title) = 'Input file: ' . basename($self -> input_file) . "\\n" .
		'Starting node: ' . $self -> start_node . "\\n" .
		'Path length: ' . $self -> path_length . "\\n" .
		'Allow cycles: ' . $self -> allow_cycles . "\\n" .
		'Paths: ' . scalar @{$self -> fixed_path_set};

	$self -> report_fixed_length_paths($title) if ($self -> report_paths);
	$self -> output_fixed_length_gv($tree, $prolog, $title) if ($self -> output_file);

	# Return 0 for success and 1 for failure.

	return 0;

} # End of find_fixed_length_paths.

# -----------------------------------------------

sub _generate_tree_per_cluster
{
	my($self) = @_;
	my($sets) = $self -> cluster_sets;

	my(%new_clusters);

	for my $id (sort keys %$sets)
	{
		$self -> log(debug => "Tree for cluster $id:");

		$new_clusters{$id} = $self -> _find_clusters_trees([$$sets{$id} -> members]);

		$self -> log(debug => join("\n", @{$new_clusters{$id} -> tree2string}) );

		# TODO: This is commented out because I'm not ready to handle nested subgraphs.

		#$self -> _winnow_cluster_tree($new_clusters{$id});
		#$self -> log(debug => join("\n", @{$new_clusters{$id} -> tree2string}) );
	}

	$self -> cluster_trees(\%new_clusters);

} # End of _generate_tree_per_cluster.

# -----------------------------------------------

sub output_clusters
{
	my($self)   = @_;
	my($sets)   = $self -> cluster_trees;
	my($prefix) = $self -> output_file;

	my($file_name);
	my($renderer);

	for my $id (sort keys %$sets)
	{
		$file_name = sprintf('%s.%03i.gv', $prefix, $id);

		GraphViz2::Marpa::Renderer::Graphviz -> new
		(
			logger      => $self -> logger,
			output_file => $file_name,
			tree        => $$sets{$id},
		) -> run;

		$self -> log(info => "Wrote cluster $id to $file_name");
	}

} # End of output_clusters.

# -----------------------------------------------
# Prepare the dot input, renumbering the nodes so dot does not coalesce the path set.

sub output_fixed_length_gv
{
	my($self, $tree, $prolog, $title) = @_;
	$$prolog{strict} .= ' ' if ($$prolog{strict} eq 'strict');

	# Now output the paths, using the nodes' original names as labels.

	my(@dot_text);

	push @dot_text, "$$prolog{strict}$$prolog{digraph} fixed_length_paths", '{', qq|\tlabel = "$title" rankdir = LR|, '';

	# We have to rename all the nodes so they can all be included
	# in a single DOT file without dot linking them based on their names.

	my($new_id) = 0;

	my(@set);

	for my $set (@{$self -> fixed_path_set})
	{
		my(%node_set, @node_set);

		for my $node_id (@$set)
		{
			# Allow for paths with loops, so we don't declare the same node twice.
			# Actually, I doubt Graphviz would care, since each declaration would be identical.
			# Also, later, we sort by name (i.e. $new_id) to get the order of nodes in the path.

			if (! defined($node_set{$$node_id{name}}) )
			{
				$node_set{$$node_id{name} } = 1;
				$$node_id{label}            = $$node_id{name};
				$$node_id{name}             = ++$new_id;
			}

			push @node_set, $node_id;
		}

		push @set, [@node_set];
	}

	# Firstly, declare all nodes.

	my($s);

	for my $set (@set)
	{
		for my $node_id (@$set)
		{
			push @dot_text, qq|\t"$$node_id{name}" [label = "$$node_id{label}"]|; # We don't know the attributes of the node.
		}
	}

	push @dot_text, '';

	# Secondly, declare all edges.

	my($edge) = ($$prolog{digraph} eq 'digraph') ? ' -> ' : ' -- ';

	for my $set (@set)
	{
		push @dot_text, "\t" . join(" $edge ", map{qq|"$$_{name}"|} @$set);
	}

	push @dot_text, '}', '';

	my($output_file) = $self -> output_file;

	open(my $fh, '> :encoding(utf-8)', $output_file) || die "Can't open(> $output_file): $!";
	print $fh join("\n", @dot_text);
	close $fh;

} # End of output_fixed_length_gv.

# -----------------------------------------------

sub _preprocess
{
	my($self) = @_;

	# Parse the input and create $self -> tree.
	# But stop GraphViz2::Marpa rendering anything.

	my($output_file) = $self -> output_file;

	$self -> output_file('');
	$self -> run;
	$self -> output_file($output_file);

	# Find mothers who have edges amongst their daughters.
	# Then process the daughters of those mothers.

	my($uids)      = $self -> _find_cluster_mothers_with_edges;
	my($clusters)  = $self -> _find_cluster_reachable_nodes($uids);
	my($subgraphs) = $self -> _coelesce_cluster_sets($clusters);

	# Renumber subgraphs by discarding duplicate sets.

	my($count) = 0;

	my(@members, $members);
	my(%subgraph_sets, %seen);

	for my $id (sort keys %$subgraphs)
	{
		@members = sort $$subgraphs{$id} -> members;
		$members = join(' ', @members);

		next if ($seen{$members});

		$seen{$members} = 1;

		$count++;

		$subgraph_sets{$count} = Set::Tiny -> new;

		$subgraph_sets{$count} -> insert(@members);
	}

	return \%subgraph_sets;

} # End of _preprocess.

# -----------------------------------------------

sub report_cluster_members
{
	my($self) = @_;
	my($sets) = $self -> cluster_sets;
	my(@keys) = nkeysort{$_} keys %$sets;

	$self -> log(notice => 'Input file: ' . basename($self -> input_file) . '. Clusters: ' . scalar(@keys) . ':');

	for my $id (@keys)
	{
		$self -> log(notice => "Cluster: $id. " . $$sets{$id} -> as_string);
	}

} # End of report_cluster_members.

# -----------------------------------------------

sub report_fixed_length_paths
{
	my($self, $title) = @_;
	$title            =~ s/\\n/. /g;
	my($count)        = 0;

	$self -> log(notice => "$title:");

	for my $candidate (@{$self -> fixed_path_set})
	{
		$count++;

		$self -> log(notice => "Path: $count. " . join(' -> ', map{$$_{name} } @$candidate) );
	}

} # End of report_fixed_length_paths.

# -----------------------------------------------

sub _winnow_cluster_tree
{
	my($self, $tree) = @_;

	# Each tree will probably have had nodes deleted. This may have left
	# empty subgraphs, such as {} or subgraph {} or subgraph sub_name {}.
	# We wish to delete these from the tree. They are all characterized by
	# having no nodes in the {}, even if they have edges and attributes there.
	# Of course, there must be no nodes in nested subgraphs too.

	my(@daughters);
	my($node_id);
	my(%seen);
	my($uid);

	$tree -> walk_down
	({
		callback => sub
		{
			my($node)  = @_;
			@daughters = $node -> daughters;

			if ($self -> _winnow_subgraph(\@daughters) )
			{
				$node_id               = $self -> decode_node($node);
				$seen{$$node_id{uid} } = 1;
			}

			return 1; # Keep walking.
		},
		_depth => 0,
	});

	$self -> log(info => "Target for winnow: $_") for sort keys %seen;

} # End of _winnow_cluster_tree.

# -----------------------------------------------
# Eliminate solutions which have (unwanted) cycles.

sub _winnow_fixed_length_paths
{
	my($self)   = @_;
	my($cycles) = $self -> allow_cycles;

	my(@solutions);

	for my $candidate (@{$self -> fixed_path_set})
	{
		# Count the number of times each node appears in this candidate path.

		my(%seen);

		@$candidate = map{$self -> decode_node($_)} @$candidate;

		$seen{$$_{name} }++ for @$candidate;

		# Exclude nodes depending on the allow_cycles option:
		# o 0 - Do not allow any cycles.
		# o 1 - Allow any node to be included once or twice.

		if ($cycles == 0)
		{
			@$candidate = grep{$seen{$$_{name} } == 1} @$candidate;
		}
		elsif ($cycles == 1)
		{
			@$candidate = grep{$seen{$$_{name} } <= 2} @$candidate;
		}

		push @solutions, [@$candidate] if ($#$candidate == $self -> path_length);
	}

	$self -> fixed_path_set([@solutions]);

} # End of _winnow_fixed_length_paths.

# -----------------------------------------------

sub _winnow_subgraph
{
	my($self, $daughters) = @_;

	# A subgraph is empty if 2 daughters, { and }, have no nodes between them.

	my($brace_found) = 0;

	my($node_id_i, $node_id_j, $node_count);

	for my $i (0 .. $#$daughters)
	{
		$node_id_i = $self -> decode_node($$daughters[$i]);

		# Find an open brace, {.

		if ($$node_id_i{name} eq '{')
		{
			$brace_found = 1;
			$node_count  = 0;

			for (my $j = $i + 1; $j <= $#$daughters; $j++)
			{
				$node_id_j = $self -> decode_node($$daughters[$j]);

				# Find the first closing brace, }.

				last if ($$node_id_j{name} eq '}');

				# Count the nodes between the { and }.

				$node_count++ if ($$node_id_j{id} eq 'node_id');
			}

			# Exit when at least 1 set of 'empty' braces was found.

			last if ($node_count == 0);
		}
	}

	# Return 0 if there is no need to stockpile the uid of the mother,
	# and return 1 if a nodeless set of braces was found, since in the
	# latter case there is something in the tree worth winnowing.

	return ($brace_found && ($node_count == 0) ) ? 1 : 0;

} # End of _winnow_subgraph.

# -----------------------------------------------

1;

=pod

=head1 NAME

L<GraphViz2::Marpa::PathUtils> - Provide various analyses of Graphviz dot files

=head1 SYNOPSIS

All scripts and input and output files listed here are shipped with the distro.

=head2 Finding clusters

Either pass parameters in to new():

	GraphViz2::Marpa::PathUtils -> new
	(
	    input_file      => 'data/clusters.in.09.gv',
	    output_file     => 'out/clusters.out.09', # Actually a prefix. See FAQ.
	    report_clusters => 1,
	) -> find_clusters;

Or call methods to set parameters:

	my($parser) = GraphViz2::Marpa::PathUtils -> new;

	$parser -> input_file('data/clusters.in.09.gv');
	$parser -> output_file('out/clusters.out.09'); # Actually a prefix. See FAQ.
	$parser -> report_clusters(1);

And then:

	$parser -> find_clusters;

Command line usage:

	shell> perl scripts/find.clusters.pl -h

Or see scripts/find.clusters.sh, which hard-codes my test data values.

=head2 Finding fixed length paths

Either pass parameters in to new():

	GraphViz2::Marpa::PathUtils -> new
	(
	    allow_cycles => 0,
	    input_file   => 'data/fixed.length.paths.in.01.gv',
	    output_file  => 'out/fixed.length.paths.out.01.gv',
	    path_length  => 3,
	    report_paths => 1,
	    start_node   => 'Act_1',
	) -> find_fixed_length_paths;

Or call methods to set parameters;

	my($parser) = GraphViz2::Marpa::PathUtils -> new;

	$parser -> allow_cycles(0);
	$parser -> input_file('data/fixed.length.paths.in.01.gv');
	$parser -> output_file('out/fixed.length.paths.in.01.gv');
	$parser -> path_length(3);
	$parser -> report_paths(1);
	$parser -> start_node('Act_1');

And then:

	$parser -> find_fixed_length_paths;

Command line usage:

	shell> perl scripts/find.fixed.length.paths.pl -h

Or see scripts/fixed.length.paths.sh, which hard-codes my test data values.

=head1 DESCRIPTION

GraphViz2::Marpa::PathUtils parses L<Graphviz|http://www.graphviz.org/> dot files and processes
the output in various ways.

Features:

=over 4

=item o Find all groups of nodes, called clusters

Nodes within such groups are connected to each other, but have no links to nodes outside the
cluster.

Graphviz uses 'cluster' is a slightly-different sense. Here, you could say 'sub-tree' instead of
'cluster'.

See L</find_clusters()>.

=item o Find all fixed length paths, starting from a given node

This algorithm does not handle edges into or out of subgraphs.

See L</find_fixed_length_paths()>.

=back

Sample output: L<http://savage.net.au/Perl-modules/html/graphviz2.marpa.pathutils/index.html>.

This class is a descendent of L<GraphViz2::Marpa>, and hence inherits all its keys to new(), and
all its methods.

=head1 Scripts shipped with this distro

All scripts are in the scripts/ directory. This means they do I<not> get installed along with the
package.

Input data files are in data/, output data files are in out/, and html and svg files are in html/.

=over 4

=item o copy.config.pl

This is only for use by the author.

=item o find.clusters.pl

Try shell> perl find.clusters.pl -h

This runs the L</find_clusters()> method.

=item o find.clusters.sh

This runs find.clusters.pl with hard-coded parameters, and is what I use for testing new code.

=item o find.fixed.length.paths.pl

This runs the L</find_fixed_length_paths()> method.

Try shell> perl find.fixed.length.paths.pl -h

=item o find.fixed.length.paths.sh

This runs find.fixed.length.paths.pl with hard-coded parameters, and is what I use for testing new
code.

=item o generate.demo.pl

Uses the L<Text::Xslate> template htdocs/assets/templates/graphviz2/marpa/pathutils/pathutils.tx
to generate html/index.html.

=item o generate.demo.sh

Runs generate.demo.pl.

Then copies html/*.html and html/*.svg to my web server's dir,
$DR/Perl-modules/html/graphviz2.marpa.pathutils/.

=item o pod2html.sh

Converts all *.pm files to *.html, and copies them in my web server's dir structure.

=back

See also t/test.t.

=head1 Distributions

This module is available as a Unix-style distro (*.tgz).

See L<http://savage.net.au/Perl-modules/html/installing-a-module.html>
for help on unpacking and installing distros.

=head1 Installation

Install L<GraphViz2::Marpa::PathUtils> as you would for any C<Perl> module:

Run:

	cpanm GraphViz2::Marpa::PathUtils

or run:

	sudo cpan GraphViz2::Marpa::PathUtils

or unpack the distro, and then either:

	perl Build.PL
	./Build
	./Build test
	sudo ./Build install

or:

	perl Makefile.PL
	make (or dmake or nmake)
	make test
	make install

=head1 Constructor and Initialization

=head2 Calling new()

C<new()> is called as C<< my($obj) = GraphViz2::Marpa::PathUtils -> new(k1 => v1, k2 => v2, ...) >>.

It returns a new object of type C<GraphViz2::Marpa::PathUtils>.

This class is a descendent of L<GraphViz2::Marpa>, and hence inherits all its parameters.

In particular, see L<GraphViz2::Marpa/Constructor and Initialization> for these parameters:

=over 4

=item o description

=item o input_file

=item o logger

=item o maxlevel

=item o minlevel

=item o output_file

See the L</FAQ> for the 2 interpretations of this parameter.

=back

Further, these key-value pairs are accepted in the parameter list (see corresponding methods for
details [e.g. L</path_length($integer)>]):

=over 4

=item o allow_cycles => $integer

Specify whether or not cycles are allowed in the paths found.

Values for $integer:

=over 4

=item o 0 - Do not allow any cycles

This is the default.

=item o 1 - Allow any node to be included once or twice.

Note: allow_cycles => 1 is not implemented yet in V 2.

=back

Default: 0.

This option is only used when calling L</find_fixed_length_paths()>.

=item o path_length => $integer

Specify the length of all paths to be included in the output.

Here, length means the number of edges between nodes.

Default: 0.

This parameter is mandatory, and must be > 0.

This option is only used when calling L</find_fixed_length_paths()>.

=item o report_clusters => $Boolean

Specify whether or not to print a report of the clusters found.

Default: 0 (do not print).

This option is only used when calling L</find_clusters()>.

=item o report_paths => $Boolean

Specify whether or not to print a report of the paths found.

Default: 0 (do not print).

This option is only used when calling L</find_fixed_length_paths()>.

=item o start_node => $theNameOfANode

Specify the name of the node where all paths must start from.

Default: ''.

This parameter is mandatory.

The name can be the empty string, but must not be undef.

This option is only used when calling L</find_fixed_length_paths()>.

=back

=head1 Methods

This class is a descendent of L<GraphViz2::Marpa>, and hence inherits all its methods.

In particular, see L<GraphViz2::Marpa/Methods> for these methods:

=over 4

=item o description()

=item o input_file()

=item o logger()

=item o maxlevel()

=item o minlevel()

=item o output_file()

See the L</FAQ> for the 2 interpretations of this method.

=back

Further, these methods are implemented.

=head2 allow_cycles([$integer])

Here the [] indicate an optional parameter.

Get or set the value determining whether or not cycles are allowed in the paths found.

'allow_cycles' is a parameter to L</new()>. See L</Constructor and Initialization> for details.

Note: allow_cycles => 1 is not implemented yet in V 2.

=head2 cluster_sets()

Returns a hashref. The keys are integers and the values are sets of type L<Set::Tiny>.

Each set contains the nodes of each independent cluster within the input file. Here, independent
means there are no edges joining one cluster to another.

Each set is turned into a tree during a call to L</find_clusters()>. You can retrieve these trees
by calling L</cluster_trees()>.

Both L</find_clusters()> and L</find_fixed_length_paths()> populate this hashref, but the former
adds stand-alone nodes because they are clusters in their own right.

So why doesn't L</find_fixed_length_paths()> add stand-alone nodes? Because you can't have a path
of length 0, so stand-alone nodes cannot participate in the search for fixed length paths.

=head2 cluster_trees()

Returns a hashref. The keys are integers and the values are L<Tree::DAG_Node> trees.

This is only populated by L</find_clusters()>.

The input comes from calling L</cluster_sets()>, so the output is one tree per cluster. This means
there is 1 tree per set returned by L</cluster_sets()>.

=head2 find_clusters()

This is one of the 2 methods which do all the work, and hence must be called.
The other is L</find_fixed_length_paths()>.

The program ignores port and compass suffixes on node names. See
L<http://savage.net.au/Perl-modules/html/graphviz2.marpa.pathutils/clusters.in.10.html>
for a sample.

See the L</Synopsis> and scripts/find.clusters.pl.

Returns 0 for success and 1 for failure.

=head2 find_fixed_length_paths()

This is one of the 2 methods which do all the work, and hence must be called.
The other is L</find_clusters()>.

Notes:

=over 4

=item o Edges pointing in to or out of subgraphs

The code does not handle these cases, which I<will> be difficult to implement.

=item o Edge tails or heads using ports and compasses

Just specify the start node without the port+compass details.

=back

See the L</Synopsis> and scripts/find.fixed.length.paths.pl.

Returns 0 for success and 1 for failure.

=head2 fixed_path_set()

Returns the arrayref of paths found. Each element is 1 path, and paths are stored as an arrayref of
objects of type L<Tree::DAG_Node>.

See the source code of L</output_fixed_length_gv()>, or L</report_fixed_length_paths()>, for sample
usage.

=head2 new()

See L</Constructor and Initialization> for details on the parameters accepted by L</new()>.

=head2 output_clusters()

Output a set of files, one per cluster, if new() was called as new(output_file => '...').

=head2 output_fixed_length_gv($tree, $title)

Output a DOT file for given $tree, with the given $title, if new() was called as
new(output_file => '...').

=head2 path_length([$integer])

Here the [] indicate an optional parameter.

Get or set the length of the paths to be searched for.

'path_length' is a parameter to L</new()>. See L</Constructor and Initialization> for details.

=head2 report_cluster_members()

This prints the clusters found, if new() was called as new(report_clusters => 1).

=head2 report_clusters([$Boolean])

Here the [] indicate an optional parameter.

Get or set the option which determines whether or not the clusters found are printed.

'report_clusters' is a parameter to L</new()>. See L</Constructor and Initialization> for details.

=head2 report_fixed_length_paths()

This prints the fixed length paths found, if new() was called as new(report_paths => 1).

=head2 report_paths([$Boolean])

Here the [] indicate an optional parameter.

Get or set the option which determines whether or not the fixed length paths found are printed.

'report_paths' is a parameter to L</new()>. See L</Constructor and Initialization> for details.

=head2 start_node([$string])

Here the [] indicate an optional parameter.

Get or set the name of the node from where all paths must start.

Specify the start node without the port+compass details.

'start_node' is a parameter to L</new()>. See L</Constructor and Initialization> for details.

=head1 FAQ

=head2 How does find_fixed_length_paths() handle port+compass detail?

Just specify the start node without the port+compass details, and it will work.

Check L<GraphViz2::Marpa::PathUtils::Demo> line 82. The corresponding data is
data/fixed.length.paths.in.04.gv, out/* and html/* files.

=head2 I can't get this module to work with Path::Tiny objects

If your input and output file names are L<Path::Tiny> objects, do this:

	new(input_file => "$input_file", output_file => "$output_file", ...)

=head2 What are the 2 interpretations of C<output_file>?

This section discusses input/output DOT file naming. HTML file names follow the same system.

=over 4

=item o For clusters

Each cluster's DOT syntax is written to a different file. So C<output_file> must be the prefix
of these output files.

Hence, in scripts/find.clusters.sh, C<GV2=clusters.out.$1> (using $1 = '09', say) means output
clusters' DOT files will be called C<clusters.out.09.001.gv> up to C<clusters.out.09.007.gv>.

So, the prefix has ".$n.gv" attached as a suffix, for clusters 1 .. N.

=item o For fixed length paths

Since there is only ever 1 DOT file output here, C<output_file> is the name of that file.

Hence, in scripts/find.fixed.length.path.sh, the value supplied is
C<out/fixed.length.paths.out.$FILE.gv>, where $FILE will be '01', '02' or '03'. So the output file,
using $FILE='01', will be C<out/fixed.length.paths.out.01.gv>.

=back

=head2 I used a node label of "\N" and now your module doesn't work!

The most likely explanation is that you're calling I<find_fixed_path_lengths()> and you've specified
all nodes, or at least some, to have a label like "\N".

This escape sequence triggers special processing in AT&T's Graphviz to generate labels for nodes,
overriding the code I use to re-name nodes in the output of I<find_fixed_path_lengths()>.

See I<_prepare_fixed_length_output()> for the gory details.

The purpose of my re-numbering code is to allow a node to appear in the output multiple times but to
stop Graphviz automatically regarding all such references to be the same node. Giving a node
different names (which are un-seen) but the same label (which is seen) makes Graphviz think they are
really different nodes.

The 3 samples in part 2 of
L<the demo page|http://savage.net.au/Perl-modules/html/graphviz2.marpa.pathutils/index.html>
should make this issue clear.

=head2 What is the homepages of Graphviz and Marpa?

L<Graphviz|http://www.graphviz.org/>. L<Marpa|http://savage.net.au/Marpa.html>.

See also Jeffrey's annotated
L<blog|http://jeffreykegler.github.io/Ocean-of-Awareness-blog/metapages/annotated.html>.

=head2 How are clusters named?

They are simply counted in the order discovered in the input file.

=head2 How are cycles in fixed path length analysis handled?

Note: allow_cycles => 1 is not implemented yet in V 2.

This is controlled by the I<allow_cycles> option to new(), or the corresponding method
L</allow_cycles($integer)>.

The code keeps track of the number of times each node is entered. If new(allow_cycles => 0) was
called, nodes are only considered if they are entered once. If new(allow_cycles => 1) was called,
nodes are also considered if they are entered a second time.

Sample code: Using the input file data/fixed.length.paths.in.01.gv we can specify various
combinations of parameters like this:

	allow_cycles  path_length  start node  solutions
	0             3            Act_1       4
	1             3            Act_1       ?

	0             4            Act_1       5
	1             4            Act_1       ?

=head2 Are all (fixed length) paths found unique?

Yes, as long as they are unique in the input. Something like this produces 8 identical solutions
(starting from A, of path length 3) because each node B, C, D, can be entered in 2 different ways,
and 2**3 = 8.

	digraph G
	{
	    A -> B -> C -> D;
	    A -> B -> C -> D;
	}

See data/fixed.length.paths.in.03.gv and html/fixed.length.paths.out.03.svg.

=head2 The number of options is confusing

Agreed. Remember that this code calls L<GraphViz2::Marpa>'s run() method, which expects a large
number of options.

=head2 Why do I get error messages like the following?

	Error: <stdin>:1: syntax error near line 1
	context: digraph >>>  Graph <<<  {

Graphviz reserves some words as keywords, meaning they can't be used as an ID, e.g. for the name of
the graph.

The keywords are: I<digraph>, I<edge>, I<graph>, I<node>, I<strict> and I<subgraph>.
Compass points are not keywords.

See L<keywords|http://www.graphviz.org/content/dot-language> in the discussion of the syntax of DOT
for details.

So, don't do this:

	strict graph graph{...}
	strict graph Graph{...}
	strict graph strict{...}
	etc...

Likewise for non-strict graphs, and digraphs. You can however add double-quotes around such reserved
words:

	strict graph "graph"{...}

Even better, use a more meaningful name for your graph...

=head1 TODO

Problems:

=over 4

=item o Fixed length paths and subgraphs

If the edge is pointing to a subgraph, all nodes in that subgraph are heads of the edge.

Likewise, if the start node is inside a subgraph, the edge can be outside the subgraph.

=item o Re-implement allow_cycles()

This has been ignored in the re-write for V 2.

=back

Search the source for 'TODO' and 'allow_cycles' to find where patches must be made.

=head1 References

Combinatorial Algorithms for Computers and Calculators, A Nijenhuis and H Wilf, p 240.

This books very clearly explains the backtracking parser I used to process the combinations of nodes
found at each point along each path. Source code in the book is in Fortran.

The book is available as a PDF from L<http://www.math.upenn.edu/~wilf/website/CombAlgDownld.html>.

=head1 Version Numbers

Version numbers < 1.00 represent development versions. From 1.00 up, they are production versions.

=head1 Machine-Readable Change Log

The file CHANGES was converted into Changelog.ini by L<Module::Metadata::Changes>.

=head1 Repository

L<https://github.com/ronsavage/GraphViz2-Marpa-PathUtils>

=head1 Support

Email the author, or log a bug on RT:

L<https://rt.cpan.org/Public/Dist/Display.html?Name=GraphViz2::Marpa::PathUtils>.

=head1 Author

L<GraphViz2::Marpa::PathUtils> was written by Ron Savage I<E<lt>ron@savage.net.auE<gt>> in 2012.

Home page: L<http://savage.net.au/index.html>.

=head1 Copyright

Australian copyright (c) 2012, Ron Savage.

	All Programs of mine are 'OSI Certified Open Source Software';
	you can redistribute them and/or modify them under the terms of
	The Artistic License, a copy of which is available at:
	http://www.opensource.org/licenses/index.html

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)