The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Fuse::TagLayer;

# use strict;
use warnings;

use Data::Dumper;
use DBI;
use File::ExtAttr ();
use File::Find ();
use File::Basename ();
use Fcntl qw(SEEK_SET);
use POSIX qw(S_ISDIR ENOENT EISDIR EINVAL ENOSYS);

our $VERSION = '0.10';
our $self;

sub new {
	my $class = shift;

	$self = bless({
		@_
	}, $class);

	$self->{uid} ||= 0;
	$self->{gid} ||= 0;

	print "TagLayer: Building TagLayer tags database...\n" if $self->{debug};
	init_db();	## init db (SQLite)

	## prepare a mysql insert statement
	$self->{mysql_file_insert} = database()->prepare("INSERT INTO `file_tags` (`file`,`basename`,`tags`) VALUES (?,?,?); ") or die database()->errstr;
	$self->{mysql_tags_insert} = database()->prepare("INSERT INTO `tags` (`tag`,`count`) VALUES (?,?); ") or die database()->errstr;

	## prepare a mountpoint regex
	my $mntre = quotemeta($self->{mountpoint});
	$self->{mountpoint_regex} = qr/^$mntre/;

	## build SQL tables
	# table:file_tags
	database()->{AutoCommit} = 0;
	File::Find::find({ wanted => \&wanted }, $self->{realdir});
	database()->commit;

	# table:tags
	for (keys %{ $self->{global_tags} }){
#		print "$_, ". $self->{global_tags}->{$_} ." \n" if $self->{debug};
		$self->{mysql_tags_insert}->execute($_, $self->{global_tags}->{$_}) or die database()->errstr;

		$self->{tags_cnt}++;
		database()->commit if $self->{tags_cnt} && ($self->{tags_cnt} % 250) == 0;
	}
	delete($self->{global_tags});
	database()->commit;
	database()->{AutoCommit} = 1;

	$self->{db_epoch} = time();

	print "TagLayer: processed ".($self->{files_cnt}||0)." files with ".($self->{tags_cnt}||0)." tags.\n" if $self->{debug};

	return $self;
}

sub init_db {
	my $sql = "create table if not exists `tags` (
		`tag` varchar(255) null,
		`count` integer DEFAULT '0'
	);";
	database()->do($sql) or die database()->errstr;
	my $empty = database()->prepare("DELETE FROM `tags`; ") or die database()->errstr;	# no TRUNCATE TABLE in SQLite
	$empty->execute();

	$sql = "create table if not exists `file_tags` (
		`file` varchar(255) null,
		`basename` varchar(255) null,
		`tags` varchar(255) null
	);";
	database()->do($sql) or die database()->errstr;
	my $empty2 = database()->prepare("DELETE FROM `file_tags`; ") or die database()->errstr;	# no TRUNCATE TABLE in SQLite
	$empty2->execute();
}

sub database {
	unless($self->{dbh}){
		$self->{dbh} = DBI->connect("dbi:SQLite::memory:", "", "");
	#	$self->{dbh} = DBI->connect("dbi:SQLite:dbname=/tmp/taglayer.sqlite", "", "");	# for debug only, make sure you delete the file after each mount!!
	}
	return $self->{dbh};
}

sub mount {
	my $self = shift;

	print '## TagLayer: mount() self:'.Dumper($self) if $self->{debug};

	## check local mount point
	if(!-d $self->{mountpoint}){
		die 'Fuse::TagLayer: Mountpoint '.$self->{mountpoint}.' does not exists!';
	}

	Fuse::main(
		mountpoint => $self->{mountpoint},
		threaded   => $self->{threaded} ? 1 : 0,
		debug	   => $self->{debug} > 1 ? 1 : 0,

		readdir	=> "Fuse::TagLayer::virt_readdir",
		getattr	=> "Fuse::TagLayer::virt_getattr",
		open	=> "Fuse::TagLayer::real_open",
		read	=> "Fuse::TagLayer::real_read",
		release	=> "Fuse::TagLayer::real_release",
		statfs	=> "Fuse::TagLayer::virt_statfs",
	);
	return;
}

sub dirpath_to_tags {
	## explode path into tags
	# if path comes from wanted, it returns unclean "tags",
	# if path comes from our paths, tags should already be cleaned

	my @pathtags = split(/\//,shift);
	shift(@pathtags); # root path means no dirtags

	return @pathtags;
}

sub wanted {
	## if our mountpoint is within the realdir, ignore ourself
	return if $File::Find::dir =~ $self->{mountpoint_regex};

	## only dirs with files qualify
	return if !-f $File::Find::name;

	my $realdir = $self->{realdir};
	my $rel_dir = $File::Find::dir;
	$rel_dir =~ s/^$realdir//;

	my @tags;
	## dir tags
	@tags = dirpath_to_tags($rel_dir) unless $self->{no_tags_from_path};

	if($self->{more_tags}){
		my $filename = lc($_);
		$filename =~ s/(\.[a-zA-Z0-9]{2,5})$//;
		if(my $suffix = $1){
			$suffix =~ s/jpeg/jpg/;
			push(@tags, 'zsuffix-'.$suffix);
		}

		push(@tags, split(/[^\p{L}\p{N}]/,$filename));	# matches all (Unicode) characters that are neither letters nor numbers
	}

	## xattr tags
	if(!$self->{no_tags_from_xattr}){
		if(my $xattrtags = File::ExtAttr::getfattr( $File::Find::dir.'/'.$_, 'tags') ){
			push(@tags, split(/,\s*/,$xattrtags));
		}
	}

	# clean and dedup, as there might be duplicates after cleansing
	my %tags;
	for(@tags){
		my $tag_cleaned = lc($_);
		$tag_cleaned =~ s/[^\p{L}\p{N}]//g;	# matches all (Unicode) characters that are neither letters nor numbers
		next if length($tag_cleaned) < 2;
		$tags{$tag_cleaned}++;

		$self->{global_tags}->{$tag_cleaned}++;
	}

	# insert "/path/to", "filename", "tags as csv string"
	$self->{mysql_file_insert}->execute( $File::Find::dir, $_, join(", ", keys %tags) ) or die database()->errstr;
	$self->{files_cnt}++;
#	print "File: $self->{files_cnt}: $File::Find::dir, $_, ".join(", ", keys %tags)."\n";

	if($self->{files_cnt} && ($self->{files_cnt} % 250) == 0){
		database()->commit;
		print " $self->{files_cnt} files processed\n" if $self->{debug};
	}
}

## note the singular "file", as it should return only one file
sub file_by_tagpath {
	my ($basename,$directory) = File::Basename::fileparse(shift);
# print "Directory:$directory Basename:$basename\n";
	# 1st: only by tags
	my @tags = dirpath_to_tags($directory);

	return undef if !@tags;

	my @sql_files;
	for(@tags){
		push(@sql_files, "`tags` REGEXP '$_'");
	}
	my $sql_files = join(" AND ",@sql_files);

# print "PREFAIL: tags: @tags (".@tags.") ;; SELECT `file`,`basename` FROM `file_tags` WHERE $sql_files;\n";
	my $pre = database()->selectall_arrayref("SELECT `file`,`basename` FROM `file_tags` WHERE $sql_files; ", {Columns=>[1,2]}); # push first two rows into arrayref

	return undef if !@$pre;

	# 2nd: by basename
	my @files;
	for(@$pre){
		push(@files, ${$_}[0].'/'.${$_}[1]) if ${$_}[1] eq $basename;
	}

	print "++ WARNING ++ file_by_tagpath($basename,@_) found multiple files: @files\n" if @files > 1;

	return @files ? shift(@files) : undef;
}

sub virt_readdir {
	my ($path,$offset) = @_;

	my (@dirs,@files);
	if($path eq '/'){
		## full tags list SQL style
		my $dirs = database()->selectcol_arrayref("SELECT `tag` FROM `tags`; "); # push first row into arrayref
		@dirs = @$dirs;
	}else{
		my @regex_dirs;
		my @sql_files;
		my @pathtags = dirpath_to_tags($path);
		for(@pathtags){
			push(@regex_dirs, '^'.$_.'$');
			push(@sql_files, "`tags` REGEXP '$_'");
		}

		my $regex_dirs = join('|',@regex_dirs);
		$regex_dirs = qr/$regex_dirs/;
		my $sql_files = join(" AND ",@sql_files);
		print "## virt_readdir: $path: regex_dirs:$regex_dirs ; sql_files:$sql_files\n" if $self->{debug};

		## gather files: SQL style: results-set
		my %tags;
		$entries = database()->prepare("SELECT `basename`,`tags` FROM `file_tags` WHERE $sql_files; "); # push first row into arrayref
		$entries->execute();
		while( my $entry = $entries->fetchrow_hashref ){
			push(@files, $entry->{basename});

			for( split(/,\s*/,$entry->{tags}) ){
				my $tag_cleaned = lc($_);
				$tag_cleaned =~ s/[^\p{L}\p{N}]//g;	# matches all (Unicode) characters that are neither letters nor numbers
				next if $tag_cleaned =~ $regex_dirs;
				$tags{$_}++;
			}
		}
		@dirs = keys %tags;
	}

	print "## virt_readdir: $path: sub-tags left (as dirs):@dirs ; files:@files\n" if $self->{debug};
	return (@dirs || @files) ? ((@dirs,@files), 0) : 0;
}

sub real_getattr {
	my $file = shift; # we have real paths in the db anyway
	print "real_getattr: file:$file\n" if $self->{debug};
	my (@list) = lstat($file);
	return -ENOENT() unless @list;	# "-ENOENT" was "-$!", but if we compare both in Dumper, "-$!" is a string, and ENOENT is numeric
	return @list;
}

sub virt_getattr {
	my ($path) = shift;
	print "## virt_getattr: path:$path => " if $self->{debug};

	return -ENOENT() unless $self->{tags_cnt};

#	my $cnt = () = $path =~ /\//g; # from an older approach, to find out how deep we are in the tag-path

	## find which file exactly is meant here
	my $file = file_by_tagpath($path);

	if($file){
		return real_getattr( $file );
	}else{
		print "## virt_getattr: path:$path ; file_by_tagpath() returned <undef>\n" if $self->{debug};
		my ($modes) = (0040<<9) + 0775;
		my ($dev, $ino, $rdev, $blocks, $uid, $gid, $nlink, $blksize) = (0,0,0,1,$self->{uid},$self->{gid},1,1024);
		my $size = 0;
		$blocks = $size;
		my ($atime, $ctime, $mtime);
		$atime = $ctime = $mtime = $self->{db_epoch};

		return ($dev,$ino,$modes,$nlink,$uid,$gid,$rdev,$size,$atime,$mtime,$ctime,$blksize,$blocks);
	}
	return -ENOENT(); # never
}

sub real_open {
	my ($path,$mode) = @_;

	## find which file exactly is meant here
	my $file = file_by_tagpath($path);

	return -ENOSYS() if !$file;
	return -ENOENT() unless -e $file;

	my $fh;
	sysopen($fh,$file,$mode) or return -$!;

	return (0, $fh);
}

sub real_read {
	my ($path,$bufsize,$off,$fh) = @_;

	my $rv = -ENOSYS();

	if(seek($fh,$off,SEEK_SET)) {
		read($fh,$rv,$bufsize);
	}

	return $rv;
}

sub real_release {
	my ($path,$mode,$fh) = @_;

	close($fh) or return -$!;

	return 0;
}

sub virt_statfs { return 255, 1, 1, 1, 1, 2 }

sub umount {
	database()->disconnect();
}


1;

__END__

=head1 NAME

Fuse::TagLayer - A read-only tag-filesystem overlay for hierarchical filesystems

=head1 SYNOPSIS

  use Fuse::TagLayer;
  my $ftl = Fuse::PerlSSH::FS->new(
	realdir		=> '/some/local/path',
	mountpoint	=> '/some/local/mountpoint',
	debug		=> 2,
  );
  $ftl->mount();

The bundled L<taglayer> mounting script uses this module here, I<Fuse::TagLayer>, as
its backend. On mount, it scans a specified dir for tags and mounts them as
the TagLayer filesystem at the mountpoint, by default /path/to/specified-dir/+tags.

  taglayer <real directory> [<tag directory mountpoint>]

=head1 DESCRIPTION

Fuse::TagLayer offers all the tags found in one subdir/volume as a tag-based file-system
at the mountpoint you specify, currently read-only. This is in addition to the real
filesystem which is considered to be 'canonical' - with the tag-file-system being
just another "layer" to access these files (thus the name).

=head2 How it works

Fuse::TagLayer, on mount, scans a specified dir-path and gathers all the tags found in
the files' "user.tags" extended-attribute. These xattr-tags are supplemented
by "tags" derrived from what could be called "directory fragments". That means, a
path like "/Path/to/file" is interpreted as being the tags "Path" and "to" (dropping
the filename as source for tags for now). All these tags then are inserted into a
database (SQLite) and the db is used to expose a tag-based file system at the mountpoint.

=head1 METHODS

Right now, the module offers some OO-ish methods, and some plain functions. The mounting
script uses the below OO methods new(), mount() and umount(). But note the quirk that
$self is stored in a global I<our> variable, to mediate between the OO API and the 
Fuse-style functions.

=head2 new()

=head2 mount()

=head2 umount()

=head1 FUNCTIONS

A growing list of functions that match the FUSE bindings, some prefixed by "virt_"
and some by "real_". The latter faciliating the loopback/ pass-trough to the real
filesystem:

  virt_readdir()
  virt_getattr()
  real_getattr()
  real_open()
  real_read()
  real_release()

=head1 EXPORT

None by default.

=head1 CAVEATS or TODO

=head2 Should root contain all or no files?

When we regard the root dir as displaying files without any tags, then only these should show
up. When we regard tags as filters, root would show all files, as on root-level, no
tags (filters) are applied, a bit like in a global key-value filesystem. But when we
think of webapps, most apps will ask you for at least one tag before you can browse
results, so following this paradigm, root should show no files.

=head2 Uniqueness

Currently, filenames, just as tags, are treated as being unique within the tag-filesystem.
So, files of the I<same name> in I<different> directories are not handled properly.
Only one of these name-doublettes might show up after the internal deduplication.

=head2 No tests

No working tests. But everything is read-only so trying TagLayer should be safe.

=head2 On "tagging" (or why it's read-only)

Right now, the resulting tag-fs is read-only, as we haven't implemented write() to the
tag-based path so far. Eventually, when TagLayer grows into a real loop layer, this might
change. Also, once this happens, we have to decide if tags coming from the 'canonical path'
directory elements parsing, are to be considered read-only or not. (Would adding/removing
a tag result in a I<mv> within the underlying real file-system?)

=head2 Tagged directories

Via xattr, it is possible to tag a directory. This is ignored for now, as we regard all 
dirs within the tag-path to be "virtual" and only files in there as being "real". Makes
things easier and is probably in-line with the idea behind a tag-based fs, putting away
with directories.

=head1 SEE ALSO

L<FUSE|Fuse>, obviously.

=head1 AUTHOR

Clipland GmbH L<http://www.clipland.com/>

=head1 COPYRIGHT & LICENSE

Copyright 2012-2013 Clipland GmbH. All rights reserved.

This library is free software, dual-licensed under L<GPLv3|http://www.gnu.org/licenses/gpl>/L<AL2|http://opensource.org/licenses/Artistic-2.0>.
You can redistribute it and/or modify it under the same terms as Perl itself.