lib/File/Find/Duplicates.pm

package File::Find::Duplicates;

=head1 NAME

File::Find::Duplicates - Find duplicate files

=head1 SYNOPSIS

  use File::Find::Duplicates;

  my @dupes = find_duplicate_files('/basedir1', '/basedir2');

  foreach my $dupeset (@dupes) { 
    printf "Files %s (of size %d) hash to %s\n",
      join(", ", @{$dupeset->files}), $dupeset->size, $dupeset->md5;
  }

=head1 DESCRIPTION

This module provides a way of finding duplicate files on your system.

=head1 FUNCTIONS

=head2 find_duplicate_files

  my @dupes = find_duplicate_files('/basedir1', '/basedir2');

When passed a base directory (or a list of such directories), it returns
a list of objects, one per set of identical files, each with the
following methods:

=head2 files

A listref of the names of the duplicate files.

=head2 size

The size of the duplicate files.

=head2 md5

The md5 sum of the duplicate files.

=head1 TODO

Check the contents of tars, zipfiles etc to ensure none of these also
exist elsewhere (if so requested).

=head1 SEE ALSO

L<File::Find>.

=head1 AUTHOR

Tony Bowden

=head1 BUGS and QUERIES

Please direct all correspondence regarding this module to:
  bug-File-Find-Duplicates@rt.cpan.org

=head1 COPYRIGHT AND LICENSE

  Copyright (C) 2001-2005 Tony Bowden.

  This program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License; either version 2 of the License,
  or (at your option) any later version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  FOR A PARTICULAR PURPOSE.

=cut

use strict;
use File::Find;
use Digest::MD5;
require Exporter;
use vars qw($VERSION @ISA @EXPORT);

# Inherit import() from Exporter and export find_duplicate_files into the
# caller's namespace by default.
@ISA     = qw/Exporter/;
@EXPORT  = qw/find_duplicate_files/;
$VERSION = '1.00';

# Generate the File::Find::Duplicates::Set class with three accessors:
# files (array), size (scalar) and md5 (scalar).
use Class::Struct 'File::Find::Duplicates::Set' =>
  [ files => '@', size => '$', md5 => '$' ];

# find_duplicate_files(@dirs)
#
# Walks every directory in @dirs with File::Find and returns a list of
# File::Find::Duplicates::Set objects, one per group of two or more regular
# files that share both their size and their MD5 digest.
#
# Groups are returned largest file size first; groups of equal size are
# returned in ascending digest order.  Files that cannot be opened for
# reading are silently skipped.  Returns an empty list when no duplicates
# are found.
sub find_duplicate_files {
  my (@dupes, %files_by_size);

  # Pass 1: bucket regular files by size.  stat(_) reuses the stat buffer
  # filled by -f, so each file is only stat'ed once.
  find sub {
    return unless -f $_;
    push @{ $files_by_size{ (stat(_))[7] } }, $File::Find::name;
  }, @_;

  # Pass 2: only files of equal size can be identical, so digests are
  # computed solely for sizes seen more than once.
  foreach my $size (sort { $b <=> $a } keys %files_by_size) {
    my $candidates = $files_by_size{$size};
    next unless @$candidates > 1;

    my %files_by_md5;
    foreach my $file (@$candidates) {
      # Three-argument open takes the name literally.  The two-argument form
      # strips surrounding whitespace and interprets leading '<' / '>' or a
      # trailing '|' as a mode, which can skip, clobber or execute files.
      open(my $fh, '<', $file) or next;    # best effort: skip unreadable files
      binmode($fh);
      push @{ $files_by_md5{ Digest::MD5->new->addfile($fh)->hexdigest } },
        $file;
      close($fh);
    }

    # Sorting the digests makes the order of the result deterministic.
    foreach my $digest (sort keys %files_by_md5) {
      next unless @{ $files_by_md5{$digest} } > 1;
      push @dupes, File::Find::Duplicates::Set->new(
        files => $files_by_md5{$digest},
        size  => $size,
        md5   => $digest,
      );
    }
  }
  return @dupes;
}

# A file loaded via require/use must finish by yielding a true value; this
# module returns a non-empty string instead of the conventional "1;".
return q/
 dissolving ... removing ... there is water at the bottom of the ocean
/;

	Global
`s`	Focus search bar
`?`	Bring up this help dialog

	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is the preferred repository)

	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse

	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)