Searching a drive and copying files

Sun Jul 23 17:28:04 UTC 2006

in message <336A5DA6-5A43-44C0-8961-139C81702AB3 at familyfunzone.net>,
wrote Joshua Lewis thusly...
>
> I need to search my drive for all pictures on my system and copy
> them  to a networked system using sftp or ssh or what not. There
> will be  duplicate names on the drive so I was hoping to have dups
> placed in a  separate folder.

Unison, net/unison port, should be able to handle the duplicates
based on file checksum.  (I personally have not used it much, so i
cannot answer any other queried about it; refer to its fine man
page.)

> Due to my for lack of a better term stupidity when I  first got
> my camera I will probably have instances when there will be  three
> or four duplicates.  can help me out with that it  would be great.
...
> My goal is to find all my pictures and compare them then delete
> the  dups that don't look that good. A daunting task as I have 20
> GB of  data. I bet 10 GB are dups.

A checksum-based management of duplicates will help with the files
with identical contents, but not with files that differ even a bit.

Perl program below -- a modified version of Randal Schwartz's
version[0] -- uses md5(1) to identify duplicates (as in identical
files), failing that, Image::Magick based on fuzz factor.  When it
finds duplicates, it asks to enter the item number from the file
list to be deleted.

  [0] Article "Finding similar images",
      http://www.stonehenge.com/merlyn/LinuxMag/col50.html

To be able to run, it needs Image::Magick (graphics/ImageMagick
port), Cache::FileCache (devel/p5-Cache-Cache), List::Util
(lang/p5-Scalar-List-Utils), File::Copy & File::Path.

Mind that it, rather Image::Magick, may consume all of your memory
and/or temporary fs if you run it on all the files at once.

If you are good in Perl, you could modify the program to move the
duplicates in a directory (instead of deleting), and possibly not to
ask to take the particular action (if as you say you would have a
boat load of duplicates).

Without further interruptions, program follows ...

  #!perl

  #  This is a modified version of Randal Schwartz's ...
  #
  #    http://www.stonehenge.com/merlyn/LinuxMag/col50.html
  #
  #  ... as it uses checksum (MD5 for now) to detect identical files, failing that
  #  uses Image::Magick.

  use warnings; use strict;

  $|++;

  use Image::Magick;
  use Cache::FileCache;
  use File::Copy qw( move );
  use File::Path qw( mkpath );
  use List::Util qw( reduce );

  use Carp qw(carp);

  use Getopt::Long qw( :config gnu_compat no_ignore_case no_debug );

  #  User option; permitted average deviation in the vector elements.
  my $fuzz = 15;

  #  User option; if defined, rename corrupt images into this dir.
  my $corrupt_dir = "CORRUPT";
  {
    my $usage;
    GetOptions
    (
      'h|usage|help' => \$usage
    , 'f|fuzz=i' => \$fuzz
    , 'c|corrupt=s' => \$corrupt_dir
    , 'nc|nocorrupt' => sub { undef $corrupt_dir; }
    )
    or usage( 1 );

    usage( 0 ) if $usage;

    #  Check if any arguments remain which will be file names
    usage( 1, "No file(s) or directory(ies) given." ) unless scalar @ARGV;
  }

  sub warnif;

  my $cache = Cache::FileCache->new
    ( {
        namespace => 'image.cache'
      , cache_root => ( glob( "~/log/misc" ) )[ 0 ]
      }
    );

  my @buckets;

  FILE: while ( @ARGV )
  {
    my $file = shift;
    next FILE if -l $file;
    if ( -d $file )
    {
      opendir DIR, $file or next FILE;
      unshift @ARGV, map { m/^\./ ? () : "$file/$_"; } sort readdir DIR;
      next FILE;
    }

    next FILE unless -f _ or -d _;

    my ( @stat ) = stat _ or die "should not happen: $!";

    #  dev/ino/mtime
    my $key = "@stat[ 0, 1, 9 ]";

    my @vector;

    #print "$file ";
    if ( my $data = $cache->get( $key ) )
    {
      #print "... is cached\n";
      @vector = @$data;
    }
    else
    {
      my $image = Image::Magick->new;
      if ( my $x = $image->Read( $file ) )
      {
        if ( defined $corrupt_dir and $x =~ m/corrupt|unexpected end-of-file/i )
        {
          print "$file ";
          print "... renaming into $corrupt_dir\n";

          -d $corrupt_dir
            or mkpath $corrupt_dir, 0, 0700
            or die "Cannot mkpath $corrupt_dir: $!";

          move $file, $corrupt_dir or warn "Cannot rename: $!";
        }
        else
        {
          print "$file ";
          print "... skipping ( $x )\n";
        }
        next FILE;
      }

      #print "is ", join( "x", $image->Get( 'width', 'height' ) ), "\n";
      warnif $image->Normalize();
      warnif $image->Resize( geometry => '4x4!' );
      warnif $image->Set( magick => 'rgb' );
      @vector = unpack "C*", $image->ImageToBlob();
      $cache->set( $key, [ @vector ] );
    }
    BUCKET: for my $bucket ( @buckets )
    {
      my $error = 0;
      INDEX: for my $index ( 0 .. $#vector )
      {
        $error += abs( $bucket->[ 0 ][ $index ] - $vector[ $index ] );
        next BUCKET if $error > $fuzz * @vector;
      }
      push @$bucket, $file;

      #print "linked ", join( ", ", @$bucket[ 1 .. $#$bucket ] ), "\n";
      next FILE;
    }
    push @buckets, [ [ @vector ], $file ];
  }

  #  Connect images only, no interactive process
  #exit;

  for my $bucket ( @buckets )
  {
    my @names = @$bucket;
    shift @names;  # first element is vector
    next unless @names > 1;  # skip unique images
    my $images = Image::Magick->new;
    $images->Read( @names );

    compare_as_text( $images );

    my $sums = collect_md5sum( $images );
    {
      #  Silence warning about single use of $b.
      no warnings 'once';
      compare_as_image( $images )
        unless reduce { $a eq $b ? $a : 0 } @$sums;
    }

    print "Delete? [picture number] ";

    my $img_count = scalar @{ $images };
    my @dead;
    chomp( my $dead = <STDIN> );
    @dead =
        $dead =~ m/^ \s* [*+] $/x  ? ( 1 .. $img_count )
      : $dead =~ m/^ \s* - \d+ $/x ? ( $img_count + $dead + 1 .. $img_count )
      : grep { $_ >= 1 and $_ <= $img_count } $dead =~ /(\d+)/g;

    for ( @dead )
    {
      my $dead_name = $images->[ $_ - 1 ]->Get( 'base-filename' );

      warn "rm $dead_name\n";
      unlink $dead_name or warn "Cannot rm $dead_name: $!";
      warn "\n";
    }
  }

  sub compare_as_text
  {
    my $images = shift;

    my $frmt = "%d: %s\n -- %dx%d  %0.3f kB\n";

    foreach my $img ( 0 .. scalar @$images - 1 )
    {
      printf $frmt , ( $img + 1 ), $images->[ $img ]->Get( 'base-filename' )
      , $images->[ $img ]->Get( 'width' ), $images->[ $img ]->Get( 'height' )
      , ( $images->[ $img ]->Get( 'filesize' ) / 1024 )
      ;
    }
  }

  sub collect_md5sum
  {
    my $images = shift;
    my @md5;
    foreach ( 0 .. scalar @$images - 1 )
    {
      my $name = $images->[ $_ ]->Get( 'base-filename' );
      push @md5, ( split ' ', qx/ md5 $name / )[ 3 ];
    }
    return [ @md5 ];
  }

  sub compare_as_image
  {
    my $images  = shift;
    my $montage =
      $images->Montage
      ( geometry => '370x500' , tile => '2x2' , label => "[%p] %i %wx%h %b" );

    print "processing...\n";
    $montage->Display();
  }

  sub warnif
  {
    my $value = shift;
    carp $value if $value;
  }

  sub usage
  {
    my ( $exit, $message ) = @_;

    print STDERR $message, "\n" if $exit && $message;

    my $old_fd = select( $exit == 0 ? \*STDOUT : \*STDERR );
    print <<"_USAGE_";
  similar-image - Keep|Delete similar looking images

  similar-image [ -fuzz <avg dev> ]
                [ -corrupt <directory> | -nocorrupt ]
                < files directories >

  This program takes the following options ...

    -f | -fuzz        Permitted average deviation in the vector
                      elements; (set value: $fuzz).

    -c | -corrupt     Move corrupt images into this directory; (set
                      value: $corrupt_dir).

    -nc | -nocorrupt  Do not define a corrupted-image directory (so that
                      files are not moved).
  _USAGE_

    select $old_fd;
    exit( $exit );
  }
  __END__

  - Parv

--