#!/usr/bin/env perl
#
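# fs - find same; search for files with identical contents
#
# Example:
#   remove duplicated files, keeping the oldest file of each group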
#   $ find . -type f | fs -t | while read size orig dups; do rm $dups; done
#
# $Id: fs 6 2006-01-07 06:42:50Z ta $

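# NOTE: the four notes below were mis-encoded in this copy of the file; the English text is a best-effort reconstruction and should be checked against the original.
#XXX Idea: a mode that only looks for files matching one given file (or the files under one given directory), i.e. without comparing the remaining files/directories with each other.
#XXX Identical-file reporting: the same file named in two ways (e.g. hoge and ./hoge) is presumably treated as two different files?
#XXX Idea: an option that also reports two files that agree up to the point where one of them reaches EOF (one file is a prefix of the other).
#XXX Idea: an option that inverts the result. Of doubtful use on its own, but convenient when combined with other tools.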

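# NOTE: the five notes below were mis-encoded in this copy of the file; the English text is a best-effort reconstruction and should be checked against the original.
# An option that reports only files whose names also match is needed: combining `fs -l` with uniq does not work, because the output holds paths rather than bare file names, and reducing the paths to file names would lose the paths from the output.
# Treating files that differ only in character encoding or line endings as identical will not be implemented; combine fs with nkf or a similar tool instead.
# Recursive directory traversal will not be implemented; combine fs with find (find ... | fs -) instead of duplicating what find already does.
# Reading the file names from a list file is necessary: xargs -n cannot be used, because fs must not be split into several invocations.
# On reflection, the comparison could also be arranged as "check the file name and/or timestamp first, then cmp".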

use strict;
use File::Basename;
use Getopt::Std;

# Order in which the files of one group of identical files are listed.
# $sort holds one of these values; -n selects SO_NAME and -t selects SO_MTIME.
use constant {
    SO_NONE  => 0,    # leave the files in the order they were found
    SO_NAME  => 1,    # order by pathname
    SO_MTIME => 2,    # order by last-modified time
};

# Whether two files must also carry the same file name to be reported.
# $samename holds one of these values; -d selects SN_YES and -D selects SN_WGET.
use constant {
    SN_NO   => 0,    # names are not compared
    SN_YES  => 1,    # base names must be equal
    SN_WGET => 2,    # base names must be equal, where xxx, xxx.1, xxx.2, ... count as the same name
};

# --- State built up while scanning -----------------------------------------
my %files = ();           # size => [ paths of files of that size that differ from each other ]
my %same = ();            # path => [ paths of files found identical to it ]
my $surplus = 0;          # total size of the duplicates found (maintained only when $showsurplus is set)

# --- Settings, adjusted from the command-line switches ---------------------
my %opts = ();            # raw switches as filled in by getopts
my $sort = SO_NONE;       # listing order inside a group (SO_*); -n / -t
my $samename = SN_NO;     # file-name requirement (SN_*); -d / -D
my $samemtime = 0;        # true: last-modified times must match as well; -m
my $longfmt = 0;          # true: long output format; -l
my $showsurplus = 0;      # true: print $surplus after the report; -s
my $showsize = 1;         # true: short format (!$longfmt) starts each line with the file size
my $ignzero = 1;          # true: skip empty files; cleared by -z

# Print the usage text.
#
# Getopt::Std calls this sub for --help and passes the output filehandle as
# the first argument; the script itself calls it without arguments for -h.
# When no filehandle reference is given the text goes to STDOUT.
#
#   $out - (optional) filehandle reference to print to
sub HELP_MESSAGE {
    my ($out) = @_;
    $out = \*STDOUT unless ref $out;

    my $progname = basename($0);
    print {$out} <<"END_OF_HELP";
Usage: $progname [OPT] FILE FILE [...]
Usage: $progname [OPT] [LISTFILE]
    LISTFILE: contains a file name per line (default: stdin)
OPT:
  -0    use NUL as the delimiter for LISTFILE.
  -d    report only files with same name.
  -D    similar to -d, but xxx and xxx.1, xxx.2, ... are considered same name.
  -h    print this help.
  -l    list in long format.
  -m    report only files with same last modified time.
  -n    sort by pathname.
  -s    output sum of all duplicating files.
  -t    sort by last modified time.
  -z    don't ignore empty files.
Output format:
  short format:
    size1 file1_1 file1_2 file1_3 ...
    size2 file2_1 file2_2 file2_3 ...
  long format:
     size1 timestamp1_1 file1_1
    =size1 timestamp1_2 file1_2
    =size1 timestamp1_3 file1_3
     size2 timestamp2_1 file2_1
    =size2 timestamp2_2 file2_2
Exit status:
  0   found
  1   not found
  >1  error
END_OF_HELP
}

# Parse the command-line switches into %opts and derive the settings.
# With STANDARD_HELP_VERSION set, Getopt::Std prints --help / --version
# output to STDOUT and exits by itself.
$Getopt::Std::STANDARD_HELP_VERSION = 1;

# getopts returns false for an unknown switch (it has already printed a
# warning); stop instead of running with a half-understood command line.
# Status 2 matches the ">1 error" promise of the help text.
unless (getopts('0dDhlmnstz', \%opts)) {
    print STDERR "Try '" . basename($0) . " -h' for more information.\n";
    exit 2;
}

if ($opts{h}) {
    HELP_MESSAGE();
    exit 0;
}

$longfmt     = 1 if $opts{l};
$ignzero     = 0 if $opts{z};
$showsurplus = 1 if $opts{s};
$samemtime   = 1 if $opts{m};

# Later assignments win: -t overrides -n, and -D overrides -d.
$sort     = SO_NAME  if $opts{n};
$sort     = SO_MTIME if $opts{t};
$samename = SN_YES   if $opts{d};
$samename = SN_WGET  if $opts{D};


# Decide whether two files count as "the same" under the current options.
#
#   $file1, $file2 - paths of the files to compare
#
# Returns 0 when the files are the same, 1 when they differ (in content, or
# in the name / timestamp requirement selected by $samename / $samemtime),
# and -1 when a file cannot be stat'ed, opened or read; $! then holds the
# reason for the failure.
sub filecmp {
    my ($file1, $file2) = @_;

    # Name requirement first: it needs no I/O at all.
    if ($samename == SN_YES) {
        return 1 if basename($file1) ne basename($file2);
    } elsif ($samename == SN_WGET) {
        # Drop a trailing ".N" (N = 1, 2, ...) so that xxx, xxx.1, xxx.2, ...
        # all reduce to the same name.
        my @names = map { basename($_) } ($file1, $file2);
        s/\.[1-9][0-9]*$// for @names;
        return 1 if $names[0] ne $names[1];
    }

    # Timestamp requirement: a failed stat is an error, not a mismatch.
    if ($samemtime) {
        my $mtime1 = (stat($file1))[9];
        return -1 unless defined $mtime1;
        my $mtime2 = (stat($file2))[9];
        return -1 unless defined $mtime2;
        return 1 if $mtime1 != $mtime2;
    }

    # Three-argument open: the path is taken literally, so names such as
    # "foo|" or ">foo" cannot run a command or truncate a file.
    open(my $fh1, '<', $file1) or return -1;
    my $fh2;
    unless (open($fh2, '<', $file2)) {
        my $errno = $! + 0;
        close($fh1);
        $! = $errno;    # report the open failure, not the outcome of close
        return -1;
    }
    binmode($fh1);
    binmode($fh2);

    my $result = 0;
    my $errno  = 0;
    while (1) {
        my ($chunk1, $chunk2);
        my $n1 = sysread($fh1, $chunk1, 4096);
        my $n2 = sysread($fh2, $chunk2, 4096);

        # sysread returns undef on error (e.g. when a directory was opened);
        # that must not be mistaken for end of file.
        if (!defined $n1 || !defined $n2) {
            $errno  = $! + 0;
            $result = -1;
            last;
        }
        if ($chunk1 ne $chunk2) {
            $result = 1;
            last;
        }
        # The chunks are equal, so an empty one means both files hit EOF.
        last if $n1 == 0;
    }
    close($fh1);
    close($fh2);

    $! = $errno if $result < 0;
    return $result;
}

# Register one path: either record it as identical to a file seen earlier
# (in %same) or remember it as a new, so far unique file of its size (in
# %files).
#
#   $path - path of the file to register
#
# A path that does not exist is reported on STDERR and skipped.  Empty files
# are skipped while $ignzero is set.  A path that was registered before is
# ignored, whether it was stored as a unique file or as a duplicate.
# Comparison errors are reported on STDERR and treated as "not identical".
sub scan {
    my ($path) = @_;

    if (!-e $path) {
        print STDERR basename($0) . ": $path: $!\n";
        return;
    }

    # -s yields a false value for empty files; normalise that to 0 and stat
    # the file only once.
    my $size = (-s $path) || 0;
    return if $ignzero && $size == 0;

    my $candidates = $files{$size} ||= [];

    # Ignore a path given more than once.  It may be stored as a unique file
    # or as a duplicate of one; without the second check a repeated duplicate
    # would be listed and counted twice.
    foreach my $known (@$candidates) {
        return if $known eq $path;
        return if grep { $_ eq $path } @{ $same{$known} || [] };
    }

    foreach my $known (@$candidates) {
        my $verdict = filecmp($known, $path);
        if ($verdict < 0) {
            print STDERR basename($0) . ": cannot compare $known and $path: $!\n";
            next;
        }
        next if $verdict != 0;

        push @{$same{$known}}, $path;
        $surplus += $size if $showsurplus;
        return;
    }

    # Nothing identical so far: keep it as a candidate for later paths.
    push @$candidates, $path;
}

# Feed every input path to scan().  With fewer than two arguments the paths
# are read from LISTFILE (a single argument) or from standard input (no
# argument, or "-"); otherwise the arguments themselves are the paths.
if (@ARGV < 2) {
    my $listfile = @ARGV ? $ARGV[0] : '-';

    my $in;
    if ($listfile eq '-') {
        $in = \*STDIN;
    } else {
        # Three-argument open: the list-file name is taken literally and can
        # neither start a command ("cmd|") nor open a file for writing.
        open($in, '<', $listfile) or die basename($0) . ": $listfile: $!\n";
    }

    # -0: records end in NUL instead of newline.  local restores the
    # separator automatically at the end of this block.
    local $/ = $opts{0} ? "\0" : $/;

    # Read record by record instead of slurping the whole list first.
    while (defined(my $path = <$in>)) {
        chomp $path;
        scan($path);
    }

    # Standard input is left open; only a handle opened here is closed.
    close($in) unless $listfile eq '-';
} else {
    scan($_) foreach @ARGV;
}

# Report every group of identical files, then the optional total, and exit
# with status 0 when at least one group was found and 1 otherwise.
#
# Groups are listed in the order of their first file's path so that the
# output is the same on every run (hash order is not).
foreach my $first (sort keys %same) {
    my @group = ($first, @{$same{$first}});

    # stat every file once, rather than once per comparison inside sort and
    # again for the long format.
    my %mtime;
    if ($sort == SO_MTIME || $longfmt) {
        $mtime{$_} = (stat($_))[9] foreach @group;
    }

    if ($sort == SO_MTIME) {
        @group = sort { $mtime{$a} <=> $mtime{$b} } @group;
    } elsif ($sort != SO_NONE) {
        @group = sort { $a cmp $b } @group;
    }

    if (!$longfmt) {
        # -s yields a false, empty value for empty files (reachable with -z);
        # print 0 so that the size column never disappears.
        my $prefix = $showsize ? ((-s $group[0]) || 0) . ' ' : '';
        print $prefix . join(' ', @group) . "\n";
    } else {
        # The first line of a group starts with " ", every further one with "=".
        my @lines = map {
            my $file = $_;
            my @t = localtime($mtime{$file});
            sprintf("%d %04d%02d%02d%02d%02d.%02d %s\n",
                -s $file,
                $t[5] + 1900, $t[4] + 1, $t[3], $t[2], $t[1], $t[0],
                $file)
        } @group;
        print " " . join("=", @lines);
    }
}
print "$surplus\n" if $showsurplus;

exit(%same ? 0 : 1);
