#!/usr/bin/perl
# Walk the given directories (default: current directory), collect mp3 files
# per directory and delete files whose audio-stream checksum equals that of a
# file seen earlier in the SAME directory. With -n nothing is deleted.
# See the POD at the end of the file for usage notes.

use strict;
use warnings;
use File::Find;
use Audio::Digest::MP3;
use Getopt::Std;
use Term::Activity;

my %opts;
getopts( 'n', \%opts );

my $ta = Term::Activity->new() or die "Problem with Term::Activity\n";

@ARGV = (".") unless @ARGV;

my %seen;                  # directory => list of mp3 paths found in it
my %listed;                # paths already recorded (guards overlapping start dirs)
my $numTotalMp3    = 0;    # count all mp3 files
my $checkedAlready = 0;    # count files whose checksum has been determined
my $numDeleted     = 0;    # count deleted (or, with -n, would-be deleted) files
my $saved_size     = 0;    # bytes freed (or, with -n, that would be freed)
my $numWarn        = 0;    # count problem files

print "Process each dir with mp3s looking for checksum duplicates in that dir\n";
print "Operate in no-delete mode!\n" if $opts{n};
print "Read dir tree (from @ARGV).\n";

find( { wanted => \&wanted }, @ARGV );

# Release the progress indicator before printing further output. Dropping the
# only reference lets Perl run the destructor once, instead of calling
# DESTROY by hand and then having it run again on release.
undef $ta;

# Directories are processed in sorted order; within a directory the files are
# processed in the order File::Find reported them.
foreach my $dir ( sort keys %seen ) {
    print "Determining checksums in dir (so far $checkedAlready of $numTotalMp3 files):\n$dir\n";
    my %checksumPerDir;    # checksum => paths in this directory having it

    foreach my $item ( @{ $seen{$dir} } ) {
        unless ( -f $item ) {
            $numWarn++;
            warn "Problem with $item. Cannot determine checksum.\n";
            next;
        }

        my $streaminfo = Audio::Digest::MP3->scan( $item, 'MD5' );

        # Assumes scan() yields undef when it cannot read the file (TODO
        # confirm). Without this guard every unreadable file would share the
        # key "" and be treated as a duplicate of the previous unreadable one.
        unless ( defined $streaminfo ) {
            $numWarn++;
            warn "Problem with $item. Cannot determine checksum.\n";
            next;
        }
        $checkedAlready++;

        # NOTE(review): $streaminfo is used directly as a hash key, so this
        # relies on it stringifying to the digest -- confirm against the
        # Audio::Digest::MP3 documentation.
        push @{ $checksumPerDir{$streaminfo} }, $item;

        # The first file with a given checksum is kept, later ones are removed.
        action($item) if @{ $checksumPerDir{$streaminfo} } > 1;
    }
}

#REPORT RESULTS
print "\n\nREPORT\n";

my $unit;
( $saved_size, $unit ) = format_size($saved_size);

# Ratio: $numTotalMp3 - $numDeleted = files kept
# Ratio kept : deleted, e.g. 1:1.2
# Guard the division: with no mp3 files at all the divisor would be zero.
my $numKept = $numTotalMp3 - $numDeleted;
my $ratio   = $numKept > 0
    ? sprintf( "%.2f", $numDeleted / $numKept )
    : "0.00";

if ( $opts{n} ) {
    print "Warnings encountered:$numWarn\n";
    print "Would have deleted $numDeleted checksum duplicate(s) of $numTotalMp3 encountered mp3 files (1:$ratio).\n";
    print "Would have saved $saved_size $unit.\n";
}
else {
    print "Deleted $numDeleted checksum duplicate(s) of $numTotalMp3 "
        . "encountered mp3 files.\n";
    print "Warnings encountered: $numWarn\n";
    print "Ratio (kept : replicas): 1.00 : $ratio\n";
    print "Saved disk space: $saved_size $unit.\n";
}

# A completed run is a success.
exit 0;

# Scale a byte count to the largest unit (bytes, KB, MB, GB) in which it still
# exceeds 1024 of the next smaller unit.
# Parameter: number of bytes.
# Returns:   list of (value, unit name); the value is left untouched for
#            "bytes" and formatted with two decimals otherwise. Rounding is
#            done once, after all divisions.
sub format_size {
    my ($bytes) = @_;
    my @units = qw(bytes KB MB GB);
    my $idx   = 0;
    my $value = $bytes;

    while ( $value > 1024 && $idx < $#units ) {
        $value /= 1024;
        $idx++;
    }

    my $shown = $idx ? sprintf( "%.2f", $value ) : $value;
    return ( $shown, $units[$idx] );
}

# File::Find callback: advance the progress indicator and record every path
# ending in ".mp3" (any letter case) under the directory it was found in.
sub wanted {
    $ta->tick;

    # Escaped dot and end-of-string anchor: only a real ".mp3" suffix matches.
    return unless $File::Find::name =~ /\.mp3\z/i;

    # When start directories overlap, the same path is reported more than
    # once; recording it twice would make the file a "duplicate" of itself.
    return if $listed{$File::Find::name}++;

    ++$numTotalMp3;
    push @{ $seen{$File::Find::dir} }, $File::Find::name;
    return;
}

# Handle one duplicate: count it, announce it, add its size to the total and,
# unless -n was given, delete it.
# Parameter: path of the duplicate file.
# Dies if the file cannot be deleted.
sub action {
    my $duplicate = shift;
    $numDeleted++;
    print "\tDELETE checksum duplicate:\n\t$duplicate\n";

    # Count the size in no-delete mode too, so that the "Would have saved"
    # report is meaningful. -s yields undef if the file cannot be stat'ed.
    my $size = -s $duplicate;
    $saved_size += $size if defined $size;

    return if $opts{n};

    unlink($duplicate) or die "Cannot delete $duplicate: $!\n";
    return;
}

=head1 NAME

rmMp3Duplicates.pl

=head1 VERSION

0.1 February 2009

=head1 SYNOPSIS

    rmMp3Duplicates [-n] [path]

If you don't specify any path, current directory is assumed.

    -n  turn off delete and just display what would be done

=head1 DESCRIPTION

This little quick and dirty script looks recursively in the specified
directory for mp3 files, determines their checksum and DELETES duplicates.

=head1 COMMENTS

If you have a structure like this: artist/album/track1.mp3 you probably don't
need two files with the same checksum. (Maybe the same track is on different
albums. In that case it is much more likely that you want to keep both. So
this script will not delete those.)

This script does not calculate the checksum for the whole mp3 file, but only
for the datastream part, i.e. metadata is not considered.

It keeps the first file it encounters, and removes all duplicates encountered
afterwards. On my system, it works in normal file order according to
alphanumerical listing of the files, so that trackname.mp3 is kept and
trackname 1.mp3 is deleted.
If some of your duplicates contain important metadata and others don't, then
you probably do not want to use this script.

=head1 KNOWN ISSUES

Several instances of this script should run fine next to each other, except
when one locks a file that the other is trying to delete. This produces an
error and the script dies. Not sure if this is a bug or a feature.

Warnings are also raised with filenames containing strange characters. I don't
know why at the moment. The script continues in these cases, but reports the
number of problems at the end.

=head1 AUTHOR

(c) 2009 Maurice Mengel

Released under the Perl Artistic License

=head1 SEE ALSO / DEPENDENCIES

File::Find; Audio::Digest::MP3; Getopt::Std; Term::Activity;
MPEG::Audio::Frame

=cut