#!/usr/bin/perl
#this script creates the database that the scripts below will use
#CAREFUL! IF A PREVIOUS DATABASE ALREADY EXISTS, IT WILL BE DELETED !!!
use strict;
use warnings;
use DBI;
unlink 'checksum_db.sqlite' if -e 'checksum_db.sqlite'; #remove any previous database instead of shelling out to rm
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","") or die $DBI::errstr;
$dbh->do(q{
    CREATE TABLE checksums (
        id                 INTEGER PRIMARY KEY,
        checksum           VARCHAR(42),
        size               INTEGER,
        last_date_modified DATE,
        name               VARCHAR(200) UNIQUE,
        is_dir             VARCHAR(1),
        is_file            VARCHAR(1),
        is_link            VARCHAR(1),
        UNIQUE (checksum, name)
    )
});
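#name is UNIQUE, so every path appears at most once; the UNIQUE (checksum, name)
#constraint also gives sqlite an index whose leftmost column is checksum, which the
#duplicate-report script below can use when grouping and looking up by checksum.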
####
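# config.yml - the configuration file the scripts below load via $config_path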
minsize: 64
directories:
  - path: /usr
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /home/spx2/perlhobby
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /lib
    dir: 1
    file: 1
    link: 0
    regex: .*
####
#!/usr/bin/perl
#this script is used only for the first run, to populate the database
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","") or die $DBI::errstr;
my $config_path = 'config.yml';
my $config = LoadFile($config_path);
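#with the config.yml above, $config is now a hash reference shaped roughly like this:
#  { minsize => 64,
#    directories => [
#      { path => '/usr', dir => 1, file => 1, link => 0, regex => '.*' },
#      ... one entry per configured directory
#    ] }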
#the is_dir/is_file/is_link columns record what kind of entry the name column refers to (directory, plain file or symlink)
sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    #placeholders let DBI quote the values, so names containing quotes or other special characters are safe
    $dbh->do(
        "INSERT INTO checksums (checksum,size,last_date_modified,name,is_dir,is_file,is_link)
         VALUES (?,?,?,?,?,?,?)",
        undef,
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
}
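#after add_to_db runs, a row in the checksums table looks something like this
#(the values below are purely illustrative):
#  1|da39a3ee...|4096|2009-08-15|/usr/bin/perl|N|Y|N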
sub delete_from_db { #remains to be completed
    my ($name)=@_;
}
sub file2sha1 {
    my $file=$_[0];
    return '' if -d $file; #directories get no checksum; still have to decide whether to prune directories that don't match the regex
    open my $f, '<', $file or return ''; #skip unreadable files instead of dying
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile($f);
    close $f;
    return $sha1->hexdigest;
}
sub process_file {
    my $dir_configs=$_[0];
    ##possible optimisation: run -d -l -f -s only once and reuse the results here and in add_to_db
    #if the current "file" (in the unix sense) is a directory and the yaml configuration
    #tells us to eliminate directories from the search, we do so by returning from the callback
    return if -d $File::Find::name && ! $dir_configs->{dir};
    return if -l $File::Find::name && ! $dir_configs->{link};
    return if -f $File::Find::name && ! $dir_configs->{file};
    return if -s $File::Find::name < $config->{minsize};
    unless($File::Find::name =~ /$dir_configs->{regex}/) {
        #prune directories that don't match the regex so find() does not descend into them
        $File::Find::prune=1 if -d $File::Find::name;
        return;
    }
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks)
        = stat($File::Find::name);
    my $last_modif_time = DateTime->from_epoch(epoch => $mtime);
    # printf "%s %s %s %s\n",
    #     $File::Find::name,
    #     file2sha1($File::Find::name),
    #     -s $File::Find::name,
    #     $last_modif_time;
    add_to_db(file2sha1($File::Find::name), $last_modif_time, -s $File::Find::name, $File::Find::name);
    #print Dumper $dir_configs;
}
for my $searched_dir_hash (@{ $config->{directories} }) {
    #skip the entry if the path is missing or is not a directory
    next unless -d $searched_dir_hash->{path};
    #pass the yml configuration for the currently searched directory to process_file
    find(
        { wanted => sub { process_file($searched_dir_hash) } },
        $searched_dir_hash->{path}
    );
}
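#optional sanity check after the first run (uses the schema created above);
#uncomment to print how many entries were indexed:
#my ($count) = $dbh->selectrow_array("SELECT COUNT(*) FROM checksums");
#printf "indexed %d entries\n", $count;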
####
#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","") or die $DBI::errstr;
my $config_path = 'config.yml';
my $config = LoadFile($config_path);
sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    #placeholders let DBI quote the values, so names containing quotes or other special characters are safe
    $dbh->do(
        "INSERT INTO checksums (checksum,size,last_date_modified,name,is_dir,is_file,is_link)
         VALUES (?,?,?,?,?,?,?)",
        undef,
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
}
sub update {
    my ($name,$checksum,$last_modif_time)=@_;
    #bind order matters: checksum and date first, then the name for the WHERE clause
    $dbh->do(
        "UPDATE checksums SET checksum=?, last_date_modified=? WHERE name=?",
        undef,
        $checksum,
        $last_modif_time->ymd,
        $name
    );
}
sub find_or_update {
    my ($name,$last_modif_time)=@_;
    my $s = $dbh->prepare("SELECT last_date_modified FROM checksums WHERE name=?");
    $s->execute($name);
    my $results = $s->fetchall_arrayref;
    if(@{$results}) {
        #found it in the db
        return 2 if $last_modif_time->ymd eq $results->[0]->[0]; #return 2 if the entry is up to date
        update($name, file2sha1($name), $last_modif_time);
        return 1; #the entry was out of date and has been refreshed
    }
    return 0; #the entry has not been found - the caller should add it
}
sub file2sha1 {
    my $file=$_[0];
    return '' if -d $file; #directories get no checksum; still have to decide whether to prune directories that don't match the regex
    open my $f, '<', $file or return ''; #skip unreadable files instead of dying
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile($f);
    close $f;
    return $sha1->hexdigest;
}
sub process_file {
    my $dir_configs=$_[0];
    return if -d $File::Find::name && ! $dir_configs->{dir};
    return if -l $File::Find::name && ! $dir_configs->{link};
    return if -f $File::Find::name && ! $dir_configs->{file};
    return if -s $File::Find::name < $config->{minsize};
    unless($File::Find::name =~ /$dir_configs->{regex}/) {
        $File::Find::prune=1 if -d $File::Find::name;
        return;
    }
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks)
        = stat($File::Find::name);
    my $last_modif_time = DateTime->from_epoch(epoch => $mtime);
    #find out if the entry needs an update and refresh it if necessary;
    #find_or_update returns 0 only if it hasn't found the file in the checksum db
    unless(find_or_update($File::Find::name,$last_modif_time)) {
        #not in the db yet, so add it
        add_to_db(file2sha1($File::Find::name), $last_modif_time, -s $File::Find::name, $File::Find::name);
    }
}
for my $searched_dir_hash (@{ $config->{directories} }) {
    next unless -d $searched_dir_hash->{path};
    find(
        { wanted => sub { process_file($searched_dir_hash) } },
        $searched_dir_hash->{path}
    );
}
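#the delete_from_db stub in the first-run script was never completed; the sub below is
#one possible sketch of that idea, dropping rows whose files no longer exist on disk.
#(illustrative only - nothing above calls it, and the name cleanup_missing_files is made up)
sub cleanup_missing_files {
    my $names = $dbh->selectcol_arrayref("SELECT name FROM checksums");
    for my $name (@{$names}) {
        next if -e $name || -l $name; #still on disk (or at least a dangling symlink), keep the row
        $dbh->do("DELETE FROM checksums WHERE name=?", undef, $name);
    }
}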
####
#!/usr/bin/perl
use strict;
use warnings;
use DBI;
use DateTime;
use Data::Dumper;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","") or die $DBI::errstr;
open my $script, '>', 'duplicate_erase_script.sh' or die "cannot write duplicate_erase_script.sh: $!";
sub get_unique_checksums {
    #group by checksum and keep only the checksums shared by more than one name,
    #because groups of size 1 cannot have duplicates; the empty checksum is skipped
    #since that is what file2sha1 stores for directories (and unreadable files)
    my $sql = "SELECT checksum FROM checksums WHERE checksum != '' GROUP BY checksum HAVING COUNT(*) > 1";
    my $sth = $dbh->prepare($sql);
    $sth->execute;
    my $results = $sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
}
sub checksum2names {
    my ($checksum)=@_;
    my $sth = $dbh->prepare("SELECT name FROM checksums WHERE checksum=?");
    $sth->execute($checksum);
    my $results = $sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
}
for my $checksum (get_unique_checksums()) {
    my @same_checksum = checksum2names($checksum);
    my $leader = shift @same_checksum; #set one element of the group aside as the leader (the copy that is kept)
    print $script "# duplicates of $leader follow:\n";
    for my $name (@same_checksum) { #write (commented-out) commands that would delete all the others
        print $script "# rm $name\n";
    }
}
close $script;
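#with the data above, duplicate_erase_script.sh ends up looking something like this
#(the paths here are made up for illustration):
#  # duplicates of /usr/share/doc/foo/README follow:
#  # rm /usr/share/doc/foo-copy/README
#note that the rm commands are themselves written out commented, so nothing is deleted
#until the list has been reviewed and the leading "# " removed by hand.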