#!/usr/bin/perl
#this script creates the database that the scripts below will use
#CAREFUL! IF A PREVIOUS DATABASE ALREADY EXISTS, IT WILL BE DELETED !!!
use strict;
use warnings;
use DBI;
unlink 'checksum_db.sqlite' if -e 'checksum_db.sqlite'; #remove any previous database instead of shelling out to rm
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","") or die $DBI::errstr;
$dbh->do(q{
    CREATE TABLE checksums (
        id                 INTEGER PRIMARY KEY,
        checksum           VARCHAR(42),
        size               INTEGER,
        last_date_modified DATE,
        name               VARCHAR(200) UNIQUE,
        is_dir             VARCHAR(1),
        is_file            VARCHAR(1),
        is_link            VARCHAR(1),
        UNIQUE (checksum, name)
    )
});
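#name is UNIQUE, so every path appears at most once; the UNIQUE (checksum, name)
#constraint also gives sqlite an index whose leftmost column is checksum, which the
#duplicate-report script below can use when grouping and looking up by checksum.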
####
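# config.yml - the configuration file the scripts below load via $config_path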
minsize: 64
directories:
  - path: /usr
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /home/spx2/perlhobby
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /lib
    dir: 1
    file: 1
    link: 0
    regex: .*
####
#!/usr/bin/perl
#this script is used only for the first run, to populate the database
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","") or die $DBI::errstr;
my $config_path = 'config.yml';
my $config = LoadFile($config_path);
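#with the config.yml above, $config is now a hash reference shaped roughly like this:
#  { minsize => 64,
#    directories => [
#      { path => '/usr', dir => 1, file => 1, link => 0, regex => '.*' },
#      ... one entry per configured directory
#    ] }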
#the is_dir/is_file/is_link columns record what kind of entry the name column refers to (directory, plain file or symlink)
sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    #placeholders let DBI quote the values, so names containing quotes or other special characters are safe
    $dbh->do(
        "INSERT INTO checksums (checksum,size,last_date_modified,name,is_dir,is_file,is_link)
         VALUES (?,?,?,?,?,?,?)",
        undef,
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
}
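#after add_to_db runs, a row in the checksums table looks something like this
#(the values below are purely illustrative):
#  1|da39a3ee...|4096|2009-08-15|/usr/bin/perl|N|Y|N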
sub delete_from_db { #remains to be completed
    my ($name)=@_;
}
sub file2sha1 {
    my $file=$_[0];
    return '' if -d $file; #directories get no checksum; still have to decide whether to prune directories that don't match the regex
    open my $f, '<', $file or return ''; #skip unreadable files instead of dying
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile($f);
    close $f;
    return $sha1->hexdigest;
}
sub process_file {
    my $dir_configs=$_[0];
    ##possible optimisation: run -d -l -f -s only once and reuse the results here and in add_to_db
    #if the current "file" (in the unix sense) is a directory and the yaml configuration
    #tells us to eliminate directories from the search, we do so by returning from the callback
    return if -d $File::Find::name && ! $dir_configs->{dir};
    return if -l $File::Find::name && ! $dir_configs->{link};
    return if -f $File::Find::name && ! $dir_configs->{file};
    return if -s $File::Find::name < $config->{minsize};
    unless($File::Find::name =~ /$dir_configs->{regex}/) {
        #prune directories that don't match the regex so find() does not descend into them
        $File::Find::prune=1 if -d $File::Find::name;
        return;
    }
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks)
        = stat($File::Find::name);
    my $last_modif_time = DateTime->from_epoch(epoch => $mtime);
    # printf "%s %s %s %s\n",
    #     $File::Find::name,
    #     file2sha1($File::Find::name),
    #     -s $File::Find::name,
    #     $last_modif_time;
    add_to_db(file2sha1($File::Find::name), $last_modif_time, -s $File::Find::name, $File::Find::name);
    #print Dumper $dir_configs;
}
for my $searched_dir_hash (@{ $config->{directories} }) {
    #skip the entry if the path is missing or is not a directory
    next unless -d $searched_dir_hash->{path};
    #pass the yml configuration for the currently searched directory to process_file
    find(
        { wanted => sub { process_file($searched_dir_hash) } },
        $searched_dir_hash->{path}
    );
}
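#optional sanity check after the first run (uses the schema created above);
#uncomment to print how many entries were indexed:
#my ($count) = $dbh->selectrow_array("SELECT COUNT(*) FROM checksums");
#printf "indexed %d entries\n", $count;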
####
#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","") or die $DBI::errstr;
my $config_path = 'config.yml';
my $config = LoadFile($config_path);
sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    #placeholders let DBI quote the values, so names containing quotes or other special characters are safe
    $dbh->do(
        "INSERT INTO checksums (checksum,size,last_date_modified,name,is_dir,is_file,is_link)
         VALUES (?,?,?,?,?,?,?)",
        undef,
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
}
sub update {
    my ($name,$checksum,$last_modif_time)=@_;
    #bind order matters: checksum and date first, then the name for the WHERE clause
    $dbh->do(
        "UPDATE checksums SET checksum=?, last_date_modified=? WHERE name=?",
        undef,
        $checksum,
        $last_modif_time->ymd,
        $name
    );
}
sub find_or_update {
    my ($name,$last_modif_time)=@_;
    my $s = $dbh->prepare("SELECT last_date_modified FROM checksums WHERE name=?");
    $s->execute($name);
    my $results = $s->fetchall_arrayref;
    if(@{$results}) {
        #found it in the db
        return 2 if $last_modif_time->ymd eq $results->[0]->[0]; #return 2 if the entry is up to date
        update($name, file2sha1($name), $last_modif_time);
        return 1; #the entry was out of date and has been refreshed
    }
    return 0; #the entry has not been found - the caller should add it
}
sub file2sha1 {
    my $file=$_[0];
    return '' if -d $file; #directories get no checksum; still have to decide whether to prune directories that don't match the regex
    open my $f, '<', $file or return ''; #skip unreadable files instead of dying
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile($f);
    close $f;
    return $sha1->hexdigest;
}
sub process_file {
    my $dir_configs=$_[0];
    return if -d $File::Find::name && ! $dir_configs->{dir};
    return if -l $File::Find::name && ! $dir_configs->{link};
    return if -f $File::Find::name && ! $dir_configs->{file};
    return if -s $File::Find::name < $config->{minsize};
    unless($File::Find::name =~ /$dir_configs->{regex}/) {
        $File::Find::prune=1 if -d $File::Find::name;
        return;
    }
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks)
        = stat($File::Find::name);
    my $last_modif_time = DateTime->from_epoch(epoch => $mtime);
    #find out if the entry needs an update and refresh it if necessary;
    #find_or_update returns 0 only if it hasn't found the file in the checksum db
    unless(find_or_update($File::Find::name,$last_modif_time)) {
        #not in the db yet, so add it
        add_to_db(file2sha1($File::Find::name), $last_modif_time, -s $File::Find::name, $File::Find::name);
    }
}
for my $searched_dir_hash (@{ $config->{directories} }) {
    next unless -d $searched_dir_hash->{path};
    find(
        { wanted => sub { process_file($searched_dir_hash) } },
        $searched_dir_hash->{path}
    );
}
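#the delete_from_db stub in the first-run script was never completed; the sub below is
#one possible sketch of that idea, dropping rows whose files no longer exist on disk.
#(illustrative only - nothing above calls it, and the name cleanup_missing_files is made up)
sub cleanup_missing_files {
    my $names = $dbh->selectcol_arrayref("SELECT name FROM checksums");
    for my $name (@{$names}) {
        next if -e $name || -l $name; #still on disk (or at least a dangling symlink), keep the row
        $dbh->do("DELETE FROM checksums WHERE name=?", undef, $name);
    }
}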
####
#!/usr/bin/perl
use strict;
use warnings;
use DBI;
use DateTime;
use Data::Dumper;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","") or die $DBI::errstr;
open my $script, '>', 'duplicate_erase_script.sh' or die "cannot write duplicate_erase_script.sh: $!";
sub get_unique_checksums {
    #group by checksum and keep only the checksums shared by more than one name,
    #because groups of size 1 cannot have duplicates; the empty checksum is skipped
    #since that is what file2sha1 stores for directories (and unreadable files)
    my $sql = "SELECT checksum FROM checksums WHERE checksum != '' GROUP BY checksum HAVING COUNT(*) > 1";
    my $sth = $dbh->prepare($sql);
    $sth->execute;
    my $results = $sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
}
sub checksum2names {
    my ($checksum)=@_;
    my $sth = $dbh->prepare("SELECT name FROM checksums WHERE checksum=?");
    $sth->execute($checksum);
    my $results = $sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
}
for my $checksum (get_unique_checksums()) {
    my @same_checksum = checksum2names($checksum);
    my $leader = shift @same_checksum; #set one element of the group aside as the leader (the copy that is kept)
    print $script "# duplicates of $leader follow:\n";
    for my $name (@same_checksum) { #write (commented-out) commands that would delete all the others
        print $script "# rm $name\n";
    }
}
close $script;
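#with the data above, duplicate_erase_script.sh ends up looking something like this
#(the paths here are made up for illustration):
#  # duplicates of /usr/share/doc/foo/README follow:
#  # rm /usr/share/doc/foo-copy/README
#note that the rm commands are themselves written out commented, so nothing is deleted
#until the list has been reviewed and the leading "# " removed by hand.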