#!/usr/bin/perl
################################################################################################
# QUINLANS ALGORITHM FOR DECISION TREE
# Author : Sandhya Manickavasagam email- smanicak@syr.edu
################################################################################################
use DBI;
use Tree::DAG_Node;
use AI::DecisionTree;
use Graph::Easy;
# ---------------------------------------------------------------------------
# Setup: table name, report file, report header, decision tree, DB connection.
# $table, $dbh, MYFILE and $dtree are used by every later section of the script.
# ---------------------------------------------------------------------------

# Name of the database table that holds the training data.
$table = "income";

# Report file. It is opened with '>' and is therefore rewritten on every run.
my $file_to_append = "C:\\Documents and Settings\\smanicka\\Desktop\\quinlans\\output_quinlans.txt";

# Three-argument open keeps the path from being interpreted as part of the
# mode. A failure is reported together with the OS error but is not fatal;
# the run then continues without a report file.
open(MYFILE, '>', $file_to_append)
    or warn "Can't open $file_to_append for writing: $!\n";

print MYFILE "\t\t\tCSE 787 - ANALYTICAL DATA MINING - PROJECT 2\n================================================================================\n\n";
print MYFILE "\n\n\t\t\tQUINLAN'S DECISION TREE ALGORITHM\n---------------------------------------------------------------------------------\n";
print MYFILE "\n\t\t\tName :\t Sandhya Manickavasagam\n\t\t\tSU-ID:660185882\n================================================================================\n\n";

# Direct method call instead of the indirect-object form "new Class(...)".
my $dtree = AI::DecisionTree->new(noise_mode => 'pick_best');

# Everything below needs a live handle, so a failed connect is fatal here
# rather than surfacing later as a method call on an undefined value.
$dbh = DBI->connect('dbi:ODBC:driver=microsoft access driver (*.mdb);dbq=C:\Documents and Settings\smanicka\Desktop\quinlans\sample.mdb')
    or die "Sorry,Cant connect to the table\n$DBI::errstr \n";
# ---------------------------------------------------------------------------
# Table overview: number of rows, number of columns, and the list of columns
# used for mining (every column except the first one).
# Sets: $count_row, $count_col, @columns_list. Declares %data for later use.
# ---------------------------------------------------------------------------
my %data;

# Row count. The list assignment forces list context; DBI leaves the scalar
# context result of fetchrow_array unspecified (first or last column).
my $query = qq(select count(*) from $table);
$sth = $dbh->prepare($query)
    or die "cannot prepare '$query': " . $dbh->errstr . "\n";
$sth->execute()
    or die "cannot execute '$query': " . $sth->errstr . "\n";
($count_row) = $sth->fetchrow_array();
$sth->finish();

# Column metadata. It is read straight after execute() instead of walking
# over every row, so it is also available when the table is empty.
$query = "select * from $table";
print MYFILE "\nExecuting $query \n";
$sth = $dbh->prepare($query)
    or die "cannot prepare '$query': " . $dbh->errstr . "\n";
$sth->execute()
    or die "cannot execute '$query': " . $sth->errstr . "\n";
$count_col = $sth->{NUM_OF_FIELDS};
my @column_names = @{ $sth->{NAME} };
$sth->finish();

# Column 0 is assumed to be the transaction number and is skipped.
@columns_list = @column_names[ 1 .. $#column_names ];

print MYFILE "There are $count_row rows of data in this table\n";
print MYFILE "\nThere are $count_col columns\n\nAssuming that the first column is the transaction number and hence not a part of the data used for mining : \n\nThe columns under consideration are :\n\n";
# One line per mined column, however many columns the table has.
print MYFILE "\t$_\n" for @columns_list;
# ---------------------------------------------------------------------------
# Distinct values per column and their relative frequency.
#
# One "GROUP BY" query per column returns each distinct value together with
# its number of occurrences. No data value is ever interpolated into SQL, so
# quoting (text vs. numeric columns, embedded apostrophes) is not an issue.
#
# Reads : @columns_list, $table, $count_row, $dbh
# Writes: %data (value => occurrences / $count_row), report lines to MYFILE
#
# NOTE(review): %data is keyed by the value alone, so a value that appears in
# two different columns keeps only the ratio of the column processed last.
# ---------------------------------------------------------------------------

# column name => [ [ value, occurrences ], ... ] in the order the DB returned
my %value_counts_of;

print MYFILE "\nThe Distinct entries are : \n ";
foreach my $column (@columns_list) {
    print MYFILE "\n$column\n================\n";

    my $group_query = "select $column, count(*) from $table group by $column";
    my $group_sth   = $dbh->prepare($group_query)
        or die "cannot prepare '$group_query': " . $dbh->errstr . "\n";
    $group_sth->execute()
        or die "cannot execute '$group_query': " . $group_sth->errstr . "\n";

    # The list assignment is true as long as a row was returned, even when
    # the value itself is NULL/undef.
    while ( my ( $value, $occurrences ) = $group_sth->fetchrow_array() ) {
        print MYFILE "$value\n";
        push @{ $value_counts_of{$column} }, [ $value, $occurrences ];
    }
}

print MYFILE "\n";
foreach my $column (@columns_list) {
    foreach my $entry ( @{ $value_counts_of{$column} || [] } ) {
        my ( $value, $occurrences ) = @{$entry};
        print MYFILE "$value occurs $occurrences times\n";

        # Guard against a zero/undefined row count instead of dying on a
        # division by zero.
        $data{"$value"} = $count_row ? $occurrences / $count_row : 0;
    }
    print MYFILE "\n";
}
# print "\n";
#foreach $c (@columns_list){
#print "\n$c\n";
#foreach $r (@{"$c"}){
# print "$r \t";
#}
#}
# Write every "value, ratio" pair held in %data to the report, one per line,
# in the hash's own iteration order.
print MYFILE "\nStoring the entries into a hash table\n------------------------------------------------------\n";
foreach my $entry_name ( keys %data ) {
    print MYFILE "$entry_name, $data{$entry_name}\n";
}
# log2($n)
#
# Base-2 logarithm, the building block for entropy calculations.
#
#   $n      - a positive number
#   returns - log($n) / log(2)
#
# Perl's built-in log() dies for $n <= 0, and so does this sub.
sub log2 {
    # The argument arrives in @_; $_ is unrelated to the call and must not
    # be read here.
    my ($n) = @_;
    return log($n) / log(2);
}
#gain of each column should be calculated and column with max gain should become root
#my $root = Tree::DAG_Node->new();
#$root->name("Outlook");
#$new_daughter = Tree::DAG_Node->new();
# $new_daughter->name("");
# $root->add_daughter($new_daughter);
# Training data: every row of $table is fed to the decision tree as one instance
# Read the whole table again, echo each record to STDOUT and register it with
# the decision tree as one training instance.
$query = qq(select * from $table);
print MYFILE "\nExecuting $query \n";
$sth = $dbh->prepare($query) or print "cannot prepare";
$sth->execute();
while ( my @record = $sth->fetchrow_array() ) {

    # Console echo of the raw record, one field after another.
    print "$_ \t\t" for @record;
    print "\n";

    ############################################################################
    # The rest of the script is table-agnostic. This mapping is not: when a    #
    # different table is mined, adjust the attribute names and the column      #
    # positions below so that they describe the new table.                     #
    ############################################################################
    my $attributes = {
        Income        => "$record[1]",
        Student       => "$record[2]",
        Credit_Rating => "$record[3]",
    };
    $dtree->add_instance(
        attributes => $attributes,
        result     => "$record[4]",
    );
}
# ---------------------------------------------------------------------------
# Train the tree, list its rule statements in the report and work out which
# attribute the tree splits on first (the root).
#
# Each rule is split on whitespace. The first condition of a rule is the
# token that follows "if"; the attribute name is the part before its "=".
#
# Writes: @rules, $root_of_tree (attribute name followed by " \n")
# ---------------------------------------------------------------------------
$dtree->train();
@rules = $dtree->rule_statements();

print MYFILE "\n Mining the Data \n==========================================\n";
print MYFILE "\n\nDecision Rules \n================================================\n";

# Stays empty when no rule contains a condition.
$root_of_tree = '';
foreach my $rule (@rules) {
    print MYFILE "$rule \n";

    TOKEN:
    foreach my $token ( split /\s+/, $rule ) {
        next TOKEN if $token eq "if";

        # Only the first condition matters. Stopping at "->" as well keeps a
        # result value that contains "=" from being taken for a condition.
        last TOKEN if $token eq "and" || $token eq "->";

        if ( $token =~ m/=/ ) {
            my ($attribute) = split /=/, $token;
            $root_of_tree = "$attribute \n";
        }
    }
}

print MYFILE "\n\n DECISION TREE \n=======================================\n\n$root_of_tree\n";
print "\n$root_of_tree\n";
# ---------------------------------------------------------------------------
# Print the leaves of the tree to STDOUT and to the report.
#
# Each rule is split on whitespace. For every token except the last one:
#   * a token containing "->" is followed by the rule's result, which is
#     printed in double quotes;
#   * a token containing "=" (a condition) produces one spacer line.
#
# NOTE(review): conditions are shown only as spacer lines, not as text;
# confirm whether the attribute/value pair was meant to be printed here.
# ---------------------------------------------------------------------------
@rules = $dtree->rule_statements();

foreach my $rule (@rules) {
    my @tokens = split /\s+/, $rule;

    # The last token is never inspected on its own; it is only reached as
    # the "next token" of an arrow.
    foreach my $index ( 0 .. $#tokens - 1 ) {
        my $token = $tokens[$index];

        if ( $token =~ m/\-\>/ ) {
            my $leaf_text = "\n \"$tokens[$index + 1]\"\n\n";
            print $leaf_text;
            print MYFILE $leaf_text;
        }

        if ( $token =~ m/=/ ) {
            print "  \n";
            print MYFILE "  \n";
        }
    }
}

# Release resources. Buffered write errors only show up when the report file
# is closed, so the result of close() is checked.
close(MYFILE) or warn "Can't close report file: $!\n";
$dbh->disconnect() if $dbh;
#if ($rule_arr[$t] =~ m/\=/){
# @leaf = split (/\=/,$rule_arr);
# print "Leaf node : $leaf[0]\n";
# }
#}
#}