#!/usr/bin/perl ###################################################################### +########################## # QUINLANS ALGORITHM FOR DECISION TREE # Author : Sandhya Manickavasagam email- smanicak@syr.edu ###################################################################### +########################## use DBI; use Tree::DAG_Node; use AI::DecisionTree; use Graph::Easy; $table ="income"; my $file_to_append = "C:\\Documents and Settings\\smanicka\\Desktop\\q +uinlans\\output_quinlans.txt"; open (MYFILE,">$file_to_append")or warn "Can't open file to append"; print MYFILE "\t\t\tCSE 787 - ANALYTICAL DATA MINING - PROJECT 2\n==== +===================================================================== +=======\n\n"; print MYFILE "\n\n\t\t\tQUINLAN'S DECISION TREE ALGORITHM\n----------- +--------------------------------------------------------------------- +-\n"; print MYFILE "\n\t\t\tName :\t Sandhya Manickavasagam\n\t\t\tSU-ID:660 +185882\n============================================================= +===================\n\n"; my $dtree = new AI::DecisionTree(noise_mode=>'pick_best'); $dbh = DBI->connect('dbi:ODBC:driver=microsoft access driver (*.mdb);d +bq=C:\Documents and Settings\smanicka\Desktop\quinlans\sample.mdb') o +r warn("Sorry,Cant connect to the table\n$DBI::errstr \n") or warn " +Cannot connect to database"; my %data; $query = qq(select count(*) from $table); $sth = $dbh->prepare($query) or print "cannot prepare"; $sth ->execute(); $count_row = $sth->fetchrow_array(); my $query = "select * from $table"; print MYFILE "\nExecuting $query \n"; $sth = $dbh->prepare($query) or print "cannot prepare"; $sth ->execute(); while (@rows=$sth->fetchrow_array()) { $count_col =0; foreach $row (@rows){ # print "$row,"; $count_col ++; } # print "\n"; } print MYFILE "There are $count_row rows of data in this table\n"; print MYFILE "\nThere are $sth->{NUM_OF_FIELDS} columns\n\nAssuming th +at the first column is the transaction number and hence not a part of + the data used for mining : \n\nThe columns under consideration are : +\n\n\t$sth->{NAME}->[1]\n\t$sth->{NAME}->[2]\n\t$sth->{NAME}->[3]\n\t +$sth->{NAME}->[4]\n\t$sth->{NAME}->[5]\n"; for (my $i=1;$i<$count_col;$i++){ push (@columns_list,"$sth->{NAME}->[$i]"); } print MYFILE "\nThe Distinct entries are : \n "; foreach $c (@columns_list){ print MYFILE "\n$c\n================\n"; $query = "select distinct $c from $table"; #print "$query\n"; $sth = $dbh->prepare($query) or print "cannot prepare"; $sth ->execute(); while (@rows = $sth->fetchrow_array()) { foreach $row (@rows){ print MYFILE "$row\n"; push((@{"$c"}),$row); } } } print MYFILE "\n"; foreach $c (@columns_list){ foreach $k ( @{"$c"}) { #print "$k \n"; $occ = 0; $query = qq(select $c from $table where $c = '$k'); $stmnt = qq(select $c from $table where $c = $k); $sth = $dbh->prepare($query) or print "cannot prepare\n"; $sth ->execute() or (($sth = $dbh->prepare($stmnt) or print "cannot pr +epare") and $sth ->execute()); while (@rows = $sth->fetchrow_array()) { $occ++; } print MYFILE "$k occurs $occ times\n"; $ratio = $occ/$count_row; #${$c}{"$k"} = $ratio ; $data{"$k"} = $ratio; } print MYFILE "\n"; } # print "\n"; #foreach $c (@columns_list){ #print "\n$c\n"; #foreach $r (@{"$c"}){ # print "$r \t"; #} #} print MYFILE "\nStoring the entries into a hash table\n--------------- +---------------------------------------\n"; while (($key, $value) = each(%data)){ print MYFILE $key.", ".$value."\n"; } # Calculate ENTROPY sub log2 { my $n = $_; return log($n)/log(2); } #gain of each column should be calculated and column with max gain sho +uld become root #my $root = Tree::DAG_Node->new(); #$root->name("Outlook"); #$new_daughter = Tree::DAG_Node->new(); # $new_daughter->name(""); # $root->add_daughter($new_daughter); # A set of training data for deciding whether to play $query = qq(select * from $table); print MYFILE "\nExecuting $query \n"; $sth = $dbh->prepare($query) or print "cannot prepare"; $sth ->execute(); while(@rows = $sth->fetchrow_array()){ #if($rows[4] eq 1){ #$rows[4] = "yes"; #} #else{ #$rows[4] = "no"; #} foreach $r (@rows){ print "$r \t\t"; } print "\n"; #} ###################################################################### +########## # For the Most part, the entire program will work for any given data.I +f a new # # table needs to be used, please change the mappings below to reflect +the new # # table values. + # ###################################################################### +########## $dtree->add_instance (attributes=> {Income=> qq($rows[1]), Student=> qq ($rows[2]), Credit_Rating=> qq($rows[3]) }, result => qq($rows[4])); } $dtree ->train(); @rules = $dtree->rule_statements(); print MYFILE "\n Mining the Data \n=================================== +=======\n"; $i =0; print MYFILE "\n\nDecision Rules \n=================================== +=============\n"; foreach $rule (@rules){ print MYFILE "$rule \n"; $i = $i + 1; @{"arr".$i} = split (/\s+/,$rule); foreach $a (@{"arr".$i}){ #print "$a \t"; if ($a eq "if") { next; } if ($a eq "and"){ last; } else{ if ($a =~ m/=/){ @ar = split (/=/,$a); $root_of_tree = "<root>$ar[0]</root> \n"; } } } } print MYFILE "\n\n DECISION TREE |N=================================== +====\n\n$root_of_tree\n"; print "\n$root_of_tree\n"; @rules = $dtree->rule_statements(); $k = 0; foreach $rule (@rules){ $k ++; } foreach $rules (@rules){ @rule_arr = split (/\s+/,$rules); for($t =0;$t< $#rule_arr;$t++){ my @node; if ($rule_arr[$t] =~m/=/){ @leaf = split (/\=/,$rule_arr[$t]); push(@node," <branch node = \"$leaf[0]\" attribute =\"$leaf[1]\" \/>") +; } if ($rule_arr[$t] =~ m/\-\>/){ print "\n <Class value>\"$rule_arr[$t+1]\"</Class value>\n\n"; print MYFILE "\n <Class value>\"$rule_arr[$t+1]\"</Class value>\n\n"; } foreach $n (@node){ print "$n \n"; print MYFILE "$n \n"; } } } #if ($rule_arr[$t] =~ m/\=/){ # @leaf = split (/\=/,$rule_arr); # print "Leaf node : $leaf[0]\n"; # } #} #}
[download]
1 	high 	no 	fair      No 		
2 	high 	no 	excellent No 		
3 	high 	no 	fair 	  Yes 		
4 	medium 	no 	fair 	  Yes 		
5 	low 	yes 	fair 	  Yes 		
6 	low 	yes 	excellent No 		
7 	low 	yes 	excellent Yes 		
8 	medium 	no 	fair 	  No 		
9 	low 	yes 	fair 	  Yes 		
10 	medium 	yes 	fair 	  Yes 		
11 	medium 	yes 	excellent Yes 		
12 	medium 	no 	excellent Yes 		
13 	high 	yes 	fair 	  Yes 		
14 	medium 	no 	excellent No