comment on

I currently have a requirement to measure disk usage on a NAS box. This is for the purposes of billing out customers. The filesystems are quite neatly organised into a tree of /fsnnn/customer/dept
So our Local Service Centre, for our IT centre, would be /fs006/ITC/LSC.
The problem is, that I need to know how much disk is used by 'ITC' as a whole (there would be other directories below for other depts), and also as each of the individual departments.
The code I have come up with is listed below, but I suffer from a major problem. Since I am having to access these filesystems via NFS, the workstation that I run this from gets horribly nailed whilst this script is running (due to the fact that I'm doing a very large number of NFS stat calls, which eats up kernel processor time).
Can anyone suggest 'optimisations' that I could make to reduce this problem? I've considered trying to parallelise the different filesystems, but the bottleneck is likely to be the network so that won't help overly.
Or alternatively point me at a solution that does this in an efficient fashion so I don't have to go re-inventing. (Yes, I know it's traditional to do a directory recursion algorithm as part of the perl 'growing up' process :))

#!/usr/bin/perl

use File::Find;
use FileHandle;
use strict;
use warnings;

# Program to process the filesystems on the Celerra and work out which
# departments own what.

# 5Mb. do_output() only reports directories whose rolled-up size is
# larger than this.
my $size_threshold = 5242880;

# Ownership map, one "path:customer:department" entry per line, e.g.:
# /fs001:SYS:Any
# /fs001/Tech/res:Tech:Research
# ...
my $config_file = "disk_usage.conf";

# Filesystem roots to scan.
my @dirs = ( "/fs001", "/fs002", "/fs003", "/fs004", "/fs005", "/fs006",
             "/fs007", "/fs008", "/fs009", "/fs010", "/fs011", "/fs012",
);
my $report_file = "disk_usage.html";
my $debug = 1;

# Number of directory levels dusage() walks one level at a time before it
# sizes each remaining subtree as a whole.
my $global_recurse = 2; #default value but used for some interesting calcs.

my @excl_directories = ( '.', '..' ); #cos they're unusual
my %is_excluded;
for ( @excl_directories ) { $is_excluded{$_} = 1; }

my %sizes;      # directory path => bytes, filled in by dusage()
my %customers;  # directory path => "customer:department", from the config
my %totals;     # customer => department => { du => bytes, dirs => [paths] }

# Format a byte count as a short human-readable string such as "1.50kb".
#
# Only the first argument is formatted; any further arguments are ignored
# (the previous loop-based version also returned on its first pass).
# An undefined value is treated as 0. Returns nothing when called with no
# arguments.
sub data
{
  return unless @_;

  my $factor = 1024;
  my @sequence = ('b ', 'kb', 'Mb', 'Gb', 'Tb' );

  my $output_number = $_[0];
  $output_number = 0 unless defined $output_number;

  # Step up one unit per factor of 1024, but never past the last unit we
  # have a name for: very large values are reported as a big number of Tb
  # rather than with a missing suffix.
  my $seq_num = 0;
  while ( $output_number >= $factor && $seq_num < $#sequence )
  {
    $seq_num++;
    $output_number /= $factor;
  }

  return sprintf("%3.2f%s", $output_number, $sequence[$seq_num]);
}

# Return the total size in bytes of everything found beneath the given
# paths, walking each tree with File::Find.
#
# Watch out for this. It's a _very_ expensive call: every entry in the
# tree is sized individually. Look out for optimisations lower down in
# the chain.
sub getsize
{
  my @args = @_;
  my $sum = 0;

  if ( $debug ) { print "Getting size for @args\n"; }

  find( sub {
    # Ask for the size once and reuse it, instead of testing -s and then
    # calling -s a second time to add it up.
    my $size = -s $_;
    $sum += $size if $size;
  }, @args );

  return $sum;
}

# Return the total size in bytes of the non-directory entries directly
# inside each of the given directories. Subdirectories are not descended
# into and contribute nothing.
#
# A directory that cannot be opened is reported on STDOUT and skipped.
sub get_size_of_files
{
  my @args = @_;
  my $sum = 0;
  if ( $debug ) { print "Sizing files in @args\n"; }
  foreach my $thisdir ( @args )
  {
    my $dh;
    unless ( opendir($dh, $thisdir) )
    {
      print "WARNING: Couldn't open $thisdir\n";
      next;
    }
    while ( defined( my $fname = readdir($dh) ) )
    {
      # One stat per entry: -d performs it, and the special "_" handle
      # lets -s reuse that result instead of going back to the server.
      next if -d "$thisdir/$fname";
      my $size = -s _;
      $sum += $size if $size;
    }
    closedir($dh);
  }
  return $sum;
}

# Record disk usage for $startpoint and its subdirectories in %sizes.
#
# Arguments:
#   $startpoint    - directory to measure (defaults to '.')
#   $recurse_depth - how many more levels to walk one directory at a time
#                    (defaults to 0)
#
# While $recurse_depth is above zero, $sizes{$startpoint} holds only the
# files directly in that directory and each subdirectory is handled by a
# recursive call. Once the depth is used up, $sizes{$startpoint} holds the
# size of the whole subtree as computed by getsize().
# Does nothing if $startpoint is not a directory.
sub dusage
{
  my $startpoint = shift(@_) || '.';
  my $recurse_depth = shift(@_) || 0;
  my @dusage_list;

  if ( $debug ) { print "Reading $startpoint\n"; }
  return unless -d $startpoint;

  # NOTE: an earlier version also required the directory to be larger
  # than $size_threshold before recursing; that check is disabled.
  if ( $recurse_depth-- > 0 )
  {
    $sizes{$startpoint} = get_size_of_files($startpoint);
    if ( $debug ) { print "adding $sizes{$startpoint} to $startpoint\n"; }

    # Lexical handle: each level of recursion gets its own, and it is
    # closed before we descend.
    my $dh;
    unless ( opendir($dh, $startpoint) )
    {
      print "WARNING: Couldn't open $startpoint\n";
      return;
    }
    while ( defined( my $filename = readdir($dh) ) )
    {
      # Check the exclusion list first so '.' and '..' cost no stat call.
      next if $is_excluded{$filename};
      if ( -d "$startpoint/$filename" )
      {
        push @dusage_list, "$startpoint/$filename";
      }
    }
    closedir($dh);

    for my $dir (@dusage_list)
    {
      dusage($dir, $recurse_depth);
    }
  }
  else
  {
    #only process the expensive bit, if we're not going to recurse 'deeper'
    $sizes{$startpoint} = getsize($startpoint);
  }
  return;
}


# Roll the per-directory figures in %sizes up the directory tree, then
# write an HTML report to $report_file and a matching listing to STDOUT.
#
# Argument: a path whose number of "/" characters is used as the zero
# point for indenting the directory listing.
#
# Side effects:
#   - %sizes is emptied by the roll-up.
#   - %totals is filled with per-customer / per-department usage, using
#     the ownership map in %customers. Each directory's own (not rolled
#     up) size is charged to the nearest ancestor listed in %customers,
#     or to "unknown:unknown" when there is none.
#   - If the report file cannot be opened, a warning is printed and only
#     the STDOUT listing is produced.
sub do_output
{
  my $base_indent = ( pop(@_) =~ tr,/,, );

  my $report;
  unless ( open ( $report, '>', $report_file ) )
  {
    print "WARNING: Couldn't open $report_file for writing: $!\n";
    undef $report;
  }
  # All report output goes through here so a missing report file does not
  # stop the console listing.
  my $to_report = sub { print {$report} @_ if $report; };

  my %output = %sizes;       # becomes the rolled-up (cumulative) sizes
  my %basic_sizes = %sizes;  # sizes exactly as measured, used for billing

  foreach my $item ( sort ( keys ( %sizes ) ) )
  {
    print ("directory size: $sizes{$item} = $item \n");
  }

  # Roll-up: on every pass each entry adds its size to its parent and is
  # then removed. The parent is (re)created in %sizes so that it passes
  # the amount on again in the next pass, until nothing is left.
  while ( keys(%sizes) )
  {
    foreach my $value ( sort ( keys ( %sizes ) ) )
    {
      # Parent path = path minus its last component, whatever characters
      # that component contains.
      my $upd = $value;
      $upd =~ s{/[^/]+\z}{};
      print "upd = $upd value = $value\n";
      if ( $upd ne $value )
      {
        $output{$upd} += $sizes{$value};
        $sizes{$upd} += $sizes{$value};
        if ( $debug ) { print "adding $value ( $sizes{$value} ) to $upd\n" }
      }
      delete($sizes{$value});
    }
  }
  if ( $debug )
  {
    foreach my $item ( sort ( keys ( %output ) ) )
    {
      print ("$output{$item} = $item \n");
    }
  }
  $to_report->("<TABLE BORDER=1>\n");

  foreach my $item ( sort ( keys ( %output ) ))
  {
    next unless $output{$item} > $size_threshold;

    # Last path component, with its leading "/".
    my $base_object = $item;
    $base_object =~ s,.*/,/,g;
    my $indent_depth = ( $item =~ tr,/,, ) - $base_indent;
    my $indent_html = '</TD><TD>' x $indent_depth;
    $indent_depth *= 8;

    #now we work out who 'owns' that data by doing substring matches
    #with the config array: try the full path first, then each parent.
    my @dir_list = split("/", $item);
    my $owner = "";
    while ( !$owner && @dir_list )
    {
      #the 'dir' to look for in the customers array.
      #might or might not have a trailing '/'
      my $srch_string = join("/", @dir_list);
      $srch_string =~ s,/$,,;   #strip trailing /
      if ( $customers{$srch_string} )
      {
        $owner = $customers{$srch_string};
      }
      pop (@dir_list);
    }
    if ( $debug ) { print "$owner\n"; }
    $to_report->("<TR><TD>", $indent_html);
    $to_report->(data($output{$item}));
    $to_report->("</TD><TD>", $base_object, "</TD></TR>\n");
    printf ("%${indent_depth}s", data($output{$item}));
    print ("\t $base_object\n");
    if ( !$owner ) { $owner = "unknown:unknown" };

    my ( $customer, $dept ) = split (":", $owner );
    $customer = "unknown" unless defined $customer && length $customer;
    $dept = "unknown" unless defined $dept && length $dept;

    # Parent entries that exist only because of the roll-up were never
    # measured themselves, so they contribute nothing of their own.
    my $own_size = $basic_sizes{$item} || 0;

    $totals{$customer}{'Total'}{'du'} += $own_size;
    push ( @{$totals{$customer}{'Total'}{'dirs'}}, $item );
    if ( $debug ) { print "$customer $dept = $own_size\n"; }
    $totals{$customer}{$dept}{'du'} += $own_size;
    push ( @{$totals{$customer}{$dept}{'dirs'}}, $item );
  }

  print "Listing by Customer and Department\n";
  $to_report->("</TABLE><BR><BR><TABLE BORDER=1><TR><TD>\n");
  $to_report->("Listing by Customer and Department\n");
  $to_report->("</TD></TR>");
  foreach my $customer ( sort ( keys ( %totals ) ) )
  {
    foreach my $dept ( sort ( keys ( %{$totals{$customer}} ) ) )
    {
      my $usage = data($totals{$customer}{$dept}{'du'});
      print "$customer $dept ", $usage, "\n";
      $to_report->("<TR><TD>$customer $dept ", $usage, "</TD></TR>\n");
    }
  }
  $to_report->("</TABLE>\n");

  if ( $report )
  {
    close ( $report )
      or print "WARNING: Couldn't finish writing $report_file: $!\n";
  }
  return;
}

#MAIN

#print get_size_of_files("test");

# Load the directory => "customer:department" ownership map, if a config
# file is present. Each line is "path:customer:department".
if ( -f $config_file )
{
  if ( open ( my $conf, '<', $config_file ) )
  {
    while ( my $line = <$conf> )
    {
      chomp $line;
      $line =~ s/\r\z//;          # tolerate DOS line endings
      next unless $line =~ /\S/;  # skip blank lines

      my ( $fs, $cust, $dept ) = split(":", $line);
      unless ( defined $fs && length $fs && defined $cust )
      {
        print "WARNING: Ignoring malformed line in $config_file: $line\n";
        next;
      }
      # A missing department would otherwise leave an undefined value in
      # the ownership string.
      $dept = "unknown" unless defined $dept;

      $customers{$fs} = join(":", $cust, $dept);
      if ( $debug ) { print "got $fs - $cust - $dept\n"; }
    }
    close ( $conf );
  }
  else
  {
    print "WARNING: Couldn't open $config_file: $!\n";
  }
}

if ( $debug ) { print keys ( %customers ); }

# Unbuffered STDOUT so progress messages appear as the scan runs.
STDOUT -> autoflush(1);

foreach my $dir ( @dirs )
{
  if ( $debug ) { print "\nSTARTING $dir\n"; }
  dusage ( $dir, $global_recurse );
}
do_output ( "/fs001" );
[download]

--
It's not pessimism if there is a worse option, it's not paranoia when they are and it's not cynicism when you're right.

In reply to Disk usage by customer by Preceptor

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.