#!/usr/bin/perl
use strict;
use warnings;

use IO::File;
use POSIX qw(:sys_wait_h :signal_h);

# Test driver: repeatedly launches a batch of child processes, records the
# rsh command line each child stands in for, and waits for the whole batch
# to be reaped by the SIGCHLD handler before starting the next batch.

# Per-child bookkeeping, keyed by child PID.  An entry is added by the
# parent in phase 2 and removed by handle_child() when that child is reaped.
# The values are the three opaque arguments passed to phase 2 (their meaning
# is not evident from this script).
our (%fhlist, %fhlist2, %fhlist3);

my $rounds         = 10;    # number of times to resubmit the batch
my $jobs_per_round = 24;    # number of subprocesses per batch

# Log of the rsh command lines the children represent.  Autoflush keeps
# buffered text from being duplicated or lost across fork/exec.
my $junk_file = 'test_rsh-commands.txt';
open(my $junk_fh, '>', $junk_file)
    or die "Can't open $junk_file for writing: $!";
$junk_fh->autoflush(1);

for my $round (1 .. $rounds) {
    # Phase 1: set up the child reaper.
    spawn_jobs(1, \&handle_child);

    for my $i (1 .. $jobs_per_round) {
        my $proc = "sp" . "$i";    # remote node to run command on
        my $cmd  = "date";         # simple test command

        # Phase 2: spawn the job.
        spawn_jobs(2, $proc, $cmd, 1, 2, 3);
    }

    # Phase 3: wait for the jobs to finish.
    spawn_jobs(3, 26);
}

# Closed once, after the last round, so that every round's children
# inherit an open handle.
close($junk_fh)
    or die "Can't close $junk_file: $!";

## BEGIN: Spawn children jobs on slave nodes ##

# spawn_jobs($phase, @args)
#
# Three-phase job launcher; the first argument selects the phase:
#
#   spawn_jobs(1, $handler)
#       Install $handler (a code reference, or the name of a sub) as the
#       SIGCHLD handler and turn on autoflush for the selected output handle.
#
#   spawn_jobs(2, $proc, $cmd, $nt, $mc, $um)
#       Fork a child that logs "/usr/bin/rsh $proc $cmd" and then execs
#       $cmd locally.  The parent records $nt, $mc and $um under the
#       child's PID in %fhlist, %fhlist2 and %fhlist3.  Dies if fork fails.
#
#   spawn_jobs(3, $sleep)
#       Block until %fhlist is empty, sleeping $sleep seconds between
#       checks (an arriving signal ends the sleep early).
#
# Any other phase value is ignored.
sub spawn_jobs {
    my ($phase, @args) = @_;

    if ($phase == 1) {    # Setup phase
        my ($sub) = @args;

        # Accept either a code ref or a sub name; \&{"name"} is permitted
        # under "strict refs".
        $SIG{CHLD} = ref($sub) eq 'CODE' ? $sub : \&{$sub};
        $| = 1;
    }
    elsif ($phase == 2) {    # Spawn the jobs phase
        my ($proc, $cmd, $nt, $mc, $um) = @args;

        # Hold SIGCHLD back until the new PID has been recorded.  Otherwise
        # a fast child could be reaped before its entry exists; the entry
        # would then never be deleted and phase 3 would wait forever.
        my $chld_set = POSIX::SigSet->new(SIGCHLD);
        my $old_set  = POSIX::SigSet->new;
        sigprocmask(SIG_BLOCK, $chld_set, $old_set)
            or die "Can't block SIGCHLD: $!";

        my $pid = fork();
        if (!defined $pid) {
            my $err = "$!";
            sigprocmask(SIG_SETMASK, $old_set);
            die "Can't fork: $err";
        }

        if ($pid == 0) {
            # Child process.  The signal mask survives exec, so restore it
            # before handing over to the command.
            sigprocmask(SIG_SETMASK, $old_set);

            print {$junk_fh} "/usr/bin/rsh $proc $cmd\n";

            # The real remote invocation is disabled, since not everyone
            # has 24 remote nodes to run on:
            # system("/usr/bin/rsh $proc $cmd");
            exec $cmd
                or warn "Can't exec '$cmd': $!\n";
            exit 1;    # only reached when exec failed
        }

        # Parent process: remember the child, then let SIGCHLD through.
        $fhlist{$pid}  = $nt;
        $fhlist2{$pid} = $mc;
        $fhlist3{$pid} = $um;

        sigprocmask(SIG_SETMASK, $old_set)
            or die "Can't restore signal mask: $!";
    }
    elsif ($phase == 3) {    # Wait till the children are done phase
        my ($sleep) = @args;

        # Entries disappear as the SIGCHLD handler reaps children.
        while (keys %fhlist) {
            sleep $sleep;    # mo' to do...
        }
    }

    return;
}

### END: Spawn children jobs on slave nodes ##

# handle_child()
#
# SIGCHLD handler.  Reaps every child that has exited and removes its
# entries from %fhlist, %fhlist2 and %fhlist3.
sub handle_child {
    # Keep the interrupted code's errno and child status intact.
    local ($!, $?);

    # This gets called when a child dies... maybe more than one died at the
    # same time, so it's best to do this in a loop.
    while ((my $dead_kid = waitpid(-1, WNOHANG)) > 0) {
        my $nt = delete $fhlist{$dead_kid};
        my $mc = delete $fhlist2{$dead_kid};
        my $um = delete $fhlist3{$dead_kid};

        # $nt, $mc and $um hold the values recorded for this child; nothing
        # is done with them here.
    }

    return;
}

__END__

Sample process listing that accompanied the script:

p243~/>ps -u user
  PID TTY          TIME CMD
10319 ?        00:00:00 tcsh
10320 ?        00:00:00 pbs_demux
10341 ?        00:00:00 439291.biobos.S
10367 ?        00:02:09 mubrex_mpi_biow
20933 ?        00:00:00 mubrex_mpi_biow
20934 ?        00:00:00 rsh
20935 ?        00:00:00 mubrex_mpi_biow
20936 ?        00:00:00 rsh
20937 ?        00:00:00 mubrex_mpi_biow
20938 ?        00:00:00 rsh
20939 ?        00:00:00 mubrex_mpi_biow
20940 ?        00:00:00 rsh
20941 ?        00:00:00 mubrex_mpi_biow
20942 ?        00:00:00 rsh
20944 ?        00:00:00 mubrex_mpi_biow
20946 ?        00:00:00 mubrex_mpi_biow
20947 ?        00:00:00 rsh
20948 ?        00:00:00 mubrex_mpi_biow
20949 ?        00:00:00 rsh
20950 ?        00:00:00 mubrex_mpi_biow
20951 ?        00:00:00 rsh
20952 ?        00:00:00 mubrex_mpi_biow
20953 ?        00:00:00 rsh
20954 ?        00:00:00 mubrex_mpi_biow
20955 ?        00:00:00 rsh
20956 ?        00:00:00 mubrex_mpi_biow
20958 ?        00:00:00 mubrex_mpi_biow
20945 ?        00:00:00 rsh
20957 ?        00:00:00 rsh
20959 ?        00:00:00 rsh
20960 ?        00:00:00 rsh
20961 ?        00:00:00 rsh
20962 ?        00:00:00 rsh
20963 ?        00:00:00 tcsh
20964 ?        00:00:00 rsh
20965 ?        00:00:00 rsh
20968 ?        00:00:00 rsh
20969 ?        00:00:00 rsh
20972 ?        00:00:00 rsh
20973 ?        00:00:00 rsh
20974 ?        00:00:00 rsh
20976 ?        00:00:00 rsh
20978 ?        00:00:00 rsh
20980 ?        00:00:00 rsh