(Looking at the HTML version is easier to parse mentally, ..., but it seemed like a harder problem initially.)
92861 APMA 109 0001 GI LC CALCULUS I 4.0 0900-0950 M W F OLS 011 OBERHAUSER JP 055 002 O 0830-0920 T OLS 005 90063 APMA 109 0002 GI LC CALCULUS I 4.0 1000-1050 M W F OLS 120 BECK M 055 004 O 0830-0920 R OLS 120 91589 APMA 109 0003 GI LC CALCULUS I 4.0 1100-1150 M W F OLS 120 BECK M 055 006 O 0830-0920 T MEC 205 93778 APMA 109 0004 GI LC CALCULUS I 4.0 1200-1250 M W F OLS 120 BECK M 055 004 O 0830-0920 T MEC 205
#!/usr/bin/perl
# parseCOD.perl
# Attempts to parse .txt COD files for UVa
#
# Reads the hard-coded listing webcod.enf.txt, skips its ten-line preamble
# and collects every course section it recognises into %courses.  Prints
# "Assigned N" for each section stored and dies on the first line that it
# cannot make sense of.

use strict;
use warnings;
use Data::Dumper;

# $courses{mnemonic}->[section] =
#   [ "Name", "ID", "Credit", "CurrEnroll", "MaxEnroll",
#     [start time], [end time], [days], [location], [instructor] ]
#
# So, for instance, to list the instructors of section 1 of SPAN411:
#   for (@{ $courses{SPAN411}->[1]->[9] }) { print }
#
my %courses;

my $file     = "webcod.enf.txt";
my $assigned = 1;    # running count shown in the "Assigned N" progress line

open my $cod, '<', $file     or die "No go fo' $file: $!\n";
open my $out, '>', 'out.txt' or die "No go fo' out.txt: $!\n";
# NOTE(review): nothing is ever written to out.txt; presumably a Data::Dumper
# dump of %courses was intended -- confirm before adding it.

# Don't need first ten lines ...
for (1 .. 10) { my $junk = <$cod>; }

# Section header: 5-digit id, mnemonic, 4-digit section, name, credits.
my $header_re = qr/(\d\d\d\d\d)\s((?:\W|\S){2,5}\S*?\d\d\d\w?)\s+(\d\d\d\d)\s(.*)(\d\.\d)/;

# First meeting line: start-end, days, location, instructor, then the two
# three-digit enrollment figures.
my $first_meeting_re = qr/\s*(\d+)-(\d+)\s*([MTWRFS ]+?)\s*(\w\w\w\s*\w?\d*\w?)\s*(\w*[,'-]?\w*\s*\w*)\s*(\d\d\d)\s*(\d\d\d)/;

# Additional meeting line: start-end, days, location and an optional
# instructor (no enrollment figures).
my $extra_meeting_re = qr/\s*(\d+)-(\d+)\s*([MTWRFS ]+?)\s*(\w\w\w\s*\w?\d*)\s*(\w*[,'-]?\w*\s*\w*)/;

# $line always holds the next line still to be examined, so a header that
# terminates one section's detail lines is parsed by the following pass
# instead of being thrown away.
my $line = <$cod>;

SECTION:
while (defined $line) {

    # Anything that does not look like a section header is skipped.
    if ($line !~ /\d\d\d\d\d/) {
        $line = <$cod>;
        next SECTION;
    }

    # Start of a course section
    my ($id, $mnemonic, $sect, $name, $credit) = $line =~ $header_re
        or die "Line not properly parsed! Choking, choking ... dead. Line:\n'$line'";

    # Remove spaces from the mnemonic ("APMA 109" -> "APMA109")
    $mnemonic =~ s/\s+//g;

    # Fresh containers for every section: references to them are stored in
    # %courses, so they must not be shared between sections.
    my (@start_times, @end_times, @days, @locations, @instructors);
    my ($max_enroll, $curr_enroll);

    # Now, get times, dates, etc.
    $line = <$cod>;
    defined $line
        or die "Oof! File ended right after the header of section $id\n";

    if ($line =~ /TBA/) {
        # No schedule yet: the section is stored with empty lists and
        # undefined enrollment figures.
        $line = <$cod>;
    }
    else {
        my @first = $line =~ $first_meeting_re
            or die "Oof! Malformed line. Line:\n'$line'";

        push @start_times, $first[0];
        push @end_times,   $first[1];
        push @days,        $first[2];
        push @locations,   $first[3];
        push @instructors, $first[4];
        ($max_enroll, $curr_enroll) = @first[5, 6];

        DETAIL:
        while (defined($line = <$cod>)) {

            # Next section header: leave it in $line for the outer loop.
            last DETAIL if $line =~ /\d\d\d\d\d/;

            # Marker and blank lines end this section's details; the marker
            # itself is dropped and scanning resumes on the line after it.
            if (   $line =~ /GRAD ENGR/
                or $line =~ /TBA/
                or $line =~ /RESTRICTED TO/
                or $line =~ /^$/)
            {
                $line = <$cod>;
                last DETAIL;
            }

            # The meeting pattern is tried first: the looser instructor
            # pattern below would also match a meeting line.
            if (my @extra = $line =~ $extra_meeting_re) {
                push @start_times, $extra[0];
                push @end_times,   $extra[1];
                push @days,        $extra[2];
                push @locations,   $extra[3];

                # A meeting without its own instructor inherits the first one.
                push @instructors,
                    length($extra[4]) ? $extra[4] : $instructors[0];
            }
            elsif ($line =~ /\s+([a-zA-Z]+\s+[a-zA-Z])/) {
                # A line carrying only an additional instructor.
                push @instructors, $1;
            }
            else {
                die "Doh! Silly out of format line. Line:\n'$line'";
            }
        }
    }

    # $sect is a string such as "0001"; used as an array index it is 1.
    $courses{$mnemonic}->[$sect] = [
        $name, $id, $credit, $curr_enroll, $max_enroll,
        \@start_times, \@end_times, \@days, \@locations, \@instructors,
    ];
    print "Assigned ", $assigned++, "\n";
}

close $cod;
close $out or die "Cannot close out.txt: $!\n";
In reply to Parsing COD text help by dimmesdale
| For: | Use: | ||
| & | &amp; | ||
| < | &lt; | ||
| > | &gt; | ||
| [ | &#91; | ||
| ] | &#93; |