# here it is with comments (not updated from original draft) # there are two inconsistencies in what you ask for both # relating to the substitution of 1 NAME for 1 TEXT or is # it 2 TEXT or is it not wanted at all! In the spec you # specify after 0 @nnn@ SOUR that the first 1 NAME goes to # 1 TITL - this is consistent with sample. For the next # 1 NAME you suggest both 1 TEXT and 2 TEXT in different # examples. Unfortunately there is no n TEXT in the sample # output. you currently get 1 TEXT with instruction of how # to change this to 2 TEXT of 2 CONT as desired. #!/usr/bin/perl -w # note this code is only 30 or so lines long, all the # rest is comments for your benefit, like this. # lines that start with a # char are comments # like these lines, they are use to explain the code # the -w on the top 'shebang' line makes perl issue # warnings about probable errors in the code. on unix # the shebang line specifies the path to the perl # interpretter. On mac and windows it does not but # the -w is still looked for and acted on so we use it. # this line forces us to declare variables with the 'my' # declaration you see used below. amongst other things # it forces good coding practices or Perl complains use strict; # we declare a scalar variable called $file to hold the # name of the file we will munge (process) # the $ char lets Perl know we want a scalar variable to # hold a single value # we can set it to the name of a file (hard code the name) # here but for flexibility we don't we just set it to # a null string '' which along with the values 0 and # undef (undefined) equate to false in Perl. if a variable # holds one of these three values it is false otherwise it # is true my $file = ''; # we now set file to any command line argument which # we shift out of the @ARGV array that contains the # command line arguments (if if any) which are passed # to the script when you call it like this: # perl myscript.pl myfile_to_process # the unless $file bit means that we do this # unless $file holds a true value (set above) $file = shift @ARGV unless $file; # here we use an unless again. unless $file holds a true # value (from either of the options above) we execute the # block of code between the curly braces { ... } unless ($file) { # this line prints a prompt print "Please input a filename: "; # the <> is the input operator, used with nothing # between the < and the > it waits for input on # STDIN - the standard input ie the keyboard # we stick this input into $file and then chomp # off the newline that occurs when you hit enter chomp ($file = <>); } # we now declare an associative array (also called a hash). # the leading % char lets Perl know we want it to generate # a hash data structure. Hashes are very convenient as they # allow us to access a data value via a key # for example if we said print $change{'HIST'} perl looks # in the %change hash for the 'HIST' key and returns the # value 'Biography:' - without the quotes # you can probably see where this is going - we are going # to look for the keys and replace them with the values my %change = ( HIST => 'NOTE Biography:', EDUC => 'NOTE Educated:', RESI => 'NOTE Resided:', OCCU => 'NOTE Occupation:'); # here we open our file onto a filehandle called FILE # if this fails we die printing an error message # the $! special variable contains the error open (FILE, "<$file") or die "Unable to open $file: $!"; # here we read all the data in the file into an array # data structure called @data. the leading @ tells Perl # this is an array. you see we use the input operator # again but this time aimed at our file handle FILE # You have now met all the major Perl # data structures: # $ -> scalars which hold one value # % -> associative arrays that hold key value pairs # @ -> arrays that hold lists of values (lots of scalars) my @data = ; # this is self explanatory - we close the file handle # that we opened above with the close FILE; close FILE; # this next line is a little complex. first we declare # a new scalar var called $change_this. we then assign # the result of the join statement to this var. # what the join does it get all the keys from our # % change hash and join them together with the '|' char # thus the net result is that: # $change_this = 'HIST|EDUC|RESI|OCCU'; my $change_this = join "|", keys %change; # this bit changes $change_this into a compiled regular # expression that will match any one of HIST... # the brackets () also call parentheses/parenths are # inportant as when we match on of these four keys the # match will be stored in the magical perl variable $1 $change_this = qr/($change_this)/; # because 'SOUR' occurs in several contexts we define # a scalar variable $found_sour and set it to false. # we will use this as a flag for when we find a 0 @nn@ SOUR my $found_sour = 0; # the for (@data) iterates (loops) over the list of values # in the @data array. as a result each line of the source # file will be set to the magical $_ variable in turn. # $_ is magical in a number of ways. first if we change it # within this loop we will change the value stored in @data # second it is the default argument for regexes so if we # write m/^0/ what we are saying is match a '0' char at # the begining of the string in $_ (the ^ char specifies # the at the begining bit. similarly the s/this/that/ # rexex will substitute the string 'this' for the string # 'that' in $_ for (@data) { # this line does the search and replace substitutuion # for HIST.... as we capture the bit we matched in $1 # and this will be a key to our %change hash the # replacement will be the value of that key-value pair # the /g specifies do all cases in $_ without it only # the first match will get substituted - we don't # actually need this but I put it in anyway! s/$change_this/$change{$1}/g; # by observation a leading '0' specifies the begining # of a section in this file format so if we m/^0/ - # match a '0' at the begining of our line we reset # our $found_sour flag $found_sour = 0 if m/^0/; # here we set the found sour flag to true (1) if # we find a line that starts in 0 and ends in SOUR # the ^ says at the begining, the 0 is a literal '0' # the . matches anything and the + makes it match 1 or # more charachters of anything, the SOUR is a literal # 'SOUR', the \s* allows 0 or more trailing spaces, # and the $ pecifies the end of the string. phew $found_sour = 1 if m/^0.+SOUR\s*$/; # after the first SOUR we do this as $fouund_sour == 1 if ($found_sour == 1) { # we increment our flag count to 2 if we # succeed in substituting the literals as shown $found_sour++ if s/1 NAME/1 TITL/; } # same but this time second instance, this is in your # specification but not your sample output. if it is # wrong comment these three lines out by adding a # # character at the begining of the line like this: # elsif ($found_sour == 2) { # $found_sour++ if s/1 NAME/1 TEXT/; # } # the code above is a demo of how to kill this code # by commenting it out. you can change the 1 TEXT to # 2 TEXT depending on which you want - if either. elsif ($found_sour == 2) { $found_sour++ if s/1 NAME/1 TEXT/; } # this substitution will occur for further insances # of '1 NAME' as $found_sour will be true (it will be 3) elsif ($found_sour) { s/1 NAME/2 CONT/; } # the if/elsif/elsif structure means we may only do # one of these things on each line. we may also do # none if $found_sour is false ie == 0 } # this just prints out our munged data to a new file called # $file.new where $file is the name of the source file # as a result the output will be in a file of the same name # and in the same dir as the source but with a .new suffix open (FILE, ">$file.new") or die "Unable to create $file.new: $!"; print FILE @data; close FILE; # let the user know we are done print "Processed $file OK!";