#!/usr/bin/perl -w use strict; my $file = ''; $file = shift @ARGV unless $file; unless ($file) { print "Please input a filename: "; chomp ($file = <>); } my %change = ( HEAL => 'NOTE Medical:', HIST => 'NOTE Biography:', EDUC => 'NOTE Educated:', RESI => 'NOTE Resided:', OCCU => 'NOTE Occupation:'); open (FILE, "<$file") or die "Unable to open $file: $!"; my @data = ; close FILE; my $change_this = join "|", keys %change; $change_this = qr/($change_this)/; my $found_sour = 0; my $last_line = ''; for (@data) { # make the IBM DOS to ASCII substitution s/1 CHAR IBM DOS/O CHAR ASCII/; s/$change_this/$change{$1}/g; $found_sour = 0 if m/^0/; $found_sour = 1 if m/^0.+SOUR\s*$/; if ($found_sour == 1) { $found_sour++ if s/1 NAME/1 TITL/; } elsif ($found_sour) { s/1 NAME/2 CONT/; } # fix the name problem if ($last_line =~ m/\bINDI\s*$/) { if (m/1\s*NAME/) { s|([\w.]+)|($1 eq "NAME")? $1 :ucfirst lc $1|eg; # the MacDonald fix, the \b (boundary) condition # and the quote check should limit appropriately # Names like Mace and Mack are treated as special # cases as this is the only way to deal with them s # substitute regex / # opening delimiter for regex (?; unless ($input =~ m/y/i) { print "Aborting!\n"; exit; } } open (FILE, ">$file") or die "Unable to create $file: $!"; print FILE @data; close FILE; print "Data written to $file OK!"; #### # here it is with comments (not updated from original draft) # there are two inconsistencies in what you ask for both # relating to the substitution of 1 NAME for 1 TEXT or is # it 2 TEXT or is it not wanted at all! In the spec you # specify after 0 @nnn@ SOUR that the first 1 NAME goes to # 1 TITL - this is consistent with sample. For the next # 1 NAME you suggest both 1 TEXT and 2 TEXT in different # examples. Unfortunately there is no n TEXT in the sample # output. you currently get 1 TEXT with instruction of how # to change this to 2 TEXT of 2 CONT as desired. #!/usr/bin/perl -w # note this code is only 30 or so lines long, all the # rest is comments for your benefit, like this. # lines that start with a # char are comments # like these lines, they are use to explain the code # the -w on the top 'shebang' line makes perl issue # warnings about probable errors in the code. on unix # the shebang line specifies the path to the perl # interpretter. On mac and windows it does not but # the -w is still looked for and acted on so we use it. # this line forces us to declare variables with the 'my' # declaration you see used below. amongst other things # it forces good coding practices or Perl complains use strict; # we declare a scalar variable called $file to hold the # name of the file we will munge (process) # the $ char lets Perl know we want a scalar variable to # hold a single value # we can set it to the name of a file (hard code the name) # here but for flexibility we don't we just set it to # a null string '' which along with the values 0 and # undef (undefined) equate to false in Perl. if a variable # holds one of these three values it is false otherwise it # is true my $file = ''; # we now set file to any command line argument which # we shift out of the @ARGV array that contains the # command line arguments (if if any) which are passed # to the script when you call it like this: # perl myscript.pl myfile_to_process # the unless $file bit means that we do this # unless $file holds a true value (set above) $file = shift @ARGV unless $file; # here we use an unless again. unless $file holds a true # value (from either of the options above) we execute the # block of code between the curly braces { ... } unless ($file) { # this line prints a prompt print "Please input a filename: "; # the <> is the input operator, used with nothing # between the < and the > it waits for input on # STDIN - the standard input ie the keyboard # we stick this input into $file and then chomp # off the newline that occurs when you hit enter chomp ($file = <>); } # we now declare an associative array (also called a hash). # the leading % char lets Perl know we want it to generate # a hash data structure. Hashes are very convenient as they # allow us to access a data value via a key # for example if we said print $change{'HIST'} perl looks # in the %change hash for the 'HIST' key and returns the # value 'Biography:' - without the quotes # you can probably see where this is going - we are going # to look for the keys and replace them with the values my %change = ( HIST => 'NOTE Biography:', EDUC => 'NOTE Educated:', RESI => 'NOTE Resided:', OCCU => 'NOTE Occupation:'); # here we open our file onto a filehandle called FILE # if this fails we die printing an error message # the $! special variable contains the error open (FILE, "<$file") or die "Unable to open $file: $!"; # here we read all the data in the file into an array # data structure called @data. the leading @ tells Perl # this is an array. you see we use the input operator # again but this time aimed at our file handle FILE # You have now met all the major Perl # data structures: # $ -> scalars which hold one value # % -> associative arrays that hold key value pairs # @ -> arrays that hold lists of values (lots of scalars) my @data = ; # this is self explanatory - we close the file handle # that we opened above with the close FILE; close FILE; # this next line is a little complex. first we declare # a new scalar var called $change_this. we then assign # the result of the join statement to this var. # what the join does it get all the keys from our # % change hash and join them together with the '|' char # thus the net result is that: # $change_this = 'HIST|EDUC|RESI|OCCU'; my $change_this = join "|", keys %change; # this bit changes $change_this into a compiled regular # expression that will match any one of HIST... # the brackets () also call parentheses/parenths are # inportant as when we match on of these four keys the # match will be stored in the magical perl variable $1 $change_this = qr/($change_this)/; # because 'SOUR' occurs in several contexts we define # a scalar variable $found_sour and set it to false. # we will use this as a flag for when we find a 0 @nn@ SOUR my $found_sour = 0; # the for (@data) iterates (loops) over the list of values # in the @data array. as a result each line of the source # file will be set to the magical $_ variable in turn. # $_ is magical in a number of ways. first if we change it # within this loop we will change the value stored in @data # second it is the default argument for regexes so if we # write m/^0/ what we are saying is match a '0' char at # the begining of the string in $_ (the ^ char specifies # the at the begining bit. similarly the s/this/that/ # rexex will substitute the string 'this' for the string # 'that' in $_ for (@data) { # this line does the search and replace substitutuion # for HIST.... as we capture the bit we matched in $1 # and this will be a key to our %change hash the # replacement will be the value of that key-value pair # the /g specifies do all cases in $_ without it only # the first match will get substituted - we don't # actually need this but I put it in anyway! s/$change_this/$change{$1}/g; # by observation a leading '0' specifies the begining # of a section in this file format so if we m/^0/ - # match a '0' at the begining of our line we reset # our $found_sour flag $found_sour = 0 if m/^0/; # here we set the found sour flag to true (1) if # we find a line that starts in 0 and ends in SOUR # the ^ says at the begining, the 0 is a literal '0' # the . matches anything and the + makes it match 1 or # more charachters of anything, the SOUR is a literal # 'SOUR', the \s* allows 0 or more trailing spaces, # and the $ pecifies the end of the string. phew $found_sour = 1 if m/^0.+SOUR\s*$/; # after the first SOUR we do this as $fouund_sour == 1 if ($found_sour == 1) { # we increment our flag count to 2 if we # succeed in substituting the literals as shown $found_sour++ if s/1 NAME/1 TITL/; } # same but this time second instance, this is in your # specification but not your sample output. if it is # wrong comment these three lines out by adding a # # character at the begining of the line like this: # elsif ($found_sour == 2) { # $found_sour++ if s/1 NAME/1 TEXT/; # } # the code above is a demo of how to kill this code # by commenting it out. you can change the 1 TEXT to # 2 TEXT depending on which you want - if either. elsif ($found_sour == 2) { $found_sour++ if s/1 NAME/1 TEXT/; } # this substitution will occur for further insances # of '1 NAME' as $found_sour will be true (it will be 3) elsif ($found_sour) { s/1 NAME/2 CONT/; } # the if/elsif/elsif structure means we may only do # one of these things on each line. we may also do # none if $found_sour is false ie == 0 } # this just prints out our munged data to a new file called # $file.new where $file is the name of the source file # as a result the output will be in a file of the same name # and in the same dir as the source but with a .new suffix open (FILE, ">$file.new") or die "Unable to create $file.new: $!"; print FILE @data; close FILE; # let the user know we are done print "Processed $file OK!"; #### 0 HEAD 1 SOUR REUNION 2 VERS V4.0 2 CORP Leister Productions 1 DEST ANSTFILE 1 DATE 8 JUL 2001 1 SUBM @S1@ 1 FILE Sample 1 GEDC 2 VERS 5.01 1 CHAR IBM DOS 0 @I1@ INDI 1 NAME Joe /SLAVEN/ 1 SEX M 1 BIRT 2 DATE 1945 2 SOUR @S4@ 2 PLAC Burwood, NSW 1 OCCU Engineer 1 EDUC Christian Bros 1 RESI Sydney, NSW; Townsville, QLD 1 NOTE Joe works as a planning manager in Queensland's largest electricity utility . He has worked there for 18 years. 1 HEAL Joe was recently hospitlised for atrium fibrillation in 2001 1 HIST Joe is named after both grandfathers 1 FAMS @F1@ 1 FAMC @F2@ 0 @I2@ INDI 1 NAME Janet /McDONALD/ 1 SEX F 1 BIRT 2 DATE 1948 2 PLAC Townsville 1 OCCU Librarian 1 EDUC St Pats College, The Strand, Townsville 1 RESI Tpownsville 1 FAMS @F1@ 0 @I3@ INDI 1 NAME Arthur /SLAVEN/ 1 SEX M 1 BIRT 2 DATE 1916 2 PLAC Koorawatha, NSW 1 DEAT 2 DATE 1956 2 PLAC Concord Hospital, Sydney 2 SOUR @S1@ 1 BURI 2 PLAC Rookwood Cemetery, NSW 1 OCCU Farmer, Bus driver 1 EDUC Crowther School 1 RESI Crowther, Sydney 1 NOTE On the death of his father, he took up an offer by his father's employer (Mr. Bragg, Crowther) for work on his farm 1 HEAL Arthur suffered from Hydatids cysts caught while in a WW2 training camp in WA. He had many major surgical operations before his death 1 FAMS @F2@ 0 @I4@ INDI 1 NAME Michael /SLAVEN/ 1 SEX M 1 BIRT 2 DATE 1972 2 PLAC Townsville, QLD 2 SOUR @S2@ 1 OCCU Plumber 1 EDUC Ignatius Park College, Townsville 1 RESI Townsville 1 FAMC @F1@ 0 @S1@ SUBM 1 NAME Joe SLAVEN 1 ADDR Townsville 2 CONT QLD 2 CONT Australiae 0 @S1@ SOUR 1 NAME NSW BDM Register 1 NAME Year 1956 1 NAME Name: Arthur SLAVEN 1 NAME Parents: Joseph SLAVEN & Mary FOGARTY 1 NAME Buried: Rookwood Cemetery 0 @S2@ SOUR 1 NAME NSW Birth Register 1 NAME Name: Michael SLAVEN 1 NAME Parents: Joseph and Janet 1 NAME Date 1972 1 NAME Baptised: 1972 1 NAME Location: Townsville General Hospital 1 NAME Transcript held by Joe Slaven 0 @S3@ SOUR 1 NAME QLD Marriage Register 1 NAME Year: 1970 1 NAME Location: St Joseph's , Townsville 0 @S4@ SOUR 1 NAME From information suppleid by Joe Slaven, June 2001 0 @F1@ FAM 1 HUSB @I1@ 1 WIFE @I2@ 1 CHIL @I4@ 1 MARR 2 DATE 1970 2 PLAC Townsville, QLD 2 SOUR @S3@ 0 @F2@ FAM 1 HUSB @I3@ 1 CHIL @I1@ 0 TRLR #### 0 HEAD 1 SOUR REUNION 2 VERS V4.0 2 CORP Leister Productions 1 DEST ANSTFILE 1 DATE 8 JUL 2001 1 SUBM @S1@ 1 FILE Sample 1 GEDC 2 VERS 5.01 1 CHAR IBM DOS 0 @I1@ INDI 1 NAME Joe /SLAVEN/ 1 SEX M 1 BIRT 2 DATE 1945 2 SOUR @S4@ 2 PLAC Burwood, NSW 1 OCCU Engineer 1 NOTE Educated: Christian Bros 1 NOTE Resided: Sydney, NSW; Townsville, QLD 1 NOTE Joe works as a planning manager in Queensland's largest electricity utility . He has worked there for 18 years. 1 NOTE Medical: Joe was recently hospitlised for atrium fibrillation in 2001 1 NOTE Biographical: Joe is named after both grandfathers 1 FAMS @F1@ 1 FAMC @F2@ 0 @I2@ INDI 1 NAME Janet /McDONALD/ 1 SEX F 1 BIRT 2 DATE 1948 2 PLAC Townsville 1 OCCU Librarian 1 NOTE Educated: St Pats College, The Strand, Townsville 1 NOTE Resided: Tpownsville 1 FAMS @F1@ 0 @I3@ INDI 1 NAME Arthur /SLAVEN/ 1 SEX M 1 BIRT 2 DATE 1916 2 PLAC Koorawatha, NSW 1 DEAT 2 DATE 1956 2 PLAC Concord Hospital, Sydney 2 SOUR @S1@ 1 BURI 2 PLAC Rookwood Cemetery, NSW 1 OCCU Farmer, Bus driver 1 NOTE Educated: Crowther School 1 NOTE Resided: Crowther, Sydney 1 NOTE On the death of his father, he took up an offer by his father's employer (Mr. Bragg, Crowther) for work on his farm 1 NOTE Medical: Arthur suffered from Hydatids cysts caught while in a WW2 training camp in WA. He had many major surgical operations before his death 1 FAMS @F2@ 0 @I4@ INDI 1 NAME Michael /SLAVEN/ 1 SEX M 1 BIRT 2 DATE 1972 2 PLAC Townsville, QLD 2 SOUR @S2@ 1 OCCU Plumber 1 NOTE Educated: Ignatius Park College, Townsville 1 NOTE Resided: Townsville 1 FAMC @F1@ 0 @S1@ SUBM 1 NAME Joe SLAVEN 1 ADDR Townsville 2 CONT QLD 2 CONT Australiae 0 @S1@ SOUR 1 TITL NSW BDM Register 2 CONT Year 1956 2 CONT Name: Arthur SLAVEN 2 CONT Parents: Joseph SLAVEN & Mary FOGARTY 2 CONT Buried: Rookwood Cemetery 0 @S2@ SOUR 1 TITL NSW Birth Register 2 CONT Name: Michael SLAVEN 2 CONT Parents: Joseph and Janet 2 CONT Date 1972 2 CONT Baptised: 1972 2 CONT Location: Townsville General Hospital 2 CONT Transcript held by Joe Slaven 0 @S3@ SOUR 1 TITL QLD Marriage Register 2 CONT Year: 1970 2 CONT Location: St Joseph's , Townsville 0 @S4@ SOUR 1 TITL From information suppleid by Joe Slaven, June 2001 0 @F1@ FAM 1 HUSB @I1@ 1 WIFE @I2@ 1 CHIL @I4@ 1 MARR 2 DATE 1970 2 PLAC Townsville, QLD 2 SOUR @S3@ 0 @F2@ FAM 1 HUSB @I3@ 1 CHIL @I1@ 0 TRLR