#!/usr/bin/perl
use strict;
use warnings;

# Process the file from:
#   http://www.treasury.gov/ofac/downloads/sdnlist.txt
# This is about a 93K line file, which means that it easily fits into memory.
#
# To get the valid "records":
# (1) Separate the records based upon them having an extra \n between them.
#     The records are "paragraphs".
# (2) "Squeeze" the lines together so that hyphenated names will get put
#     "back together".  This is needed so that simple searches will work.
# (3) Apply heuristics to get rid of the extraneous records.
#     Here a "valid input record":
#     (a) can't start with [ and must
#     (b) have a comma or 'a.k.a.' in the first 50 characters
#     (c) get rid of a leading ' if it is there
#         (cannot get rid of ' globally because there are records where
#         it does have meaning).

# read_records($fh)
#
# Reads "\n\n"-separated paragraphs from the filehandle $fh and returns the
# list of paragraphs that pass the heuristics above, each one squeezed onto a
# single line (all newlines removed) and with one leading ' stripped.
sub read_records {
    my ($fh) = @_;

    # (1) One read per paragraph.  The separator is the literal string "\n\n",
    # not paragraph mode (""), so runs of several blank lines are not
    # collapsed; any empty leftovers are discarded by heuristic (b) below.
    local $/ = "\n\n";

    my @kept;
    while ( my $paragraph = <$fh> ) {

        # (2) Squeeze the wrapped lines back together.  Nothing is inserted
        # in place of the newline, so a name hyphenated across a line break
        # becomes one searchable token again.
        $paragraph =~ s/\n//g;

        # (3a) Heuristic: a record cannot start with '['.
        next if $paragraph =~ /^\s*\[/;

        # (3b) Heuristic (rule of thumb): a record has a comma or "a.k.a."
        # within its first 50 characters.
        next unless substr( $paragraph, 0, 50 ) =~ /,|\Qa.k.a.\E/;

        # (3c) Another heuristic: drop a single leading apostrophe only.
        $paragraph =~ s/^'//;

        push @kept, $paragraph;
    }

    return @kept;
}

my @records = read_records( \*DATA );

# At this point, there are <12K records
# from the 93K lines that we started with.
foreach my $record (@records) {

    # Your regex to select a record could maybe go here
    # (match against $record).
    # It is also possible to make a translation table
    # of any name back to one of these records.
    print "$record\n";
}

__DATA__
Output from: http://www.treasury.gov/ofac/downloads/sdnlist.txt goes here...

ALPHABETICAL LISTING OF SPECIALLY DESIGNATED NATIONALS AND BLOCKED PERSONS ("SDN List"):

This publication of Treasury's Office of Foreign Assets Control ("OFAC") is designed as a reference tool providing actual notice of actions by OFAC with respect to Specially Designated Nationals and

...blah...

17 NOVEMBER (a.k.a. EPANASTATIKI ORGANOSI 17 NOEMVRI; a.k.a. REVOLUTIONARY ORGANIZATION 17 NOVEMBER) [FTO] [SDGT]

32 COUNTY SOVEREIGNTY COMMITTEE (a.k.a. 32 COUNTY SOVEREIGNTY MOVEMENT; a.k.a. IRISH REPUBLICAN PRISONERS WELFARE ASSOCIATION; a.k.a. REAL IRA; a.k.a. REAL IRISH REPUBLICAN ARMY; a.k.a. REAL OGLAIGH NA HEIREANN; a.k.a. RIRA) [FTO] [SDGT]

32 COUNTY SOVEREIGNTY MOVEMENT (a.k.a. 32 COUNTY SOVEREIGNTY COMMITTEE; a.k.a. IRISH REPUBLICAN PRISONERS WELFARE ASSOCIATION; a.k.a. REAL IRA; a.k.a. REAL IRISH REPUBLICAN ARMY; a.k.a. REAL OGLAIGH NA HEIREANN; a.k.a. RIRA) [FTO] [SDGT]

101 DAYS CAMPAIGN (a.k.a. CHARITY COALITION; a.k.a. COALITION OF GOOD; a.k.a. ETELAF AL-KHAIR; a.k.a. ETILAFU EL-KHAIR; a.k.a. I'TILAF AL-KHAIR; a.k.a. I'TILAF AL-KHAYR; a.k.a. UNION OF GOOD), P.O. Box 136301, Jeddah 21313, Saudi Arabia [SDGT]

..etc..