package TLD; use strict; use vars qw ( @ISA @EXPORT_OK $TLD $ccTLD $SLD $VERSION ); $VERSION = "0.02"; require Exporter; @ISA = qw(Exporter); @EXPORT_OK = qw( $TLD $ccTLD $SLD valid_tld get_domain remove_tld ); $TLD = { com => 1, edu => 1, gov => 1, int => 1, mil => 1, net => 1, org => 1, arpa => 1, nato => 1, biz => 1, info => 1, name => 1, pro => 1, aero => 1, coop => 1, museum => 1, }; $ccTLD = { ac => 'Ascension Island', ad => 'Andorra', ae => 'United Arab Emirates', af => 'Afghanistan', ag => 'Antigua and Barbuda', ai => 'Anguilla', al => 'Albania', am => 'Armenia', an => 'Netherlands Antilles', ao => 'Angola', aq => 'Antarctica', ar => 'Argentina', as => 'American Samoa', at => 'Austria', au => 'Australia', aw => 'Aruba', az => 'Azerbaijan', ba => 'Bosnia and Herzegovina', bb => 'Barbados', bd => 'Bangladesh', be => 'Belgium', bf => 'Burkina Faso', bg => 'Bulgaria', bh => 'Bahrain', bi => 'Burundi', bj => 'Benin', bm => 'Bermuda', bn => 'Brunei Darussalam', bo => 'Bolivia', br => 'Brazil', bs => 'Bahamas', bt => 'Bhutan', bv => 'Bouvet Island', bw => 'Botswana', by => 'Belarus', bz => 'Belize', ca => 'Canada', cc => 'Cocos (Keeling) Islands', cd => 'Congo, Democratic Republic of the', cf => 'Central African Republic', cg => 'Congo, Republic of', ch => 'Switzerland', ci => 'Cote d\'Ivoire', ck => 'Cook Islands', cl => 'Chile', cm => 'Cameroon', cn => 'China', co => 'Colombia', cr => 'Costa Rica', cu => 'Cuba', cv => 'Cap Verde', cx => 'Christmas Island', cy => 'Cyprus', cz => 'Czech Republic', de => 'Germany', dj => 'Djibouti', dk => 'Denmark', dm => 'Dominica', do => 'Dominican Republic', dz => 'Algeria', ec => 'Ecuador', ee => 'Estonia', eg => 'Egypt', eh => 'Western Sahara', er => 'Eritrea', es => 'Spain', et => 'Ethiopia', fi => 'Finland', fj => 'Fiji', fk => 'Falkland Islands (Malvina)', fm => 'Micronesia, Federal State of', fo => 'Faroe Islands', fr => 'France', fx => 'France', ga => 'Gabon', gd => 'Grenada', ge => 'Georgia', gf => 'French Guiana', gg => 'Guernsey', gh => 'Ghana', gi => 'Gibraltar', gl => 'Greenland', gm => 'Gambia', gn => 'Guinea', gp => 'Guadeloupe', gq => 'Equatorial Guinea', gr => 'Greece', gs => 'South Georgia and the South Sandwich Islands', gt => 'Guatemala', gu => 'Guam', gw => 'Guinea-Bissau', gy => 'Guyana', hk => 'Hong Kong', hm => 'Heard and McDonald Islands', hn => 'Honduras', hr => 'Croatia/Hrvatska', ht => 'Haiti', hu => 'Hungary', id => 'Indonesia', ie => 'Ireland', il => 'Israel', im => 'Isle of Man', in => 'India', io => 'British Indian Ocean Territory', iq => 'Iraq', ir => 'Iran (Islamic Republic of)', is => 'Iceland', it => 'Italy', je => 'Jersey', jm => 'Jamaica', jo => 'Jordan', jp => 'Japan', ke => 'Kenya', kg => 'Kyrgyzstan', kh => 'Cambodia', ki => 'Kiribati', km => 'Comoros', kn => 'Saint Kitts and Nevis', kp => 'Korea, Democratic People\'s Republic', kr => 'Korea, Republic of', kw => 'Kuwait', ky => 'Cayman Islands', kz => 'Kazakhstan', la => 'Lao People\'s Democratic Republic', lb => 'Lebanon', lc => 'Saint Lucia', li => 'Liechtenstein', lk => 'Sri Lanka', lr => 'Liberia', ls => 'Lesotho', lt => 'Lithuania', lu => 'Luxembourg', lv => 'Latvia', ly => 'Libyan Arab Jamahiriya', ma => 'Morocco', mc => 'Monaco', md => 'Moldova, Republic of', mg => 'Madagascar', mh => 'Marshall Islands', mk => 'Macedonia, Former Yugoslav Republic', ml => 'Mali', mm => 'Myanmar', mn => 'Mongolia', mo => 'Macau', mp => 'Northern Mariana Islands', mq => 'Martinique', mr => 'Mauritania', ms => 'Montserrat', mt => 'Malta', mu => 'Mauritius', mv => 'Maldives', mw => 'Malawi', mx => 'Mexico', my => 'Malaysia', mz => 'Mozambique', na => 'Namibia', nc => 'New Caledonia', ne => 'Niger', nf => 'Norfolk Island', ng => 'Nigeria', ni => 'Nicaragua', nl => 'Netherlands', no => 'Norway', np => 'Nepal', nr => 'Nauru', nu => 'Niue', nz => 'New Zealand', om => 'Oman', pa => 'Panama', pe => 'Peru', pf => 'French Polynesia', pg => 'Papua New Guinea', ph => 'Philippines', pk => 'Pakistan', pl => 'Poland', pm => 'St. Pierre and Miquelon', pn => 'Pitcairn Island', pr => 'Puerto Rico', ps => 'Palestinian Territories', pt => 'Portugal', pw => 'Palau', py => 'Paraguay', qa => 'Qatar', re => 'Reunion Island', ro => 'Romania', ru => 'Russian Federation', rw => 'Rwanda', sa => 'Saudi Arabia', sb => 'Solomon Islands', sc => 'Seychelles', sd => 'Sudan', se => 'Sweden', sg => 'Singapore', sh => 'St. Helena', si => 'Slovenia', sj => 'Svalbard and Jan Mayen Islands', sk => 'Slovak Republic', sl => 'Sierra Leone', sm => 'San Marino', sn => 'Senegal', so => 'Somalia', sr => 'Suriname', st => 'Sao Tome and Principe', sv => 'El Salvador', sy => 'Syrian Arab Republic', sz => 'Swaziland', tc => 'Turks and Caicos Islands', td => 'Chad', tf => 'French Southern Territories', tg => 'Togo', th => 'Thailand', tj => 'Tajikistan', tk => 'Tokelau', tm => 'Turkmenistan', tn => 'Tunisia', to => 'Tonga', tp => 'East Timor', tr => 'Turkey', tt => 'Trinidad and Tobago', tv => 'Tuvalu', tw => 'Taiwan', tz => 'Tanzania', ua => 'Ukraine', ug => 'Uganda', uk => 'United Kingdom', um => 'US Minor Outlying Islands', us => 'United States', uy => 'Uruguay', uz => 'Uzbekistan', va => 'Holy See (City Vatican State)', vc => 'Saint Vincent and the Grenadines', ve => 'Venezuela', vg => 'Virgin Islands (British)', vi => 'Virgin Islands (USA)', vn => 'Vietnam', vu => 'Vanuatu', wf => 'Wallis and Futuna Islands', ws => 'Western Samoa', ye => 'Yemen', yt => 'Mayotte', yu => 'Yugoslavia', za => 'South Africa', zm => 'Zambia', zw => 'Zimbabwe', }; $SLD = { 'mx' => { 'org' => 'Para entidades sin fines de lucro mexicanas y personas físicas mexicanas', 'net' => 'Para proveedores de servicios de Internet localizados en México.', 'com' => 'Cualquier entidad', 'edu' => 'Para instituciones mexicanas de educación o investigación', 'gob' => 'Para instituciones u oficinas del Gobierno Mexicano (Federal, Estatal o Local)' }, 'jp' => { 'go' => 'Government organizations, Government-affiliated organizations', 'ed' => 'Schools including primary, junior and senior high schools', 'lg' => 'Local Government', 'gr' => 'Arbitrary organization', 'geo' => 'Geographic Type', 'ne' => 'Network services', 'or' => 'Foundation, Aggregate corporation', 'ac' => 'Universities, Technical schools, Incorporated schools', 'co' => 'Incorporated companies, Limited companies', 'ad' => 'Members of JPNIC' }, 'cn' => { 'aadn' => 'Administration Area Domain Name', 'org' => 'Nonprofit-making Organization', 'net' => 'Network Information Centre or Network Operation Centre', 'ac' => 'Institution of Science and Research', 'gov' => 'Government', 'edu' => 'Institution of Education', 'com' => 'Enterprise of Industry, Business, Finance etc.' }, 'uk' => { 'sch' => 'Schools', 'net' => 'Internet Service Providers', 'org' => 'Non-commercial organisations', 'mod' => 'Ministry of Defence Establishments', 'gov' => 'Government Bodies', 'police' => 'UK Police Forces', 'plc' => 'Registered company names only', 'me' => 'Personal domains', 'nhs' => 'NHS Organisations', 'ltd' => 'Registered company names only', 'co' => 'Commercial enterprises (the largest SLD in the UK)', 'ac' => 'Academic Establishments' }, 'nz' => { 'org' => 'Not-for-profit organisations', 'net' => 'Organisations and service providers directly related to the NZ Internet', 'gen' => 'Individuals and other organisations not covered elsewhere', 'govt' => 'National, regional and local government organisations operating with statutory powers', 'ac' => 'Tertiary educational institutions and related organisations', 'co' => 'Organisations pursuing commercial aims and purposes', 'school' => 'Primary, secondary and pre-schools and related organisations', 'iwi' => 'A traditional Maori tribe, mandated by the local Iwi Authority', 'cri' => 'Crown Research Institutes', 'maori' => 'A space for Indigenous-related entities (described here) that do not meet the .iwi.nz rule', 'mil' => 'Military organisations of the NZ Government' }, 'au' => { 'asn' => 'Incorporated bodies, political parties, trade unions, sporting and special interest clubs', 'otc' => 'Former OTC X.400 gateway. This 2LDs is no longer in general use.', 'org' => 'A catch-all, for registered organisations that do not fit within other 2LDs such as .com.au or .asn.au', 'net' => 'Commercial entities, such as companies (with ACN as registered through ASIC), and businesses (registered with state governments). The 2LD was formerly restricted to ISPs and other network service providers', 'gov' => 'Federal government bodies are registered below .gov.au. State or local government are registered in their state\'s 3LD, such as name.sa.gov.au', 'edu' => 'Educational institutions (Universities and other bodies of national significance are registered directly in this domain. State based bodies are registered in a state 3LD, for example name.nsw.edu.au)', 'info' => 'Used to identify major Australian information resources. This 2LDs is no longer in general use.', 'conf' => 'Short duration conference and exhibitions (once the conference or exhibition is finished the domain must be returned). This 2LDs is no longer in general use.', 'telememo' => 'A gateway to the X.400 email service. This 2LDs is no longer in general use.', 'csiro' => 'Units of the Commonwealth Scientific & Industry Research Organisation (CSIRO), the federal research & development organisation', 'gw' => 'Former AARNet gateway. This 2LDs is no longer in general use.', 'id' => 'Individuals who are Australian citizens or residents. The space was formerly distinguished with a set of third level domains named after flora and fauna (eg dropbear.id.au, echidna.id.au, emu.id.au)', 'com' => 'Commercial entities (The domain name must match the business name registered with the state/federal government or a registered Trade Mark or application for Trade Mark or otherwise have a \'close and substantial\' connection)', 'oz' => 'Mr Elz\'s machine at Melbourne University and a handful of other machines connected to ACSnet' }, 'in' => { 'firm' => '1', 'org' => '1', 'net' => '1', 'gov' => '1', 'ind' => '1', 'gen' => '1', 'co' => '1', 'ac' => '1', 'res' => '1', 'mil' => '1' } }; sub remove_tld { my ( $url ) = @_; return( [],[] ) unless $url; $url =~ s!^\w+://!!; my ($domain) = split '/', $url; # canonize domain name to lc all $domain = lc $domain; my @bits = split /\./, $domain; # remove top-level domain or country-code my @discard; if ( $TLD->{$bits[-1]} ) { push @discard, pop @bits; } elsif ( $ccTLD->{$bits[-1]} ) { my $cc = pop @bits; push @discard, $cc; # remove second-level domain for this country-code if ( $SLD->{$cc} and $SLD->{$cc}->{$bits[-1]} ) { unshift @discard, pop @bits; } else { # remove a top-level domain name in the second-level position; # this is our best guess for countries that have no $SLD entry above. if ( $TLD->{$bits[-1]} ) { unshift @discard, pop @bits; } } } else { # no valid TLD or ccTLD! return ([],[]); } return ( \@bits, \@discard ); } sub get_domain { my ($url, $flag) = @_; return '' unless $url; $flag ||= 0; $url =~ s!^\w+://!!; my ($domain) = (split '/', $url); # canonize domain name to lc all $domain = lc $domain; my @bits = split /\./, $domain; my @discard; # remove top-level domain or country-code if ( $TLD->{$bits[-1]} ) { unshift @discard, pop @bits; } elsif ( $ccTLD->{$bits[-1]} ) { my $cc = pop @bits; unshift @discard, $cc; # remove second-level domain for this country-code if ( $SLD->{$cc} and $SLD->{$cc}->{$bits[-1]} ) { unshift @discard, pop @bits; } else { # remove a top-level domain name in the second-level position; # this is our best guess for countries that have no $SLD entry above. if ( $TLD->{$bits[-1]} ) { unshift @discard, pop @bits; } } } else { # no valid TLD or ccTLD! Possibly a dot quad or just invalid.... return ''; } if ( $flag == 1 ) { # when flag=1, return the base domain name return $bits[-1] if $flag == 1; } elsif ( $flag == 2 ) { # when flag=2, return the (subdomain).domain part return wantarray ? @bits : (join '.', @bits) if $flag == 2; } else { # by default, flatten the domain and server name into a single word. # this is for our Bayes engine s/[^A-Za-z0-9]//g for @bits; return join '', @bits; } } sub valid_tld { my ( $domain ) = @_; return 0 unless $domain; $domain =~ s!^\w+://!!; ($domain) = split '/', $domain; my ( $tld ) = $domain =~ m!\.([^\.]+)$!; return 0 unless $tld; return 1 if exists $TLD->{$tld}; return 1 if exists $ccTLD->{$tld}; return 1 if $tld =~ m!^\d+$! and $tld > 0 and $tld < 256 and $_[0] =~ m!(?:^|/)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}!; return 0; } 1;