in reply to Stripping domain names from URLs

There is consistency with how domains are arranged vis TLDs and ccTLDs and SLDs but it varies by country. I whipped up this module some time ago and expect it does what you want. It has all the TLDs, ccTLDs, and the SLDs for the major and easily accessible ccTLDs. Sorry there are no docs but it is pretty simple to RTFS. It is basically 400 lines of data with about 50 lines of code at the end. get_domain( URL, FLAG ) is probably what you want. See the source for what the flags do but you need to pass either 1 or 2 to get the domain only or subdomain(s).domain respectively.

package TLD; use strict; use vars qw ( @ISA @EXPORT_OK $TLD $ccTLD $SLD $VERSION ); $VERSION = "0.02"; require Exporter; @ISA = qw(Exporter); @EXPORT_OK = qw( $TLD $ccTLD $SLD valid_tld get_domain remove_tld ); $TLD = { com => 1, edu => 1, gov => 1, int => 1, mil => 1, net => 1, org => 1, arpa => 1, nato => 1, biz => 1, info => 1, name => 1, pro => 1, aero => 1, coop => 1, museum => 1, }; $ccTLD = { ac => 'Ascension Island', ad => 'Andorra', ae => 'United Arab Emirates', af => 'Afghanistan', ag => 'Antigua and Barbuda', ai => 'Anguilla', al => 'Albania', am => 'Armenia', an => 'Netherlands Antilles', ao => 'Angola', aq => 'Antarctica', ar => 'Argentina', as => 'American Samoa', at => 'Austria', au => 'Australia', aw => 'Aruba', az => 'Azerbaijan', ba => 'Bosnia and Herzegovina', bb => 'Barbados', bd => 'Bangladesh', be => 'Belgium', bf => 'Burkina Faso', bg => 'Bulgaria', bh => 'Bahrain', bi => 'Burundi', bj => 'Benin', bm => 'Bermuda', bn => 'Brunei Darussalam', bo => 'Bolivia', br => 'Brazil', bs => 'Bahamas', bt => 'Bhutan', bv => 'Bouvet Island', bw => 'Botswana', by => 'Belarus', bz => 'Belize', ca => 'Canada', cc => 'Cocos (Keeling) Islands', cd => 'Congo, Democratic Republic of the', cf => 'Central African Republic', cg => 'Congo, Republic of', ch => 'Switzerland', ci => 'Cote d\'Ivoire', ck => 'Cook Islands', cl => 'Chile', cm => 'Cameroon', cn => 'China', co => 'Colombia', cr => 'Costa Rica', cu => 'Cuba', cv => 'Cap Verde', cx => 'Christmas Island', cy => 'Cyprus', cz => 'Czech Republic', de => 'Germany', dj => 'Djibouti', dk => 'Denmark', dm => 'Dominica', do => 'Dominican Republic', dz => 'Algeria', ec => 'Ecuador', ee => 'Estonia', eg => 'Egypt', eh => 'Western Sahara', er => 'Eritrea', es => 'Spain', et => 'Ethiopia', fi => 'Finland', fj => 'Fiji', fk => 'Falkland Islands (Malvina)', fm => 'Micronesia, Federal State of', fo => 'Faroe Islands', fr => 'France', fx => 'France', ga => 'Gabon', gd => 'Grenada', ge => 'Georgia', gf => 'French Guiana', gg => 'Guernsey', gh => 'Ghana', gi => 'Gibraltar', gl => 'Greenland', gm => 'Gambia', gn => 'Guinea', gp => 'Guadeloupe', gq => 'Equatorial Guinea', gr => 'Greece', gs => 'South Georgia and the South Sandwich Islands', gt => 'Guatemala', gu => 'Guam', gw => 'Guinea-Bissau', gy => 'Guyana', hk => 'Hong Kong', hm => 'Heard and McDonald Islands', hn => 'Honduras', hr => 'Croatia/Hrvatska', ht => 'Haiti', hu => 'Hungary', id => 'Indonesia', ie => 'Ireland', il => 'Israel', im => 'Isle of Man', in => 'India', io => 'British Indian Ocean Territory', iq => 'Iraq', ir => 'Iran (Islamic Republic of)', is => 'Iceland', it => 'Italy', je => 'Jersey', jm => 'Jamaica', jo => 'Jordan', jp => 'Japan', ke => 'Kenya', kg => 'Kyrgyzstan', kh => 'Cambodia', ki => 'Kiribati', km => 'Comoros', kn => 'Saint Kitts and Nevis', kp => 'Korea, Democratic People\'s Republic', kr => 'Korea, Republic of', kw => 'Kuwait', ky => 'Cayman Islands', kz => 'Kazakhstan', la => 'Lao People\'s Democratic Republic', lb => 'Lebanon', lc => 'Saint Lucia', li => 'Liechtenstein', lk => 'Sri Lanka', lr => 'Liberia', ls => 'Lesotho', lt => 'Lithuania', lu => 'Luxembourg', lv => 'Latvia', ly => 'Libyan Arab Jamahiriya', ma => 'Morocco', mc => 'Monaco', md => 'Moldova, Republic of', mg => 'Madagascar', mh => 'Marshall Islands', mk => 'Macedonia, Former Yugoslav Republic', ml => 'Mali', mm => 'Myanmar', mn => 'Mongolia', mo => 'Macau', mp => 'Northern Mariana Islands', mq => 'Martinique', mr => 'Mauritania', ms => 'Montserrat', mt => 'Malta', mu => 'Mauritius', mv => 'Maldives', mw => 'Malawi', mx => 'Mexico', my => 'Malaysia', mz => 'Mozambique', na => 'Namibia', nc => 'New Caledonia', ne => 'Niger', nf => 'Norfolk Island', ng => 'Nigeria', ni => 'Nicaragua', nl => 'Netherlands', no => 'Norway', np => 'Nepal', nr => 'Nauru', nu => 'Niue', nz => 'New Zealand', om => 'Oman', pa => 'Panama', pe => 'Peru', pf => 'French Polynesia', pg => 'Papua New Guinea', ph => 'Philippines', pk => 'Pakistan', pl => 'Poland', pm => 'St. Pierre and Miquelon', pn => 'Pitcairn Island', pr => 'Puerto Rico', ps => 'Palestinian Territories', pt => 'Portugal', pw => 'Palau', py => 'Paraguay', qa => 'Qatar', re => 'Reunion Island', ro => 'Romania', ru => 'Russian Federation', rw => 'Rwanda', sa => 'Saudi Arabia', sb => 'Solomon Islands', sc => 'Seychelles', sd => 'Sudan', se => 'Sweden', sg => 'Singapore', sh => 'St. Helena', si => 'Slovenia', sj => 'Svalbard and Jan Mayen Islands', sk => 'Slovak Republic', sl => 'Sierra Leone', sm => 'San Marino', sn => 'Senegal', so => 'Somalia', sr => 'Suriname', st => 'Sao Tome and Principe', sv => 'El Salvador', sy => 'Syrian Arab Republic', sz => 'Swaziland', tc => 'Turks and Caicos Islands', td => 'Chad', tf => 'French Southern Territories', tg => 'Togo', th => 'Thailand', tj => 'Tajikistan', tk => 'Tokelau', tm => 'Turkmenistan', tn => 'Tunisia', to => 'Tonga', tp => 'East Timor', tr => 'Turkey', tt => 'Trinidad and Tobago', tv => 'Tuvalu', tw => 'Taiwan', tz => 'Tanzania', ua => 'Ukraine', ug => 'Uganda', uk => 'United Kingdom', um => 'US Minor Outlying Islands', us => 'United States', uy => 'Uruguay', uz => 'Uzbekistan', va => 'Holy See (City Vatican State)', vc => 'Saint Vincent and the Grenadines', ve => 'Venezuela', vg => 'Virgin Islands (British)', vi => 'Virgin Islands (USA)', vn => 'Vietnam', vu => 'Vanuatu', wf => 'Wallis and Futuna Islands', ws => 'Western Samoa', ye => 'Yemen', yt => 'Mayotte', yu => 'Yugoslavia', za => 'South Africa', zm => 'Zambia', zw => 'Zimbabwe', }; $SLD = { 'mx' => { 'org' => 'Para entidades sin fines de lucro mexicanas y pe +rsonas físicas mexicanas', 'net' => 'Para proveedores de servicios de Internet locali +zados en México.', 'com' => 'Cualquier entidad', 'edu' => 'Para instituciones mexicanas de educación o inve +stigación', 'gob' => 'Para instituciones u oficinas del Gobierno Mexic +ano (Federal, Estatal o Local)' }, 'jp' => { 'go' => 'Government organizations, Government-affiliated o +rganizations', 'ed' => 'Schools including primary, junior and senior high + schools', 'lg' => 'Local Government', 'gr' => 'Arbitrary organization', 'geo' => 'Geographic Type', 'ne' => 'Network services', 'or' => 'Foundation, Aggregate corporation', 'ac' => 'Universities, Technical schools, Incorporated sch +ools', 'co' => 'Incorporated companies, Limited companies', 'ad' => 'Members of JPNIC' }, 'cn' => { 'aadn' => 'Administration Area Domain Name', 'org' => 'Nonprofit-making Organization', 'net' => 'Network Information Centre or Network Operation +Centre', 'ac' => 'Institution of Science and Research', 'gov' => 'Government', 'edu' => 'Institution of Education', 'com' => 'Enterprise of Industry, Business, Finance etc.' }, 'uk' => { 'sch' => 'Schools', 'net' => 'Internet Service Providers', 'org' => 'Non-commercial organisations', 'mod' => 'Ministry of Defence Establishments', 'gov' => 'Government Bodies', 'police' => 'UK Police Forces', 'plc' => 'Registered company names only', 'me' => 'Personal domains', 'nhs' => 'NHS Organisations', 'ltd' => 'Registered company names only', 'co' => 'Commercial enterprises (the largest SLD in the UK +)', 'ac' => 'Academic Establishments' }, 'nz' => { 'org' => 'Not-for-profit organisations', 'net' => 'Organisations and service providers directly rel +ated to the NZ Internet', 'gen' => 'Individuals and other organisations not covered +elsewhere', 'govt' => 'National, regional and local government organis +ations operating with statutory powers', 'ac' => 'Tertiary educational institutions and related org +anisations', 'co' => 'Organisations pursuing commercial aims and purpos +es', 'school' => 'Primary, secondary and pre-schools and relate +d organisations', 'iwi' => 'A traditional Maori tribe, mandated by the local + Iwi Authority', 'cri' => 'Crown Research Institutes', 'maori' => 'A space for Indigenous-related entities (descr +ibed here) that do not meet the .iwi.nz rule', 'mil' => 'Military organisations of the NZ Government' }, 'au' => { 'asn' => 'Incorporated bodies, political parties, trade un +ions, sporting and special interest clubs', 'otc' => 'Former OTC X.400 gateway. This 2LDs is no longer + in general use.', 'org' => 'A catch-all, for registered organisations that d +o not fit within other 2LDs such as .com.au or .asn.au', 'net' => 'Commercial entities, such as companies (with ACN + as registered through ASIC), and businesses (registered with state g +overnments). The 2LD was formerly restricted to ISPs and other networ +k service providers', 'gov' => 'Federal government bodies are registered below . +gov.au. State or local government are registered in their state\'s 3L +D, such as name.sa.gov.au', 'edu' => 'Educational institutions (Universities and other + bodies of national significance are registered directly in this doma +in. State based bodies are registered in a state 3LD, for example nam +e.nsw.edu.au)', 'info' => 'Used to identify major Australian information r +esources. This 2LDs is no longer in general use.', 'conf' => 'Short duration conference and exhibitions (once + the conference or exhibition is finished the domain must be returned +). This 2LDs is no longer in general use.', 'telememo' => 'A gateway to the X.400 email service. This +2LDs is no longer in general use.', 'csiro' => 'Units of the Commonwealth Scientific & Industr +y Research Organisation (CSIRO), the federal research & development o +rganisation', 'gw' => 'Former AARNet gateway. This 2LDs is no longer in +general use.', 'id' => 'Individuals who are Australian citizens or reside +nts. The space was formerly distinguished with a set of third level d +omains named after flora and fauna (eg dropbear.id.au, echidna.id.au, + emu.id.au)', 'com' => 'Commercial entities (The domain name must match +the business name registered with the state/federal government or a r +egistered Trade Mark or application for Trade Mark or otherwise have +a \'close and substantial\' connection)', 'oz' => 'Mr Elz\'s machine at Melbourne University and a h +andful of other machines connected to ACSnet' }, 'in' => { 'firm' => '1', 'org' => '1', 'net' => '1', 'gov' => '1', 'ind' => '1', 'gen' => '1', 'co' => '1', 'ac' => '1', 'res' => '1', 'mil' => '1' } }; sub remove_tld { my ( $url ) = @_; return( [],[] ) unless $url; $url =~ s!^\w+://!!; my ($domain) = split '/', $url; # canonize domain name to lc all $domain = lc $domain; my @bits = split /\./, $domain; # remove top-level domain or country-code my @discard; if ( $TLD->{$bits[-1]} ) { push @discard, pop @bits; } elsif ( $ccTLD->{$bits[-1]} ) { my $cc = pop @bits; push @discard, $cc; # remove second-level domain for this country-code if ( $SLD->{$cc} and $SLD->{$cc}->{$bits[-1]} ) { unshift @discard, pop @bits; } else { # remove a top-level domain name in the second-level posit +ion; # this is our best guess for countries that have no $SLD e +ntry above. if ( $TLD->{$bits[-1]} ) { unshift @discard, pop @bits; } } } else { # no valid TLD or ccTLD! return ([],[]); } return ( \@bits, \@discard ); } sub get_domain { my ($url, $flag) = @_; return '' unless $url; $flag ||= 0; $url =~ s!^\w+://!!; my ($domain) = (split '/', $url); # canonize domain name to lc all $domain = lc $domain; my @bits = split /\./, $domain; my @discard; # remove top-level domain or country-code if ( $TLD->{$bits[-1]} ) { unshift @discard, pop @bits; } elsif ( $ccTLD->{$bits[-1]} ) { my $cc = pop @bits; unshift @discard, $cc; # remove second-level domain for this country-code if ( $SLD->{$cc} and $SLD->{$cc}->{$bits[-1]} ) { unshift @discard, pop @bits; } else { # remove a top-level domain name in the second-level posit +ion; # this is our best guess for countries that have no $SLD e +ntry above. if ( $TLD->{$bits[-1]} ) { unshift @discard, pop @bits; } } } else { # no valid TLD or ccTLD! Possibly a dot quad or just invalid.... return ''; } if ( $flag == 1 ) { # when flag=1, return the base domain name return $bits[-1] if $flag == 1; } elsif ( $flag == 2 ) { # when flag=2, return the (subdomain).domain part return wantarray ? @bits : (join '.', @bits) if $flag == 2; } else { # by default, flatten the domain and server name into a single + word. # this is for our Bayes engine s/[^A-Za-z0-9]//g for @bits; return join '', @bits; } } sub valid_tld { my ( $domain ) = @_; return 0 unless $domain; $domain =~ s!^\w+://!!; ($domain) = split '/', $domain; my ( $tld ) = $domain =~ m!\.([^\.]+)$!; return 0 unless $tld; return 1 if exists $TLD->{$tld}; return 1 if exists $ccTLD->{$tld}; return 1 if $tld =~ m!^\d+$! and $tld > 0 and $tld < 256 and $_[0] = +~ m!(?:^|/)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}!; return 0; } 1;

cheers

tachyon

Replies are listed 'Best First'.
Re^2: Stripping domain names from URLs
by robharper (Pilgrim) on Sep 08, 2004 at 13:58 UTC

    Thankyou, tachyon, that looks very helpful.

    Thanks also to everyone else who commented -- I will take a closer look at URI::URL and at learning how to be more precise with defining my problems. :o)