#!/usr/bin/perl
#
# Log filter: reads lines from the files named on the command line (or from
# standard input when none are given) and, for every line that records a
# successful (status 200) GET of a path that is not filtered out below,
# prints "<id> <path>".
#
# <id> is a small number standing in for the second whitespace-separated
# field of the line (called $ip here -- presumably the client address;
# confirm against the actual log format).  The same field value always maps
# to the same id within one run.
use strict;
use warnings;

# Second log field => numeric id allocated for it.
my %id_for;

# The counter is pre-incremented on allocation, so the first id is 10001.
my $next_id = 10000;

# A request path matching any of these patterns is dropped.
my @skip_patterns = (
    # Skip directories
    qr{/$},                                 # Directory
    qr{/\?},                                # Directory with sort parms

    # Skip certain directories
    qr{^/(?:icons|misc|ports|src)/},

    # Skip certain file extensions
    qr{\.(?:rss|html|meta|readme)$},

    # Skip CPAN & distro maintenance stuff
    qr{CHECKSUMS$},
    qr{MIRRORING},

    # Module list stuff (literal substrings, hence \Q...\E)
    qr{\Q00whois.\E},
    qr{\Q01mailrc.\E},
    qr{\Q02packages.details\E},
    qr{\Q03modlist.\E},
);

LINE:
while ( my $line = <> ) {

    # Only lines of the form
    #   <field1> <field2> ... "GET <path> HTTP/x.y" 200
    # are of interest; anything else (other methods, other status codes,
    # malformed lines) is ignored.
    my ( $ip, $path )
        = $line =~ m{^\S+ (\S+) .+ "GET ([^"]+) HTTP/\d\.\d" 200}
        or next LINE;

    for my $skip (@skip_patterns) {
        next LINE if $path =~ $skip;
    }

    # Allocate an id the first time a field value is seen.  Ids are always
    # non-zero, so ||= cannot mistake an existing id for "missing".  This
    # runs only for lines that survive the filters, so fully filtered
    # clients never consume an id.
    my $id = ( $id_for{$ip} ||= ++$next_id );

    print "$id $path\n";
}