#!/bin/perl use v5.36; use strict; use warnings; use Getopt::Long qw/GetOptions/; my %files_with_matches = (); my %options = ( 'line_numbers' => 0, 'files_with_matches' => 0, 'print_filename' => -1, ); sub parse_args { GetOptions( 'line-numbers' => \$options{line_numbers}, 'files-with-matches' => \$options{files_with_matches}, 'print-filename!' => \$options{print_filename}, ); if (!@ARGV or $ARGV[0] eq "-") { $options{line_numbers} = 0; $options{files_with_matches} = 0; $options{print_filename} = 0; } else { my $total_files = scalar @ARGV; # -1 indicates this option was not modified. Fallback to # default values depending on how many files are provided. if ($options{print_filename} == -1) { $options{print_filename} = $total_files == 1 ? 0 : 1; } } } # see: https://www.rfc-editor.org/rfc/rfc3986#appendix-A my $hexdig = qr/[a-f0-9]/i; my $pct_encoded = qr/%${hexdig}{2}/; my $unreserved = qr/[a-z0-9\-\._~]/i; my $sub_delims = qr/[!\$&'\(\)\*\+,;=]/; my $pchar = qr/(${unreserved}|${pct_encoded}|${sub_delims}|:|\@)/n; my $dec_oct = qr((\d)|(\d\d)|(1\d\d)|(2[0-4]\d)|(25[0-5]))n; my $ipv4 = qr(${dec_oct}\.${dec_oct}\.${dec_oct}\.${dec_oct}); my $h16 = qr(${hexdig}{1,4}); my $ls32 = qr((${h16} : ${h16}) | ${ipv4})xn; my $ipv6 = qr( ( (${h16} :){6} ${ls32}) | ( :: (${h16} :){5} ${ls32}) | (( ${h16})? :: (${h16} :){4} ${ls32}) | (((${h16} :){0,1} ${h16})? :: (${h16} :){3} ${ls32}) | (((${h16} :){0,2} ${h16})? :: (${h16} :){2} ${ls32}) | (((${h16} :){0,3} ${h16})? :: (${h16} :){1} ${ls32}) | (((${h16} :){0,4} ${h16})? :: ${ls32}) | (((${h16} :){0,5} ${h16})? :: ${h16} ) | (((${h16} :){0,6} ${h16})? :: ) )xn; my $ipvf = qr/v${hexdig}{1,}\.(${unreserved}|${sub_delims}|:){1,}/xn; my $ip_literal = qr{\[(${ipv6}|${ipvf})\]}n; my $re = qr( \b( ([a-z][a-z0-9+-.]*) :// ( ( (${unreserved}|${pct_encoded}|${sub_delims}|:)*@ )? ( ${ip_literal} | ${ipv4} | (${unreserved}|${pct_encoded}|${sub_delims})* ) (:\d+)? ) ( (\/ ( ${pchar}+(\/ ${pchar}*)*)?) | (\/ ${pchar}*)* | (${pchar}+(\/ ${pchar}*)*) | (${pchar}{0}) ) (\?(${pchar}|\/|\?)*)? (\#(${pchar}|\/|\?)*)? )\b )xin; sub match_url { my $row = shift; my $file = shift; while ($row =~ /$re/g) { if ($options{files_with_matches}) { if (! $files_with_matches{$file}) { $files_with_matches{$file} = 1; say $file; } return; } print "$file " if $options{print_filename}; print "$. " if $options{line_numbers}; say "$&"; } } sub main { parse_args(); if (!@ARGV or $ARGV[0] eq "-") { chomp(@ARGV = ); match_url($_) for @ARGV; return; } # I went back to this form instead of while(<>) as I wanted to # provide the filename to the match_url subroutine. This was the # only way I knew how to do that. foreach (@ARGV) { open my $fh, '<', $_ or die("Error opening file $_ ($!)\n"); while (my $row = <$fh>) { chomp $row; match_url($row, $_); } close($fh); } } main();