# quick-sanename.pl use strict; use warnings; use feature qw[ state ]; use File::Copy qw[ move ]; @ARGV or die qq[Give a file name to sanitize.\n]; my $dry_run = 0; my $noisy = 1; my $lowercase = 1 ; my $for_windows = 1; my $clean_past_255 = 1; # General cleansing of base names. my %cleansed = run_cleanser( \&Cleanser::cleanse, @ARGV ); if ( $for_windows ) { if ( ! %cleansed ) { %cleansed = run_cleanser( \&Cleanser::cleanse_for_windows, @ARGV ); } else { # Work on the changes of general cleansing. while( my ( $old, $once ) = each %cleansed ) { my $again = Cleanser::cleanse_for_windows( $once ) or next; $cleansed{ $old } = $again; } # Take care of those which were skipped during general cleansing. my @todo = grep { ! exists $cleansed{ $_ } } @ARGV; my %win_cleansed = run_cleanser( \&Cleanser::cleanse_for_windows, @todo ); %cleansed = ( %cleansed, %win_cleansed ); } } %cleansed or die qq[No new file names were generated.\n]; # Move file. for my $old ( sort keys %cleansed ) { my $new = $cleansed{ $old }; if ( $noisy || $dry_run ) { printf qq['%s' -> '%s'\n] , $old, $new; } $dry_run and next; if ( -e $new ) { warn qq[Skipped rename of "$old", "$new" already exists.\n]; next; } if ( ! move( $old, $new ) ) { warn qq[Could not move "$old" to "$new": $!\n]; } } exit; sub run_cleanser { my ( $clean_type, @old_path ) = @_; @old_path or return (); my %out; for my $one ( @old_path ) { my $new = $clean_type->( $one ) or next; $out{ $one } = $new; } return %out; } BEGIN { package Cleanser; use File::Basename qw[ fileparse ]; use File::Spec::Functions qw[ canonpath catfile ]; sub path_if_diff { my ( $old, $dir, $cleaned_base ) = @_; $lowercase and $cleaned_base = lc $cleaned_base; my $new = canonpath( catfile( $dir, $cleaned_base ) ); return $old ne $new ? $new : undef; } # Returns a cleaned path if possible; else C. # # Substitues various characters with "_" as minimally as possible. sub cleanse { my ( $old_path ) = @_; # Yes, I do mean to keep any word & digit in any writing script (language). #state $alnum = 'a-zA-Z0-9'; state $alnum = '\w\d'; # quotemeta() does not escape "(" which causes warning that it would be # deprecated in 5.30. state $left_brace = '\\{'; state $covered = q/}()[]/; state $meta = $left_brace . quotemeta( qq/${covered}@/ ); state $punc = q/-=,._/; my $no_keep = qq/[^${punc}${alnum}${meta}]+/; $no_keep = qr/$no_keep/u; state $punc_or = join( q/|/, $left_brace, map { split '', quotemeta $_ } ( $covered, $punc ) ); state $many_seq = qr/[${punc}]{2,}/; state $pre_seq = qr/[${punc}]+_/; state $post_seq = qr/_[${punc}]+/; my ( $base, $dir ) = fileparse( $old_path ); for ( $base ) { s/$no_keep/_/g; # Collapse same. s/($punc_or)\1/$1/g; # Collapse any sequence. s/$pre_seq/_/g; s/$post_seq/_/g; s/$many_seq/_/g; } return path_if_diff( $old_path, $dir, $base ); } # Returns a cleaned path if possible; else C. # # It tries to keep a file path be a smaller set of characters for files on # Microsoft Windows. # # Nothing is replaced, only a warning is issued for file names that match ... # # CON, PRN, AUX, NUL, COM0, COM1, COM2, COM3, COM4, COM5, COM6, COM7, # COM8, COM9, LPT0, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, and # LPT9 # # See https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file that lists # ... # Use any character in the current code page for a name, including Unicode # characters and characters in the extended character set (128–255), except # for the following: # # The following reserved characters: # < (less than) # > (greater than) # : (colon) # " (double quote) # / (forward slash) # \ (backslash) # | (vertical bar or pipe) # ? (question mark) # * (asterisk) # # Integer value zero, sometimes referred to as the ASCII NUL # character. # # Characters whose integer representations are in the range from 1 # through 31, except for alternate data streams where these characters # are allowed # ... # Do not use the following reserved names for the name of a file: # # CON, PRN, AUX, NUL, COM0, COM1, COM2, COM3, COM4, COM5, COM6, COM7, # COM8, COM9, LPT0, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, and # LPT9. Also avoid these names followed immediately by an extension; for # example, NUL.txt and NUL.tar.gz are both equivalent to NUL. For more # information, see Namespaces. # # Do not end a file or directory name with a space or a period. Although # the underlying file system may support such names, the Windows shell and # user interface does not. However, it is acceptable to specify a period # as the first character of a name. For example, ".temp". # ... # sub cleanse_for_windows { my ( $old_path ) = @_; state $bad_char = q[<>:"|?*] . '\\' . join( q[], map { chr } 0..31 ) ; my %sub_replace = ( qr/[^\x00-\xff]+/ => q[^], q/(?:[.]|[ ]+)$/ => q[_], qq/[$bad_char]/ => q[-], ); my ( $base, $dir ) = fileparse( $old_path ); $base = prefix_windows_reserved( $base ); for ( $base ) { for my $found ( keys %sub_replace ) { my $repl = $sub_replace{ $found }; s{$found}{$repl}g; } } return path_if_diff( $old_path, $dir, $base ); } # Returns the base name prefixed with "_" if it matches a reserved word. sub prefix_windows_reserved { my ( $base ) = @_; # Prefix with "_". state $prefix = q[_]; state $reserved = join( q[|], qw[ CON PRN AUX NUL COM0 COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9 LPT0 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9 ] ); state $regex = qq/^( (?: $reserved )(?:[.].+)? )\$/; $base =~ s{$regex}{$prefix$1}xi; return $base; } }