A program written in a hurry some time ago to munge file paths generally for file systems for Unix(-like) OSen & specifically for FAT32.
Learned the hard way that NTFS would allow file names to be written to FAT32 even if some characters are outside of FAT32 specification. Problematic characters seemed to be en- & em-dash, fancy quotes, pipe, Unicode "?", & possibly few others (web pages saved with title as the file name). Mounting FAT32 file system on FreeBSD with specific codepage(s), or "nowin95" or "shortnames" mount options did not help (mount_msdosfs(8)). Munging it was then🤷🏽♂️
# quick-sanename.pl use strict; use warnings; use feature qw[ state ]; use File::Copy qw[ move ]; @ARGV or die qq[Give a file name to sanitize.\n]; my $dry_run = 0; my $noisy = 1; my $lowercase = 1 ; my $for_windows = 1; my $clean_past_255 = 1; # General cleansing of base names. my %cleansed = run_cleanser( \&Cleanser::cleanse, @ARGV ); if ( $for_windows ) { if ( ! %cleansed ) { %cleansed = run_cleanser( \&Cleanser::cleanse_for_windows, @ARGV + ); } else { # Work on the changes of general cleansing. while( my ( $old, $once ) = each %cleansed ) { my $again = Cleanser::cleanse_for_windows( $once ) or next; $cleansed{ $old } = $again; } # Take care of those which were skipped during general cleansing +. my @todo = grep { ! exists $cleansed{ $_ } } @ARGV; my %win_cleansed = run_cleanser( \&Cleanser::cleanse_for_windows +, @todo ); %cleansed = ( %cleansed, %win_cleansed ); } } %cleansed or die qq[No new file names were generated.\n]; # Move file. for my $old ( sort keys %cleansed ) { my $new = $cleansed{ $old }; if ( $noisy || $dry_run ) { printf qq['%s' -> '%s'\n] , $old, $new; } $dry_run and next; if ( -e $new ) { warn qq[Skipped rename of "$old", "$new" already exists.\n]; next; } if ( ! move( $old, $new ) ) { warn qq[Could not move "$old" to "$new": $!\n]; } } exit; sub run_cleanser { my ( $clean_type, @old_path ) = @_; @old_path or return (); my %out; for my $one ( @old_path ) { my $new = $clean_type->( $one ) or next; $out{ $one } = $new; } return %out; } BEGIN { package Cleanser; use File::Basename qw[ fileparse ]; use File::Spec::Functions qw[ canonpath catfile ]; sub path_if_diff { my ( $old, $dir, $cleaned_base ) = @_; $lowercase and $cleaned_base = lc $cleaned_base; my $new = canonpath( catfile( $dir, $cleaned_base ) ); return $old ne $new ? $new : undef; } # Returns a cleaned path if possible; else C<undef>. # # Substitues various characters with "_" as minimally as possible. sub cleanse { my ( $old_path ) = @_; # Yes, I do mean to keep any word & digit in any writing script +(language). #state $alnum = 'a-zA-Z0-9'; state $alnum = '\w\d'; # quotemeta() does not escape "(" which causes warning that it w +ould be # deprecated in 5.30. state $left_brace = '\\{'; state $covered = q/}()[]/; state $meta = $left_brace . quotemeta( qq/${covered}@/ ); state $punc = q/-=,._/; my $no_keep = qq/[^${punc}${alnum}${meta}]+/; $no_keep = qr/$no_keep/u; state $punc_or = join( q/|/, $left_brace, map { split '', quotemeta $_ } ( $covered, +$punc ) ); state $many_seq = qr/[${punc}]{2,}/; state $pre_seq = qr/[${punc}]+_/; state $post_seq = qr/_[${punc}]+/; my ( $base, $dir ) = fileparse( $old_path ); for ( $base ) { s/$no_keep/_/g; # Collapse same. s/($punc_or)\1/$1/g; # Collapse any sequence. s/$pre_seq/_/g; s/$post_seq/_/g; s/$many_seq/_/g; } return path_if_diff( $old_path, $dir, $base ); } # Returns a cleaned path if possible; else C<undef>. # # It tries to keep a file path be a smaller set of characters for f +iles on # Microsoft Windows. # # Nothing is replaced, only a warning is issued for file names that + match ... # # CON, PRN, AUX, NUL, COM0, COM1, COM2, COM3, COM4, COM5, COM6, + COM7, # COM8, COM9, LPT0, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, L +PT8, and # LPT9 # # See https://learn.microsoft.com/en-us/windows/win32/fileio/naming +-a-file that lists # ... # Use any character in the current code page for a name, includin +g Unicode # characters and characters in the extended character set (128–25 +5), except # for the following: # # The following reserved characters: # < (less than) # > (greater than) # : (colon) # " (double quote) # / (forward slash) # \ (backslash) # | (vertical bar or pipe) # ? (question mark) # * (asterisk) # # Integer value zero, sometimes referred to as the ASCII NUL # character. # # Characters whose integer representations are in the range +from 1 # through 31, except for alternate data streams where these +characters # are allowed # ... # Do not use the following reserved names for the name of a file: # # CON, PRN, AUX, NUL, COM0, COM1, COM2, COM3, COM4, COM5, COM6, + COM7, # COM8, COM9, LPT0, LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, L +PT8, and # LPT9. Also avoid these names followed immediately by an exten +sion; for # example, NUL.txt and NUL.tar.gz are both equivalent to NUL. F +or more # information, see Namespaces. # # Do not end a file or directory name with a space or a period. A +lthough # the underlying file system may support such names, the Windows +shell and # user interface does not. However, it is acceptable to specify +a period # as the first character of a name. For example, ".temp". # ... # sub cleanse_for_windows { my ( $old_path ) = @_; state $bad_char = q[<>:"|?*] . '\\' . join( q[], map { chr } 0..31 ) ; my %sub_replace = ( qr/[^\x00-\xff]+/ => q[^], q/(?:[.]|[ ]+)$/ => q[_], qq/[$bad_char]/ => q[-], ); my ( $base, $dir ) = fileparse( $old_path ); $base = prefix_windows_reserved( $base ); for ( $base ) { for my $found ( keys %sub_replace ) { my $repl = $sub_replace{ $found }; s{$found}{$repl}g; } } return path_if_diff( $old_path, $dir, $base ); } # Returns the base name prefixed with "_" if it matches a reserved +word. sub prefix_windows_reserved { my ( $base ) = @_; # Prefix with "_". state $prefix = q[_]; state $reserved = join( q[|], qw[ CON PRN AUX NUL COM0 COM1 COM2 COM3 COM4 COM5 COM6 COM7 +COM8 COM9 LPT0 LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 +LPT8 LPT9 ] ); state $regex = qq/^( (?: $reserved )(?:[.].+)? )\$/; $base =~ s{$regex}{$prefix$1}xi; return $base; } }
In reply to Munging file name, to be safe- & usable enough on Unix-like OSen & FAT32 file system by parv
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |