#!/usr/bin/perl -w
use strict;
use warnings;
#
# This perl script searches for PL, PM, CGI, HTM, HTML,
# CSS, JS, TXT and LOG files in the current directory and
# changes the line-breaks to DOS, Linux or OSX format.
# Tested with TinyPerl 5.8 under Windows 7.
# Written by Zsolt in November 2019. (zsnp@juno.com)
#
# Usage: perl newline.pl [options [path [files]]]
#
# Options : -O = -OSX   = Convert all to OSX format
#           -L = -LINUX = Convert all to Linux format
#           (Converts to DOS format by default, no option needed)
#           -R = Recursive
#           -M = Trim spaces from the end of lines
#           -B = Create BAK files
#           -G = Process large files too, not just under 1MB
#           -A = Always ask before making any changes
#           -T = Update file's last-modified time
#           -K = Skip read-only files
#           -Q = Quiet mode: Don't display anything
#           -Z = Print file list only, No changes made.
#
# Note: Multiple options MUST be written together, and they
# don't need to be preceded by the minus sign. Example:
#   perl newline.pl linuxtrkz
#
# To convert just one file, you must provide options,
# path and the exact file name. Example:
#   perl newline.pl linuxrbmv . recipes.txt
#
# To convert just one type of file, you must provide
# options, path and a file mask. Example:
#   perl newline.pl osxq /mnt/sdb1 *.xml
#
# To convert many different types of files, you must provide
# options, path, and file masks separated by space. Example:
#   perl newline.pl rtk C:\WORK *.lst *.txt *.ini
#
# Running this program with no arguments will automatically
# convert files to DOS format!
#
# This program will not process DOC files even if told to
# do so, because DOC files contain a lot of binary data.
# A file must contain at least 95% plain text,
# otherwise this program will not touch it.
#
##################################################
## SETTINGS

my $BR = "\r\n";          # Insert DOS line break sequence by default
my $MAXSIZE = 1000000;    # Skip very large files (over 1MB)
my $MINTEXT = 95;         # File must be at least 95% plain text
my $RECURSIVE = 0;        # Recursive mode?
my $TRIM = 1;             # Remove whitespace from the end of lines?
                          # (On by default; -M only matters if this is 0.)
my $FORCE = 1;            # Overwrite read-only files? (-K turns this off)
my $CREATE_BAK = 0;       # Create BAK files?
my $DONT_WRITE = 0;       # Don't save any changes?
my $ALWAYS_ASK = 0;       # Always ask before making any changes?
my $TOUCH = 0;            # Update a file's last-modified time?
my $QUIET = 0;            # Don't print anything?

# Process files with the following extensions:
# (Every entry, including the last one, must be followed by a space.)
my $FIND = '.PM .PL .CGI .HTM .HTML .JS .CSS .TXT .LOG ';
my @FILES;                # File masks given on the command line

##################################################
## VARIABLES

# This script uses a lot of global variables
# instead of passing a bunch of parameters around.

my $i;              # file ptr
my $M;              # File mode/attribute
my $T;              # File last-modified date&time
my $OS;             # operating system
my $DIR;            # work directory
my $BRU;            # Line break for unicode files
my $LINES = 0;      # Total number of lines in a file
my $INPUT = '';     # Read buffer
my $OUTPUT = '';    # Write buffer
my $UNICODE = 0;    # Unicode format?
my $text_end;       # Marks the end of text
my $line_start;     # Beginning of line

##################################################
## PROGRAM STARTS HERE

$OS = GetOS();
$DIR = GetCWD();
ProcessArguments();
CheckDIR($DIR);
exit;

##################################################
#
# This function interprets the arguments and
# makes sure that none of them are conflicting.
# The first argument is the option string, the second
# is the work directory, and whatever is left is kept
# in @FILES as a list of file masks.
# Usage: ProcessArguments()
#
sub ProcessArguments
{
  my $OPT = @ARGV ? uc(shift(@ARGV)) : '';
  my $PATH = @ARGV ? shift(@ARGV) : '.';
  $PATH eq '.' or $DIR = $PATH;
  @FILES = @ARGV;

  if (Find($OPT, '?', 'H', 'HELP')) { About(); exit; }

  # Work out the target format. The full words are removed first so
  # that their letters are not mistaken for the one-letter forms
  # -O and -L, which the usage text promises as equivalents.
  (my $LETTERS = $OPT) =~ s/OSX|LINUX//g;
  my $TO_OSX   = (Find($OPT, 'OSX')   || Find($LETTERS, 'O')) ? 1 : 0;
  my $TO_LINUX = (Find($OPT, 'LINUX') || Find($LETTERS, 'L')) ? 1 : 0;
  if ($TO_OSX && $TO_LINUX)
  {
    Abort('Do you want to convert to OSX or Linux? You can\'t do both.');
  }

  $MAXSIZE = 999999999 if (Find($OPT, 'G'));
  $BR = "\r" if ($TO_OSX);
  $BR = "\n" if ($TO_LINUX);
  $CREATE_BAK |= Find($OPT, 'B');
  $ALWAYS_ASK |= Find($OPT, 'A');
  $DONT_WRITE |= Find($OPT, 'Z');
  $RECURSIVE  |= Find($OPT, 'R');
  $TOUCH      |= Find($OPT, 'T');
  $QUIET      |= Find($OPT, 'Q');
  $TRIM       |= Find($OPT, 'M');

  # -K means "skip read-only files", so it has to switch forcing OFF.
  # OR-ing it into $FORCE (which defaults to 1) could never do that.
  $FORCE = 0 if (Find($OPT, 'K'));

  if ($QUIET && $ALWAYS_ASK && !$DONT_WRITE)
  {
    Abort('Do not select quiet mode when asking to be prompted before changes.');
  }

  # Convert line break sequence to unicode format:
  # every byte of $BR is followed by a zero byte.
  my @A = split(//, $BR);
  $BRU = join("\0", @A) . "\0";
}

##################################################
#
# This function reads the contents of a folder and
# calls CatchFile() for each file that was found.
# Subdirectories are entered only in recursive mode,
# and never when their name starts with a dot.
# Usage: CheckDIR(PATH)
#
sub CheckDIR
{
  my $PATH = defined $_[0] ? $_[0] : '';
  length($PATH) or return;

  # Change / to \ on Windows and DOS computers
  if ($OS < 3) { $PATH =~ tr#/#\\#; }

  if ($QUIET == 0 && $DONT_WRITE == 0) { print "Reading directory: $PATH\n\n"; }

  # Make sure that PATH ends with a backslash or forward slash
  if ($PATH !~ m#[/\\]\z#) { $PATH .= ($OS < 3 ? "\\" : '/'); }

  # A lexical handle gives every level of recursion its own handle.
  my $DH;
  if (!opendir($DH, $PATH))
  {
    $QUIET or print "Cannot read directory: $PATH\n\n";
    return;
  }

  # Test with defined() so that an entry named "0" does not end the scan.
  while (defined(my $NAME = readdir($DH)))
  {
    my $FULLNAME = "$PATH$NAME";
    if (-d $FULLNAME)
    {
      # Check into subdirectory if RECURSIVE == 1
      # Skip directory if its name starts with "."
      # (this also keeps us away from "." and "..")
      if ($RECURSIVE && substr($NAME, 0, 1) ne '.') { CheckDIR($FULLNAME); }
      next;
    }
    CatchFile($PATH, $NAME);
  }
  closedir($DH);
}

##################################################
#
# This function is automatically called
# by CheckDIR() every time a file is found.
# Usage: CatchFile(PATH_ONLY, NAME_ONLY)
#
# PATH_ONLY must already end with a path separator, because
# the two arguments are simply joined to form the full name.
#
sub CatchFile
{
  my ($P, $F) = @_;
  length($F) > 2 or return;    # Names of 1-2 characters are ignored

  if (@FILES)
  {
    # Match file pattern(s) given on the command line
    foreach my $W (@FILES)
    {
      if (isMatch($F, $W))
      {
        UpdateFile("$P$F");
        return;
      }
    }
  }
  else
  {
    # Match file extension only
    my $EXT = rindex($F, '.');
    $EXT >= 0 or return;
    # The trailing space keeps ".HTM " from matching inside ".HTML "
    $EXT = uc(substr($F, $EXT)) . ' ';
    index($FIND, $EXT) < 0 or UpdateFile("$P$F");
  }
}

##################################################
#
# This function scans the file contents and replaces
# various line-break sequences with a new sequence.
# Files that are too big, empty, or not plain text
# enough are skipped. When trimming is off and the file
# uses one consistent line-break format, the conversion
# is done with a single substitution; otherwise the file
# is processed one byte at a time.
# Usage: UpdateFile(FILENAME)
#
sub UpdateFile
{
  my $F = shift;

  # -s returns undef for missing or empty files
  if ((-s $F || 0) > $MAXSIZE)
  {
    $QUIET or print "$F - File is bigger than $MAXSIZE bytes ---SKIPPED.\n\n";
    return;
  }

  print "$F";
  if ($QUIET || $DONT_WRITE) { print "\n"; }
  return if ($DONT_WRITE);
  $QUIET or print ' ...';

  $INPUT = '';
  $OUTPUT = '';

  # We do a series of checks first. Return if we
  # can't read the file or if the file size is zero.
  my $SIZE = ReadFile($F);
  if ($SIZE == 0)
  {
    $QUIET or print "Nothing to read. ---SKIPPED.\n\n";
    return;
  }
  $QUIET or print Commify($SIZE), " BYTES READ.\n";

  # AnalyzeFile() packs the text percentage into the low 8 bits
  # and the format flags into the bits above them.
  my $X = AnalyzeFile();
  my $PERCENT_TEXT = $X & 255;
  my $FLAGS = ($X >> 8) & 7;
  $UNICODE = $X & 0x800 ? 1 : 0;
  $QUIET or print ' ';

  # Return if the file contains less than 95% plain text
  # or whatever minimum amount is specified in $MINTEXT.
  if ($UNICODE && $PERCENT_TEXT > 45)
  {
    $QUIET or print 'UNICODE TEXT. ';
  }
  elsif ($PERCENT_TEXT < $MINTEXT)
  {
    $QUIET or print "File is only $PERCENT_TEXT% plain text! Should be at least $MINTEXT%. ---SKIPPED.\n\n";
    return;
  }

  # File contains NO line-breaks at all?
  if ($FLAGS == 4 && !$TRIM)
  {
    $QUIET or print "File is all one line! ---SKIPPED.\n\n";
    return;
  }

  my $SAVE_AND_RETURN = 0;

  # Input file is in Linux format?
  if ($FLAGS == 1 && !$TRIM)
  {
    # Converting from Linux to Linux:
    if ($BR eq "\n")
    {
      $QUIET or print "File is already in Linux format! ---SKIPPED.\n\n";
      return;
    }
    # Converting from Linux to DOS:
    if ($BR eq "\r\n")
    {
      if ($UNICODE) { $INPUT =~ s/\n/\r\0\n/g; }
      else { $INPUT =~ s/\n/\r\n/g; }
    }
    # Converting from Linux to OSX:
    elsif ($BR eq "\r") { $INPUT =~ tr#\n#\r#; }
    $SAVE_AND_RETURN = 1;
  }

  # Input file is in OSX format?
  if ($FLAGS == 2 && !$TRIM)
  {
    # Converting from OSX to OSX:
    if ($BR eq "\r")
    {
      $QUIET or print "File is already in OSX format! ---SKIPPED.\n\n";
      return;
    }
    # Converting from OSX to DOS:
    if ($BR eq "\r\n")
    {
      if ($UNICODE) { $INPUT =~ s/\r/\r\0\n/g; }
      else { $INPUT =~ s/\r/\r\n/g; }
    }
    # Converting from OSX to Linux:
    elsif ($BR eq "\n") { $INPUT =~ tr#\r#\n#; }
    $SAVE_AND_RETURN = 1;
  }

  # Input file might be in DOS format?
  if ($FLAGS == 7 && !$TRIM)
  {
    # If the file contains complete CR-LF pairs,
    # in other words, if it's a pure DOS text file,
    # then eliminating all CR-LF pairs means that
    # there will be no loose LF CR characters left!
    my $TEST = $INPUT;
    $TEST =~ s/\r\n/XX/g;              # Remove normal line breaks
    $TEST =~ s/\r\0\n\0/XXXX/g;        # Remove unicode line breaks
    $TEST = Find($TEST, "\r", "\n");   # Find loose \r or \n chars
    if ($TEST == 0)
    {
      # Converting from DOS to DOS:
      if ($BR eq "\r\n")
      {
        $QUIET or print "File is already in DOS format! ---SKIPPED.\n\n";
        return;
      }
      # In unicode files only the 4-byte line break may be rewritten.
      # Deleting bytes with tr#\n\0##d would remove EVERY zero byte
      # in the file and destroy the two-byte character layout.
      # Converting from DOS to OSX:
      if ($BR eq "\r")
      {
        if ($UNICODE) { $INPUT =~ s/\r\0\n\0/\r\0/g; }
        else { $INPUT =~ tr#\n##d; }
      }
      # Converting from DOS to Linux:
      elsif ($BR eq "\n")
      {
        if ($UNICODE) { $INPUT =~ s/\r\0\n\0/\n\0/g; }
        else { $INPUT =~ tr#\r##d; }
      }
      $SAVE_AND_RETURN = 1;
    }
  }

  if ($SAVE_AND_RETURN == 1)
  {
    # SaveFile() compares $INPUT with $OUTPUT, so $INPUT is emptied here.
    ($OUTPUT, $INPUT) = ($INPUT, '');
    SaveFile($F);
    return;
  }

  # If we get to this point, it means our input file
  # contains mixed content. So, we will read it
  # slowly, processing one byte at a time.

  my $c = 0;       # current character
  my $prev = 0;    # previous character
  $LINES = 0;
  $text_end = -1;
  $line_start = -2;

  my $RESTORE = $BR;
  $BR = $BRU if ($UNICODE);

  for ($i = 0; $i < $SIZE; $i++)
  {
    $c = vec($INPUT, $i, 8);
    next if ($UNICODE && $c == 0);

    if ($c == 10 || $c == 13)    # Detect LF CR
    {
      if (PrintLine()) { }

      # PrintLine() returns 0 if we had nothing to print,
      # so then we check if the previous character was
      # also the same new-line character...
      elsif ($prev == $c)
      {
        # OK. What we have here is two LF-LF characters
        # or two CR-CR characters one after the other,
        # which means we should print a blank line.
        $OUTPUT .= $BR;
        $LINES++;
      }

      # Okay, what if the previous character was a
      # different new-line character?
      elsif ($prev == 10 || $prev == 13)
      {
        # We may have the second byte of a CR/LF pair
        # here, so just skip this byte. And print a
        # New-Line sequence in the next cycle.
        $line_start = -2;
      }
    }
    else    # All kinds of other characters:
    {
      if ($line_start < 0)
      {
        $line_start = $i;
        # Forget the previous line's text end, so a line made of
        # nothing but spaces and tabs trims down to an empty line.
        $text_end = $i - 1;
      }
      ($c == 32 || $c == 9) or $text_end = $i;
    }
    $prev = $c;    # Save previous char
  }
  if ($line_start >= 0) { PrintLine(); }    # Print the last line (if any)
  $BR = $RESTORE;
  SaveFile($F);    # Overwrite original file
}

##################################################
#
# This function sends one line to a global variable
# named $OUTPUT which collects all the data to
# be written to a file. The line to be stored may
# contain text only, or it may be just a blank line.
#
# When $line_start == -2, we need to store a blank
# line. When $line_start == -1, we're in the middle
# of a line-break sequence. This function returns 1
# if something was stored, or 0 otherwise.
#
# Usage: INTEGER = PrintLine()
#
sub PrintLine
{
  if ($line_start == -1) { return 0; }    # Nothing to print yet?

  if ($line_start >= 0)    # Got something to print!
  {
    my $LEN;
    if ($TRIM)
    {
      # $text_end is the last byte that is not a space or tab.
      $LEN = $text_end + 1 - $line_start;
      # In unicode files that byte is followed by a zero byte
      # which belongs to the same character.
      $LEN++ if ($UNICODE && $LEN > 0);
    }
    else
    {
      # $i points at the line-break byte, which is not copied.
      $LEN = $i - $line_start;
    }
    # A negative length would make substr() count from the END of
    # the buffer and copy most of the remaining file into this line.
    $OUTPUT .= substr($INPUT, $line_start, $LEN) if ($LEN > 0);
  }

  $OUTPUT .= $BR;    # Print New-line sequence
  $LINES++;
  $line_start = -1;
  return 1;
}

##################################################
#
# This function writes the contents of $OUTPUT
# to a file, replacing the existing file,
# and prints a little summary.
# Nothing is written when the content did not change,
# in list-only mode, when the user answers no, or when
# a read-only file must not be overwritten.
# Usage: SaveFile(FILENAME)
#
sub SaveFile
{
  my $F = shift;

  if ($INPUT eq $OUTPUT) { $QUIET or print "---NO CHANGES NEEDED.\n\n"; return; }
  if ($DONT_WRITE) { $QUIET or print "---NO CHANGES MADE!\n\n"; return; }

  # Without $FORCE a read-only file is left alone, even if
  # a backup copy would otherwise be moved out of the way.
  if (!$FORCE && -e $F && !-w $F)
  {
    $QUIET or print "\tREAD-ONLY FILE ---SKIPPED.\n\n";
    return;
  }

  if ($ALWAYS_ASK)
  {
    print 'File is about to be overwritten. Proceed (Y/N)? ';
    my $A = <STDIN>;
    defined $A or $A = '';    # End of input counts as "no"
    if (Find(uc($A), 'Y') == 0) { print "---USER SKIPPED FILE.\n\n"; return; }
  }

  if ($CREATE_BAK)
  {
    # Same naming as "ren FILE *.BAK": the last extension is replaced.
    # rename() is used instead of the shell, so it works on every
    # system and odd characters in the file name can't be misread.
    my $BAK = $F;
    $BAK =~ s/(?<=[^\\\/.])\.[^.\\\/]*\z//;
    $BAK .= '.BAK';
    $BAK = "$F.BAK" if (uc($BAK) eq uc($F));
    if (!rename($F, $BAK))
    {
      $QUIET or print "\tCAN'T CREATE BAK FILE! ---SKIPPED.\n\n";
      return;
    }
  }

  if ($FORCE && defined $M) { chmod(($M | 0660), $F); }    # Make file writable

  my $FH;
  if (!open($FH, '>', $F)) { $QUIET or print "\tCAN'T WRITE!\n\n"; return; }
  binmode $FH;
  my $OK = print {$FH} $OUTPUT;
  close($FH) or $OK = 0;    # Buffered write errors show up at close
  if (!$OK) { $QUIET or print "\tWRITE ERROR!\n\n"; return; }

  $QUIET or print "\n\t", Commify($LINES), ' line(s) ', Commify(length($OUTPUT)), " bytes written.\n\n";

  defined $M and chmod($M, $F);    # Restore original mode

  # The date & time the file was last modified will be reset
  # back to original, but the file last-access date
  # will most likely change to the current time.
  ($TOUCH || !defined $T) or utime(time, $T, $F);
}

##################################################
#
# This function reads the entire contents of a
# file into a global variable named $INPUT and
# returns the number of bytes read OR
# returns 0 if something went wrong.
#
# Usage: INTEGER = ReadFile(FILENAME)
#
# Besides $INPUT, this function also resets $LINES and $OUTPUT
# and stores the file's mode in $M and its last-modified time in $T.
#
sub ReadFile
{
  my $F = shift;
  $LINES = 0;
  $INPUT = '';
  $OUTPUT = '';

  my $SIZE = -s $F;
  $SIZE or return 0;

  my $FH;
  sysopen($FH, $F, 0) or return 0;    # 0 = open for reading only
  my @INFO = stat($FH);
  # Keep the permission bits only; the file-type bits are not a mode.
  $M = defined $INFO[2] ? ($INFO[2] & 07777) : undef;    # Get file mode/attribute
  $T = $INFO[9];                                         # Get last-modified date
  binmode $FH;

  # sysread() may hand back less than what was asked for,
  # so keep reading until we have it all or nothing more comes.
  my $GOT = 0;
  while ($GOT < $SIZE)
  {
    my $N = sysread($FH, $INPUT, $SIZE - $GOT, $GOT);
    last unless ($N);    # undef = error, 0 = end of file
    $GOT += $N;
  }
  close $FH;

  $SIZE == length($INPUT) or return 0;
  return $SIZE;
}

##################################################
#
# This function scans the content of $INPUT string
# looking for special characters and determines
# what percentage of the string is plain text, and
# also tries to determine the text format.
#
# Returns an integer whose lower 8 bits is the percentage (0-100).
# Bit 9 will be set if any LF characters were found.
# Bit 10 will be set if any CR characters were found.
# Bit 11 will be set if there are equal number of CR and LF
# characters in the string.
# Bit 12 will be set if there are lots of zeros in the string,
# which is an indication that it may be unicode format.
# These can be interpreted as follows:
#
#   .000 = Format is undetermined.
#   .001 = LINUX string (LF only)
#   .010 = OSX string (CR only)
#   .011 = MIXED format
#   .111 = DOS text (CR-LF pairs)
#
# An empty string has no CR and no LF, so it yields 0x400
# (equal counts, 0% text). This function also sets $LINES.
#
# Usage: INTEGER = AnalyzeFile()
#
sub AnalyzeFile
{
  my $TOTAL = length($INPUT);    # Total length of string

  # Nothing to analyze; also avoids dividing by zero below.
  if ($TOTAL == 0) { $LINES = 0; return 0x400; }

  # We will simply count the number of plain text characters
  # and the number of CR and LF characters in the string.
  # tr/// with an empty replacement list counts without changing.
  my $TX  = ($INPUT =~ tr/\x09\x20-\x7E//);    # Number of plain text characters
  my $CR  = ($INPUT =~ tr/\x0D//);             # Number of 0D characters
  my $LF  = ($INPUT =~ tr/\x0A//);             # Number of 0A characters
  my $NUL = ($INPUT =~ tr/\x00//);             # Number of 00 characters

  $LINES = ($LF > $CR) ? $LF : $CR;    # Number of line breaks

  # Now, we will try to determine what type of string
  # we're dealing with. There are 5 possibilities:
  # LINUX, DOS, OSX, MIXED, or "undetermined."
  #
  # Explanation of formats:
  # * OSX files contain CR characters as line break.
  # * Linux text files contain LF characters as line break.
  # * DOS text files contain an equal number of CR and LF
  #   characters in pairs.
  # * "MIXED" means that the string contains an unequal number of
  #   both CR and LF characters, so this may be a binary data.
  # * "Undetermined" means that the string does not contain
  #   any line break characters at all, so it could be either
  #   a DOS text or Linux text or anything.

  my $FORMAT = $LF ? 0x100 : 0;
  $FORMAT |= 0x200 if ($CR);
  $FORMAT |= 0x400 if ($CR == $LF);

  # Check for possible unicode format: about half of the bytes
  # are zero and the very first byte is 255.
  $NUL = int($NUL / $TOTAL * 100);    # Calculate % of null chars
  $FORMAT |= 0x800 if ($NUL > 40 && $NUL < 60 && vec($INPUT, 0, 8) == 255);

  # The percentage is stored in the lower 8 bits,
  # and the format is stored in bits 9-12.
  return $FORMAT | int(($TX + $LF + $CR) / $TOTAL * 100);
}

##################################################
#
# This function returns true if a filename matches a
# certain wildcard pattern. A question mark stands for
# exactly one character and an asterisk stands for any
# number of characters (including none). Both may be
# used more than once. The matching is
# NOT case sensitive!
# Names or patterns that contain < | > never match.
# Usage: INTEGER = isMatch(FILENAME, WILDCARD)
#
# Example: isMatch("New_Document.txt", "n*.txt") = 1
#
sub isMatch
{
  @_ > 1 or return 0;
  my ($F, $W) = @_;
  (defined $F && length($F)) or return 0;
  (defined $W && length($W)) or return 0;

  $F = uc($F);
  $W = uc($W);

  # If there are invalid characters...
  if (CountChars($W.$F, '<|>')) { return 0; }

  # Several asterisks in a row mean the same as one.
  $W =~ s/\*{2,}/*/g;

  # Translate the wildcard into a regular expression. Every ordinary
  # character is escaped, so a dot in the mask only matches a dot.
  my $RE = join('', map { $_ eq '*' ? '.*' : $_ eq '?' ? '.' : quotemeta($_) } split(//, $W));

  # Anchored at both ends, so the WHOLE name has to fit the mask.
  return ($F =~ /\A$RE\z/s) ? 1 : 0;
}

##################################################
#
# This function compares two strings and returns 1 if
# both strings match until the first asterisk.
# This function can start comparing strings
# from the beginning or starting from the end!
# DIRECTION must be either 1 or -1.
# A question mark in WILDCARD matches any one character.
# The comparison is case sensitive.
# isMatch() no longer needs this helper; it is kept
# in case something else calls it.
# Usage: INTEGER = _isMatch(FILENAME, WILDCARD, DIRECTION)
#
sub _isMatch
{
  my ($F, $W, $STEP) = @_;
  my $LF = length($F);
  my $LW = length($W);

  for (my $N = 0; $N < $LW; $N++)
  {
    # Grab the N-th character counted from the chosen end
    my $w = substr($W, ($STEP < 0 ? $LW - 1 - $N : $N), 1);
    return 1 if ($w eq '*');    # ASTERISK? Everything so far matched.
    return 0 if ($N >= $LF);    # The filename ran out of characters
    next if ($w eq '?');        # "?" accepts any character
    my $f = substr($F, ($STEP < 0 ? $LF - 1 - $N : $N), 1);
    return 0 if ($f ne $w);
  }
  return 1;
}

##################################################
#
# This function counts how many times STRING contains
# any of the characters of SUBSTR.
# Usage: INTEGER = CountChars(STRING, SUBSTR)
#
sub CountChars
{
  @_ > 1 or return 0;
  my ($S, $L) = @_;
  (defined $S && length($S)) or return 0;
  (defined $L && length($L)) or return 0;

  my $N = 0;
  foreach my $CH (split(//, $L))
  {
    # index() returns -1 when there are no more matches,
    # which makes the next start position 0 and ends the loop.
    my $P = 0;
    while (($P = 1 + index($S, $CH, $P)) > 0) { $N++; }
  }
  return $N;
}

##################################################
#
# This function returns the OS type as a number.
# 1=DOS 2=WINDOWS 3=LINUX 4=OSX 9=OTHER
# Usage: INTEGER = GetOS()
#
sub GetOS
{
  my $NAME = uc($^O);
  return index($NAME, 'LINUX')  >= 0 ? 3
       : index($NAME, 'MSWIN')  >= 0 ? 2
       : index($NAME, 'DOS')    >= 0 ? 1
       : index($NAME, 'DARWIN') >= 0 ? 4
       : 9;
}

##################################################
#
# This function returns the NAME, the PATH, or the
# CONTENT of this perl script depending on which
# item is requested in the argument.
#
# Usage: STRING = Self(STRING)
#
# Examples:
#   Self('NAME')
#   Self('PATH')
#   Self('CONTENT')
#
# Any argument other than NAME or PATH returns the content.
# When the script was started without a directory part,
# the PATH is "." (the current directory). An empty
# string is returned if the script cannot be read.
#
sub Self
{
  my $S = defined $_[0] ? uc($_[0]) : '';

  # Find the last path separator in $0. On Windows and DOS
  # a path may be written with either kind of slash.
  my $WIN = (($OS || 0) < 3) ? 1 : 0;
  my $P = rindex($0, ($WIN ? "\\" : '/'));
  if ($WIN)
  {
    my $P2 = rindex($0, '/');
    $P = $P2 if ($P2 > $P);
  }

  if ($S eq 'NAME') { return ($P < 0) ? $0 : substr($0, $P + 1); }
  if ($S eq 'PATH')
  {
    return '.' if ($P < 0);                  # No directory part at all
    return substr($0, 0, 1) if ($P == 0);    # Script sits in the root
    return substr($0, 0, $P);
  }

  my $FH;
  sysopen($FH, $0, 0) or return '';    # 0 = open for reading only
  binmode $FH;
  my $DATA = '';
  sysread($FH, $DATA, (-s $0) || 0);
  close $FH;
  return $DATA;
}

##################################################
#
# This function removes all whitespace from before
# and after text and returns a new string. This
# function removes every character whose ASCII
# value is less than 33. This includes tab, space,
# null, vertical tab, esc, new lines, etc..
# An undefined argument is treated as an empty string.
#
# Usage: STRING = Trim(STRING)
#
sub Trim
{
  my $S = defined $_[0] ? $_[0] : '';
  $S =~ s/\A[\x00-\x20]+//;    # Leading characters 0..32
  $S =~ s/[\x00-\x20]+\z//;    # Trailing characters 0..32
  return $S;
}

##################################################
#
# This function returns the current working directory.
# If the current working directory cannot be determined
# then returns the first argument. If there are no
# arguments passed, then returns the path where this
# script is located.
#
# Usage: STRING = GetCWD( [DEFAULT] )
#
sub GetCWD
{
  my $CWD = '';

  # Ask the Cwd module first, because it does not need a shell.
  # The eval keeps us going on a Perl that comes without it.
  if (eval { require Cwd; 1 })
  {
    my $C = Cwd::getcwd();
    $CWD = Trim($C) if (defined $C);
  }

  # Otherwise ask the shell, the way this script always did.
  length($CWD) or $CWD = Trim(($OS || 0) < 3 ? `cd` : `pwd`);

  return $CWD if (length($CWD));
  return defined $_[0] ? $_[0] : Self('PATH');
}

##################################################
#
# This function inserts commas into a number at
# every 3 digits and returns a string.
# Digits after a decimal point are left alone.
# Copied from www.PerlMonks.org/?node_id=157725
# Usage: STRING = Commify(NUMBER)
#
sub Commify
{
  # The number is reversed so the digits can be grouped from the right.
  my $N = reverse(defined $_[0] ? $_[0] : '');
  $N =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
  return scalar reverse $N;
}

##################################################
#
# This function works just like the index() function
# except it can compare more than one string. It will
# return 0 if there is no match. Returns 1 if the first
# substring was found.
# Returns 2 if the second substring
# was found or 3 if both the first and second were found...
# Each further substring uses the next bit (4, 8, 16...).
# The list of substrings ends at the first undefined one.
# Usage: INTEGER = Find(STRING, SUBSTR1, [SUBSTR2...])
#
sub Find
{
  my $STR = defined $_[0] ? $_[0] : '';
  my $BIT = 1;
  my $FOUND = 0;
  for (my $P = 1; defined $_[$P]; $P++)
  {
    $FOUND |= $BIT if (index($STR, $_[$P]) >= 0);
    $BIT += $BIT;    # Next substring, next bit
  }
  return $FOUND;
}

##################################################
#
# Prints the description of this program.
# The text is taken from the top of this script: it starts
# at the first "# " and ends at the first "###" after it.
# Usage: About()
#
sub About
{
  print "\n $0\n\n";
  ReadFile($0);
  (my $S = index($INPUT, '# ')) >= 0 or return;
  my $E = index($INPUT, '###', $S);

  my $PREV = 0;
  for (my $P = $S; $P < $E; $P++)
  {
    my $c = vec($INPUT, $P, 8);
    if ($c == 10 || $c == 13)
    {
      # One line break per CR, LF or CR-LF pair, so the help text
      # looks right no matter how this script itself was saved.
      print "\n" unless ($c == 10 && $PREV == 13);
    }
    elsif ($c >= 32 && $c != 35)    # Skip control chars and "#"
    {
      print chr($c);
    }
    $PREV = $c;
  }
}

##################################################
#
# Prints an error message and exits
# with a non-zero status.
# Usage: Abort(STRING)
#
sub Abort
{
  my $MSG = defined $_[0] ? $_[0] : '';
  print "\nOops. $MSG\nType perl newline.pl ? for help.\n";
  exit 1;
}

##################################################
# In reply to Re: dos2unix line endings on Windows
# by harangzsolt33
# in thread dos2unix line endings on Windows
# by flamey
#
# | For: | Use:    |
# |------|---------|
# | &    | &amp;   |
# | <    | &lt;    |
# | >    | &gt;    |
# | [    | &#91;   |
# | ]    | &#93;   |