Here is a Perl program I wrote to do this job. And here is my disclaimer: an earlier version of this program wiped out the text files in my current directory! This version no longer does that, but please still use caution! :) The script still has some bugs: it misinterprets certain options.
#!/usr/bin/perl -w
use strict;
use warnings;
#
# This perl script searches for PL, PM, CGI, HTM, HTML,
# CSS, JS, TXT and LOG files in the current directory and
# changes the line-breaks to DOS, Linux or OSX format.
# Tested with TinyPerl 5.8 under Windows 7.
# Written by Zsolt in November 2019. (zsnp@juno.com)
#
# Usage: perl newline.pl [options [path [files]]]
#
# Options : -O = -OSX = Convert all to OSX format
# -L = -LINUX = Convert all to Linux format
# (Converts to DOS format by default, no option needed)
# -R = Recursive
# -M = Trim spaces from the end of lines (already on by
# default, because the $TRIM setting below is 1)
# -B = Create BAK files
# -G = Process large files too, not just under 1MB
# -A = Always ask before making any changes
# -T = Update file's last-modified time
# -K = Skip read-only files
# -Q = Quiet mode: Don't display anything
# -Z = Print file list only, No changes made.
#
# Note: Multiple options MUST be written together, and they
# don't need to be preceded by the minus sign. Example:
# perl newline.pl linuxtrkz
#
# To convert just one file, you must provide options,
# path and the exact file name. Example:
# perl newline.pl linuxrbmv . recipes.txt
#
# To convert just one type of file, you must provide
# options, path and a file mask. Example:
# perl newline.pl osxq /mnt/sdb1 *.xml
#
# To convert many different types of files, you must provide
# options, path, and file masks separated by space. Example:
# perl newline.pl rtk C:\WORK *.lst *.txt *.ini
#
# Running this program with no arguments will automatically
# convert files to DOS format!
#
# This program will not process DOC files even if told to
# do so, because DOC files contain a lot of binary data.
# A file must contain at least 95% plain text,
# otherwise this program will not touch it.
#
##################################################
## SETTINGS
# --- Output format and limits ---
my $BR      = "\r\n";     # Line break written to converted files (DOS by default)
my $MAXSIZE = 1_000_000;  # Files bigger than this many bytes are skipped
my $MINTEXT = 95;         # Lowest percentage of plain text a file may contain

# --- File selection ---
# Extensions processed when no file mask is given on the command line.
# Each entry is upper case and followed by one space, because the
# lookup searches for ".EXT " inside this string.
my $FIND = '.PM .PL .CGI .HTM .HTML .JS .CSS .TXT .LOG ';
my @FILES;                # File names/masks taken from the command line
my $RECURSIVE = 0;        # Descend into subdirectories?

# --- How files are rewritten ---
my $TRIM       = 1;       # Strip whitespace from the end of lines?
my $FORCE      = 1;       # Make read-only files writable before saving?
my $CREATE_BAK = 0;       # Keep a BAK copy of each changed file?
my $TOUCH      = 0;       # Let the last-modified time change? (0 = restore it)

# --- Interaction ---
my $DONT_WRITE = 0;       # List files only, never save anything?
my $ALWAYS_ASK = 0;       # Ask for confirmation before every overwrite?
my $QUIET      = 0;       # Suppress progress messages?
##################################################
## VARIABLES
# This script uses a lot of global variables
# instead of passing a bunch of parameters around.
# Environment
my $OS;             # Operating system code as returned by GetOS()
my $DIR;            # Directory that will be processed

# State of the file currently being converted
my $INPUT = '';     # Raw bytes read from the file
my $OUTPUT = '';    # Converted bytes waiting to be written
my $LINES = 0;      # Number of lines counted in the file
my $UNICODE = 0;    # Set to 1 when the file looks like unicode text
my $BRU;            # $BR with a zero byte after each character (unicode files)
my $M;              # Mode/attribute of the file
my $T;              # Last-modified date&time of the file

# Cursor positions shared by UpdateFile() and PrintLine()
my $i;              # Offset of the byte being examined
my $line_start;     # Offset where the current line begins (negative = no text yet)
my $text_end;       # Offset of the last byte that is not a space or tab
##################################################
## PROGRAM STARTS HERE
# The order of these calls matters:
# GetOS() must run first because GetCWD() reads $OS to decide
# which shell command reports the current directory.
$OS = GetOS();
# Start from the current working directory. ProcessArguments()
# replaces $DIR when a path other than "." is given.
$DIR = GetCWD();
# Apply the command-line options to the settings above.
ProcessArguments();
# Scan the directory and hand every file found to CatchFile().
CheckDIR($DIR);
exit;
##################################################
#
# This function interprets the command-line arguments,
# updates the global settings and makes sure that none
# of the options are conflicting.
#
# Arguments are read from @ARGV in this order:
#   1. option string (upper-cased before it is examined)
#   2. path (anything other than "." replaces $DIR)
#   3. everything else is stored in @FILES as file names/masks
#
# Exits after printing the help text when the option string
# contains "?" or "H". Calls Abort() on conflicting options.
#
# Usage: ProcessArguments()
#
sub ProcessArguments
{
  my $OPT = @ARGV ? uc(shift(@ARGV)) : '';
  my $PATH = @ARGV ? shift(@ARGV) : '.';
  $PATH eq '.' or $DIR = $PATH;
  @FILES = @ARGV;
  if (Find($OPT, '?', 'H', 'HELP')) { About(); exit; }

  # Work out the target format. The long words are removed first
  # so that the letters inside them are not mistaken for something
  # else (the "O" in "DOS" must not select OSX). Whatever is left
  # is then checked for the short forms O and L. No other option
  # uses either letter, and HELP has already been handled above.
  my $REST = $OPT;
  my $TO_OSX = ($REST =~ s/OSX//g) ? 1 : 0;
  my $TO_LINUX = ($REST =~ s/LINUX//g) ? 1 : 0;
  $REST =~ s/DOS//g; # Asking for the default explicitly is harmless
  $TO_OSX = 1 if (Find($REST, 'O'));
  $TO_LINUX = 1 if (Find($REST, 'L'));
  if ($TO_OSX && $TO_LINUX)
  { Abort('Do you want to convert to OSX or Linux? You can\'t do both.'); }

  $MAXSIZE = 999999999 if (Find($OPT, 'G'));
  $BR = "\r" if ($TO_OSX);
  $BR = "\n" if ($TO_LINUX);

  # These switches can only be turned ON from the command line.
  $CREATE_BAK |= Find($OPT, 'B');
  $ALWAYS_ASK |= Find($OPT, 'A');
  $DONT_WRITE |= Find($OPT, 'Z');
  $RECURSIVE |= Find($OPT, 'R');
  $TOUCH |= Find($OPT, 'T');
  $QUIET |= Find($OPT, 'Q');
  $TRIM |= Find($OPT, 'M');

  # -K means "skip read-only files", so it must turn $FORCE OFF.
  $FORCE = 0 if (Find($OPT, 'K'));

  if ($QUIET && $ALWAYS_ASK && !$DONT_WRITE)
  { Abort('Do not select quiet mode when asking to be prompted before changes.'); }

  # Convert line break sequence to unicode format by putting
  # a zero byte after every character of $BR:
  my @A = split(//, $BR);
  $BRU = join("\0", @A) . "\0";
}
##################################################
#
# This function reads the contents of a folder and
# calls CatchFile() for each file that was found.
# When $RECURSIVE is set, it calls itself for every
# subdirectory whose name does not start with ".".
# Does nothing when PATH is missing or empty.
#
# Usage: CheckDIR(PATH)
#
sub CheckDIR
{
  my $PATH = defined $_[0] ? $_[0] : '';
  length($PATH) or return;

  # Change / to \ on Windows and DOS computers
  if ($OS < 3) { $PATH =~ tr#/#\\#; }

  if ($QUIET == 0 && $DONT_WRITE == 0)
  { print "Reading directory: $PATH\n\n"; }

  # Make sure that PATH ends with a backslash or forward slash
  if (index("/\\", substr($PATH, length($PATH)-1, 1)) < 0)
  { $PATH .= ($OS < 3 ? "\\" : '/'); }

  # A lexical handle is private to this call, which keeps
  # the recursive calls from disturbing each other.
  my $DH;
  if (!opendir($DH, $PATH))
  {
    $QUIET or warn "Cannot read directory $PATH: $!\n";
    return;
  }

  # The loop must test for definedness: an entry named "0"
  # is a valid file name but counts as false.
  while (defined(my $NAME = readdir($DH)))
  {
    my $FULLNAME = "$PATH$NAME";
    if (-d($FULLNAME))
    {
      # Check into subdirectory if RECURSIVE == 1
      # Skip directory if its name starts with "."
      # (this also skips the "." and ".." entries)
      if ($RECURSIVE)
      { CheckDIR($FULLNAME) unless (vec($NAME, 0, 8) == 46); }
      next;
    }
    CatchFile($PATH, $NAME);
  }
  closedir($DH);
}
##################################################
#
# CheckDIR() calls this function once for every file it finds.
# The file is passed on to UpdateFile() when its name matches
# one of the masks in @FILES or, if @FILES is empty, when its
# extension is listed in $FIND. Names shorter than three
# characters are ignored.
#
# Usage: CatchFile(PATH_ONLY, NAME_ONLY)
#
sub CatchFile
{
  my ($FOLDER, $NAME) = @_;
  return if (length($NAME) < 3);

  unless (@FILES)
  {
    # No masks given: decide by extension alone.
    my $DOT = rindex($NAME, '.');
    return if ($DOT < 0);
    # Look for ".EXT " so that .HTM does not match .HTML
    my $SUFFIX = uc(substr($NAME, $DOT)) . ' ';
    UpdateFile("$FOLDER$NAME") if (index($FIND, $SUFFIX) >= 0);
    return;
  }

  # Masks given: the first one that matches wins.
  foreach my $MASK (@FILES)
  {
    next unless (isMatch($NAME, $MASK));
    UpdateFile("$FOLDER$NAME");
    last;
  }
  return;
}
##################################################
#
# This function scans the file contents and replaces
# various line-break sequences with the sequence in $BR.
#
# Steps:
#  1. Skip the file if it is bigger than $MAXSIZE.
#  2. Print the file name. In list-only mode ($DONT_WRITE)
#     nothing else is done.
#  3. Read the file with ReadFile() and inspect it with
#     AnalyzeFile(). Skip it if it is not plain enough text.
#  4. If trimming is off and the file uses one line-break
#     style consistently, convert it with a simple search
#     and replace.
#  5. Otherwise walk through the file byte by byte, sending
#     each line to PrintLine().
#  6. Hand the result to SaveFile().
#
# Note: in unicode files every byte with the value 10 or 13
# is treated as a line break and every zero byte is skipped.
#
# Usage: UpdateFile(FILENAME)
#
sub UpdateFile
{
  my $F = shift;

  my $BYTES = -s $F; # undefined when the file cannot be examined
  if (defined $BYTES && $BYTES > $MAXSIZE)
  { $QUIET or print "$F - File is bigger than $MAXSIZE bytes ---SKIPPED.\n\n"; return; }

  # The file name is printed in every mode, quiet mode included.
  print "$F";
  if ($QUIET || $DONT_WRITE) { print "\n"; }
  return if ($DONT_WRITE);
  $QUIET or print ' ...';
  $INPUT = '';
  $OUTPUT = '';

  # We do a series of checks first. Return if we
  # can't read the file or if the file size is zero.
  my $SIZE = ReadFile($F);
  if ($SIZE == 0)
  { $QUIET or print "Nothing to read. ---SKIPPED.\n\n"; return; }
  $QUIET or print Commify($SIZE), " BYTES READ.\n";

  # Unpack the result of AnalyzeFile():
  #   bits 0-7 = percentage of plain text
  #   bit 8 = LF found, bit 9 = CR found, bit 10 = same number of both
  #   bit 11 = looks like unicode
  my $X = AnalyzeFile();
  my $PERCENT_TEXT = $X & 255;
  my $FLAGS = ($X >> 8) & 7;
  $UNICODE = $X & 0x800 ? 1 : 0;
  $QUIET or print ' ';

  # Return if the file contains less than 95% plain text
  # or whatever minimum amount is specified in $MINTEXT.
  # About half of a unicode file is zero bytes, so a
  # lower limit applies there.
  if ($UNICODE && $PERCENT_TEXT > 45)
  { $QUIET or print 'UNICODE TEXT. '; }
  elsif ($PERCENT_TEXT < $MINTEXT)
  { $QUIET or print "File is only $PERCENT_TEXT% plain text! Should be at least $MINTEXT%. ---SKIPPED.\n\n"; return; }

  # File contains NO line-breaks at all?
  if ($FLAGS == 4 && !$TRIM)
  { $QUIET or print "File is all one line! ---SKIPPED.\n\n"; return; }

  # Input file is in Linux format?
  my $SAVE_AND_RETURN = 0;
  if ($FLAGS == 1 && !$TRIM)
  {
    # Converting from Linux to Linux:
    if ($BR eq "\n") { $QUIET or print "File is already in Linux format! ---SKIPPED.\n\n"; return; }
    # Converting from Linux to DOS:
    if ($BR eq "\r\n") { if ($UNICODE) { $INPUT =~ s/\n/\r\0\n/g; } else { $INPUT =~ s/\n/\r\n/g; } }
    # Converting from Linux to OSX:
    elsif ($BR eq "\r") { $INPUT =~ tr#\n#\r#; }
    $SAVE_AND_RETURN = 1;
  }

  # Input file is in OSX format?
  if ($FLAGS == 2 && !$TRIM)
  {
    # Converting from OSX to OSX:
    if ($BR eq "\r") { $QUIET or print "File is already in OSX format! ---SKIPPED.\n\n"; return; }
    # Converting from OSX to DOS:
    if ($BR eq "\r\n") { if ($UNICODE) { $INPUT =~ s/\r/\r\0\n/g; } else { $INPUT =~ s/\r/\r\n/g; } }
    # Converting from OSX to Linux:
    elsif ($BR eq "\n") { $INPUT =~ tr#\r#\n#; }
    $SAVE_AND_RETURN = 1;
  }

  # Input file might be in DOS format?
  if ($FLAGS == 7 && !$TRIM)
  {
    # If the file contains complete CR-LF pairs,
    # in other words, if it's a pure DOS text file,
    # then eliminating all CR-LF pairs means that
    # there will be no loose LF CR characters left!
    my $TEST = $INPUT;
    $TEST =~ s/\r\n/XX/g; # Remove normal line breaks
    $TEST =~ s/\r\0\n\0/XXXX/g; # Remove unicode line breaks
    $TEST = Find($TEST, "\r", "\n"); # Find loose \r or \n chars
    if ($TEST == 0)
    {
      # Converting from DOS to DOS:
      if ($BR eq "\r\n") { $QUIET or print "File is already in DOS format! ---SKIPPED.\n\n"; return; }
      # Converting from DOS to OSX:
      # In a unicode file only the LF unit of each pair is dropped.
      # Deleting every zero byte would destroy the rest of the text.
      if ($BR eq "\r") { if ($UNICODE) { $INPUT =~ s/\r\0\n\0/\r\0/g; } else { $INPUT =~ tr#\n##d; } }
      # Converting from DOS to Linux:
      elsif ($BR eq "\n") { if ($UNICODE) { $INPUT =~ s/\r\0\n\0/\n\0/g; } else { $INPUT =~ tr#\r##d; } }
      $SAVE_AND_RETURN = 1;
    }
  }

  if ($SAVE_AND_RETURN == 1)
  {
    # Move the converted text to $OUTPUT and empty $INPUT
    ($OUTPUT, $INPUT) = ($INPUT, '');
    SaveFile($F);
    return;
  }

  # If we get to this point, it means our input file
  # contains mixed content (or trimming was requested).
  # So, we will read it slowly, processing one byte at a time.
  my $c = 0; # current character
  my $prev = 0; # previous character
  $LINES = 0;
  $text_end = -1; # No visible character seen yet
  $line_start = -2;
  my $RESTORE = $BR;
  $BR = $BRU if ($UNICODE);
  for ($i = 0; $i < $SIZE; $i++)
  {
    $c = vec($INPUT, $i, 8);
    next if ($UNICODE && $c == 0);
    if ($c == 10 || $c == 13) # Detect LF CR
    {
      if (PrintLine()) { }
      # PrintLine() returns 0 if we had nothing to print,
      # so then we check if the previous character was
      # also the same new-line character...
      elsif ($prev == $c)
      {
        # OK. What we have here is two LF-LF characters
        # or two CR-CR characters one after the other,
        # which means we should print a blank line.
        $OUTPUT .= $BR;
        $LINES++;
      }
      # Okay, what if the previous character was a
      # different new-line character?
      elsif ($prev == 10 || $prev == 13)
      {
        # We may have the second byte of a CR/LF pair
        # here, so just skip this byte. And print a
        # New-Line sequence in the next cycle.
        $line_start = -2;
      }
    }
    else # All kinds of other characters:
    {
      if ($line_start < 0) { $line_start = $i; }
      ($c == 32 || $c == 9) or $text_end = $i;
    }
    $prev = $c; # Save previous char
  }
  if ($line_start >= 0)
  { PrintLine(); } # Print the last line (if any)
  $BR = $RESTORE;
  SaveFile($F); # Overwrite original file
}
##################################################
#
# This function sends one line to a global variable
# named $OUTPUT which collects all the data to
# be written to a file. The line to be stored may
# contain text only, or it may be just a blank line.
#
# The line is taken from $INPUT, starting at $line_start.
# With $TRIM on it ends at $text_end (the last byte that is
# not a space or tab), otherwise it ends just before $i.
# In unicode files one more byte is kept after $text_end,
# because that byte belongs to the last character.
#
# When $line_start == -2, we need to store a blank
# line. When $line_start == -1, we're in the middle
# of a line-break sequence. This function returns 1
# if something was stored, or 0 otherwise.
#
# Usage: INTEGER = PrintLine()
#
sub PrintLine
{
  if ($line_start == -1) # Nothing to print yet?
  {
    return 0;
  }
  if ($line_start >= 0) # Got something to print!
  {
    my $END; # Offset of the first byte that is NOT copied
    if ($TRIM)
    {
      # $text_end is not reset between lines, so on a line that
      # holds nothing but spaces and tabs it still points into
      # an earlier line. Such a line must come out empty. A
      # negative length would make substr() copy almost
      # everything up to the end of the file.
      if ($text_end >= $line_start)
      {
        $END = $text_end + 1;
        $END++ if ($UNICODE);
      }
      else
      {
        $END = $line_start;
      }
    }
    else
    {
      # $i is the offset of the line-break byte (or the end of
      # the input), so the text stops right before it.
      $END = $i;
    }
    if ($END > $line_start)
    {
      $OUTPUT .= substr($INPUT, $line_start, $END - $line_start);
    }
  }
  $OUTPUT .= $BR; # Print New-line sequence
  $LINES++;
  $line_start = -1;
  return 1;
}
##################################################
#
# This function writes the contents of $OUTPUT
# to a file, replacing the existing file,
# and prints a little summary.
#
# Nothing is written when:
#  - $OUTPUT is identical to $INPUT,
#  - $DONT_WRITE is set,
#  - the user does not answer with Y ($ALWAYS_ASK),
#  - the file is read-only and $FORCE is off,
#  - a requested BAK file cannot be created.
#
# With $CREATE_BAK the original is renamed so that its
# extension becomes .BAK before the new file is written.
# Afterwards the original mode is restored and, unless
# $TOUCH is set, so is the last-modified time.
#
# Usage: SaveFile(FILENAME)
#
sub SaveFile
{
  my $F = shift;
  if ($INPUT eq $OUTPUT)
  { $QUIET or print "---NO CHANGES NEEDED.\n\n"; return; }
  if ($DONT_WRITE)
  { $QUIET or print "---NO CHANGES MADE!\n\n"; return; }
  if ($ALWAYS_ASK)
  {
    print 'File is about to be overwritten. Proceed (Y/N)? ';
    my $A = <STDIN>;
    defined $A or $A = ''; # End of input counts as "no"
    if (Find(uc($A), 'Y') == 0)
    { print "---USER SKIPPED FILE.\n\n"; return; }
  }

  # Leave read-only files alone unless we may overwrite them.
  if (!$FORCE && !-w $F)
  { $QUIET or print "\tFile is read-only. ---SKIPPED.\n\n"; return; }

  my $BAK;
  if ($CREATE_BAK)
  {
    # Replace the extension with .BAK (or add it if the name
    # has none). The character before the dot is kept, so a
    # name that starts with a dot is not treated as an extension.
    $BAK = $F;
    $BAK =~ s#([^/\\])\.[^./\\]*$#$1#;
    $BAK .= '.BAK';
    $BAK .= '.BAK' if (uc($BAK) eq uc($F)); # Never rename a file onto itself
    # rename() is used instead of a shell command: it works on
    # every system and is safe with spaces in file names.
    if (!rename($F, $BAK))
    { $QUIET or print "\tCAN'T CREATE $BAK: $! ---SKIPPED.\n\n"; return; }
  }

  if ($FORCE)
  { chmod $M | 0660, $F; } # Make file writable

  my $FH;
  if (!open($FH, '>', $F))
  {
    $QUIET or print "\tCAN'T WRITE!\n\n";
    # Put the original back if it was renamed a moment ago.
    rename($BAK, $F) if (defined $BAK);
    return;
  }
  binmode $FH;
  my $OK = print {$FH} $OUTPUT;
  close($FH) or $OK = 0; # Buffered write errors show up here
  if (!$OK)
  { $QUIET or print "\tWRITE ERROR: $!\n\n"; return; }

  $QUIET or print "\n\t", Commify($LINES), ' line(s) ', Commify(length($OUTPUT)), " bytes written.\n\n";
  chmod $M, $F; # Restore original mode

  # The date & time the file was last modified will be reset
  # back to original, but the file last-access date
  # will most likely change to the current time.
  $TOUCH or utime time, $T, $F;
}
##################################################
#
# Loads a whole file into the global variable $INPUT in
# binary mode. Also resets $LINES and $OUTPUT, and stores
# the file's mode in $M and its last-modified time in $T.
#
# Returns the number of bytes read. Returns 0 if the file
# is empty or missing, cannot be opened, cannot be read,
# or if fewer bytes arrived than the file size promised.
#
# Usage: INTEGER = ReadFile(FILENAME)
#
sub ReadFile
{
  my $NAME = shift;
  ($LINES, $INPUT, $OUTPUT) = (0, '', '');

  my $BYTES = -s $NAME;
  return 0 unless ($BYTES);

  local *FH;
  return 0 unless (sysopen(FH, $NAME, 0)); # 0 = open for reading only

  # Element 2 of stat() is the mode, element 9 the modification time
  ($M, $T) = (stat(FH))[2, 9];

  binmode FH;
  return 0 unless (sysread(FH, $INPUT, $BYTES));
  close FH;

  return ($BYTES == length($INPUT)) ? $BYTES : 0;
}
##################################################
#
# This function scans the content of $INPUT string
# looking for special characters and determines
# what percentage of the string is plain text, and
# also tries to determine the text format.
# It also stores the number of line breaks in $LINES.
#
# Returns an integer:
#   0x0FF = percentage (0-100) of bytes that are printable
#           ASCII (32-126), tab, CR or LF
#   0x100 = set if any LF characters were found
#   0x200 = set if any CR characters were found
#   0x400 = set if there are equal number of CR and LF
#           characters in the string (also when there are none)
#   0x800 = set if 41-59% of the bytes are zero and the first
#           byte is 255, which is an indication that it may
#           be unicode format
#
# The three format bits can be interpreted as follows:
#
# 100 = No line breaks at all, format is undetermined
# 001 = LINUX string (LF only)
# 010 = OSX string (CR only)
# 011 = MIXED format (unequal number of CR and LF,
#       so this may be binary data)
# 111 = Possibly DOS text (same number of CR and LF)
#
# An empty $INPUT gives 0x400 (no line breaks, 0% text).
#
# Usage: INTEGER = AnalyzeFile()
#
sub AnalyzeFile
{
  my $TOTAL = defined $INPUT ? length($INPUT) : 0;

  # Nothing to measure. Return early so that we never divide by zero.
  if ($TOTAL == 0) { $LINES = 0; return 0x400; }

  # tr/// with an empty replacement list only counts the
  # listed bytes, it does not change the string.
  my $TX = ($INPUT =~ tr/\x09\x20-\x7E//); # Number of plain text characters
  my $LF = ($INPUT =~ tr/\x0A//); # Number of 0A characters
  my $CR = ($INPUT =~ tr/\x0D//); # Number of 0D characters
  my $NUL = ($INPUT =~ tr/\x00//); # Number of 00 characters

  $LINES = ($LF > $CR) ? $LF : $CR; # Number of line breaks

  # Build the format bits
  my $FORMAT = 0;
  $FORMAT |= 0x100 if ($LF);
  $FORMAT |= 0x200 if ($CR);
  $FORMAT |= 0x400 if ($CR == $LF);

  # Check for possible unicode format
  my $NUL_PERCENT = int($NUL / $TOTAL * 100); # Calculate % of null chars
  $FORMAT |= 0x800 if ($NUL_PERCENT > 40 && $NUL_PERCENT < 60 && vec($INPUT, 0, 8) == 255);

  # The percentage is stored in the lower 8 bits,
  # and the format is stored in the four bits above them.
  return $FORMAT | int(($TX + $LF + $CR) / $TOTAL * 100);
}
##################################################
#
# This function returns 1 if a filename matches a
# wildcard pattern, or 0 if it does not.
#
#   ?  stands for exactly one character
#   *  stands for any number of characters (including none)
#
# Every other character must match literally, and the whole
# file name must be covered by the pattern. Any number of
# question marks and asterisks may be used. The matching is
# NOT case sensitive!
#
# Returns 0 if an argument is missing or empty, or if
# either string contains one of the characters < | >
#
# Usage: INTEGER = isMatch(FILENAME, WILDCARD)
#
# Example: isMatch("New_Document.txt", "n*.txt") = 1
#
sub isMatch
{
  @_ > 1 or return 0;
  my ($F, $W) = @_;
  (defined $F && length($F)) or return 0;
  (defined $W && length($W)) or return 0;
  $F = uc($F);
  $W = uc($W);

  # If there are invalid characters...
  return 0 if (($W . $F) =~ tr/<|>//);

  # Translate the wildcard pattern into a regular expression,
  # one character at a time. quotemeta() makes sure that
  # characters such as "." or "+" are taken literally.
  my $RE = '';
  foreach my $CH (split(//, $W))
  {
    $RE .= ($CH eq '*') ? '.*'
         : ($CH eq '?') ? '.'
         : quotemeta($CH);
  }

  # \A and \z anchor the pattern to the whole file name, so a
  # name that is too long or too short cannot slip through.
  return ($F =~ /\A$RE\z/s) ? 1 : 0;
}
##################################################
#
# Helper for wildcard matching.
# Compares FILENAME with WILDCARD one byte at a time and
# returns 1 if they agree up to the first asterisk, or 0 at
# the first difference. A "?" in the pattern agrees with
# any byte.
#
# DIRECTION must be either 1 or -1:
#    1 = start at the first byte of both strings, go forward
#   -1 = start at the last byte of both strings, go backward
#
# Note: the scan stops when it arrives at the far end of the
# pattern, so that final byte itself is not compared.
#
# Usage: INTEGER = _isMatch(FILENAME, WILDCARD, DIRECTION)
#
sub _isMatch
{
  my ($NAME, $MASK, $STEP) = @_;
  my $NAME_LAST = length($NAME) - 1;
  my $MASK_LAST = length($MASK) - 1;

  # Pick the starting offsets and the offset where the scan ends
  my ($m, $n, $FINISH) = ($STEP < 0)
    ? ($MASK_LAST, $NAME_LAST, 0)
    : (0, 0, $MASK_LAST);

  until ($m == $FINISH)
  {
    my $mask_byte = vec($MASK, $m, 8);
    my $name_byte = vec($NAME, $n, 8);
    $m += $STEP;
    $n += $STEP;

    return 1 if ($mask_byte == 42);           # "*" - the rest is not our job
    next if ($mask_byte == 63);               # "?" - anything goes
    return 0 if ($name_byte != $mask_byte);   # must be the same byte
  }
  return 1;
}
##################################################
#
# Counts how often the characters listed in SUBSTR occur
# in STRING and returns the total. Each character of SUBSTR
# is counted on its own, so a character listed twice is
# counted twice. Returns 0 if an argument is missing,
# undefined or empty.
#
# Usage: INTEGER = CountChars(STRING, SUBSTR)
#
sub CountChars
{
  return 0 if (@_ < 2);
  my ($TEXT, $SET) = @_;
  return 0 unless (defined $TEXT && length($TEXT));
  return 0 unless (defined $SET && length($SET));

  my $TOTAL = 0;
  foreach my $CH (split(//, $SET))
  {
    # Hop from one occurrence of $CH to the next
    my $AT = index($TEXT, $CH);
    while ($AT >= 0)
    {
      $TOTAL++;
      $AT = index($TEXT, $CH, $AT + 1);
    }
  }
  return $TOTAL;
}
##################################################
#
# Translates Perl's operating system name ($^O) into a number:
# 1=DOS 2=WINDOWS 3=LINUX 4=OSX 9=OTHER
# The names are tried in the order of the list below and
# the first one found inside $^O decides.
#
# Usage: INTEGER = GetOS()
#
sub GetOS
{
  my $NAME = uc($^O);
  my @KNOWN = (
    [ 'LINUX',  3 ],
    [ 'MSWIN',  2 ],
    [ 'DOS',    1 ],
    [ 'DARWIN', 4 ],
  );
  foreach my $ENTRY (@KNOWN)
  {
    my ($WORD, $CODE) = @$ENTRY;
    return $CODE if (index($NAME, $WORD) >= 0);
  }
  return 9;
}
##################################################
#
# This function returns the NAME, the PATH, or the
# CONTENT of this perl script depending on which
# item is requested in the argument. The argument is
# not case sensitive. Anything other than NAME or PATH
# (including no argument) returns the content.
#
# NAME    = $0 without the directory part
# PATH    = directory part of $0 without the trailing
#           separator, or "." if $0 has no directory part
# CONTENT = the bytes of the script file, or an empty
#           string if the file cannot be opened
#
# Usage: STRING = Self(STRING)
#
# Examples:
# Self('NAME')
# Self('PATH')
# Self('CONTENT')
#
sub Self
{
  my $S = defined $_[0] ? uc($_[0]) : '';

  # Find the last path separator in $0. On DOS and Windows
  # both the backslash and the forward slash may be used.
  my $P = rindex($0, '/');
  if ($OS < 3)
  {
    my $B = rindex($0, "\\");
    $P = $B if ($B > $P);
  }

  if ($S eq 'NAME') { return ($P < 0) ? $0 : substr($0, $P+1, length($0)); }
  if ($S eq 'PATH') { return ($P < 0) ? '.' : substr($0, 0, $P); }

  my $CONTENT = '';
  local *FH;
  sysopen(FH, $0, 0) or return ''; # 0 = open for reading only
  binmode FH;
  my $SIZE = -s FH;
  sysread(FH, $CONTENT, $SIZE) if ($SIZE);
  close FH;
  return $CONTENT;
}
##################################################
#
# Returns a copy of STRING without the whitespace at its
# beginning and at its end. Every character whose ASCII
# value is less than 33 counts as whitespace here. This
# includes tab, space, null, vertical tab, esc, new
# lines, etc.. Characters inside the text are kept.
# An undefined or empty argument gives an empty string.
#
# Usage: STRING = Trim(STRING)
#
sub Trim
{
  my $TEXT = defined $_[0] ? $_[0] : '';
  return '' unless (length($TEXT));
  $TEXT =~ s/\A[\x00-\x20]+//;   # cut from the front
  $TEXT =~ s/[\x00-\x20]+\z//;   # cut from the back
  return $TEXT;
}
##################################################
#
# Returns the current working directory. It is obtained by
# running the shell command "cd" on DOS and Windows, or "pwd"
# everywhere else, and trimming the first line of its output.
# If that gives nothing, the first argument is returned.
# If there is no argument either, the path where this
# script is located is returned.
#
# Usage: STRING = GetCWD( [DEFAULT] )
#
sub GetCWD
{
  # List assignment keeps only the first line of the output
  my ($FIRST_LINE) = ($OS < 3) ? `cd` : `pwd`;
  my $HERE = Trim($FIRST_LINE);

  return $HERE if (length($HERE));
  return $_[0] if (defined $_[0]);
  return Self('PATH');
}
##################################################
#
# Returns the given number as a string with a comma
# after every third digit of its integer part.
# Copied from www.PerlMonks.org/?node_id=157725
#
# Usage: STRING = Commify(NUMBER)
#
sub Commify
{
  # Work on the mirror image so that the groups of
  # three can be counted from the right-hand side.
  my $MIRROR = reverse $_[0];
  $MIRROR =~ s/
      (\d\d\d)      # three digits...
      (?=\d)        # ...with at least one more digit after them...
      (?!\d*\.)     # ...and no decimal point further on
    /$1,/gx;
  return scalar reverse $MIRROR;
}
##################################################
#
# Looks for several substrings in STRING at once and tells
# which of them were found. The result is a sum: 1 is added
# if the first substring occurs in STRING, 2 for the second,
# 4 for the third, and so on. So 0 means that nothing was
# found and 3 means that both the first and the second were
# found. The list of substrings ends at the first undefined
# value.
#
# Usage: INTEGER = Find(STRING, SUBSTR1, [SUBSTR2...])
#
sub Find
{
  my $RESULT = 0;
  my $VALUE = 1; # Worth of the substring being looked at

  # $_[0] is used directly so that a big STRING is not copied
  foreach my $N (1 .. $#_)
  {
    last unless (defined $_[$N]);
    $RESULT |= $VALUE if (index($_[0], $_[$N]) >= 0);
    $VALUE *= 2;
  }
  return $RESULT;
}
##################################################
#
# Prints the description of this program.
# The description is taken from the script file itself:
# everything from the first "# " up to the next "###" is
# printed without the "#" characters and without control
# characters. Line breaks are kept, whether the script file
# uses CR, LF or CR-LF. Nothing but the script name is
# printed if either marker is missing.
#
# Usage: About()
#
sub About
{
  my $c;
  print "\n $0\n\n";
  ReadFile($0);
  (my $S = index($INPUT, '# ')) >= 0 or return;
  my $E = index($INPUT, '###', $S);
  for (my $i = $S; $i < $E; $i++)
  {
    $c = vec($INPUT, $i, 8);
    if ($c == 13) # CR always ends a line
    { print "\n"; next; }
    if ($c == 10) # LF ends a line unless it follows a CR
    {
      print "\n" unless ($i > $S && vec($INPUT, $i - 1, 8) == 13);
      next;
    }
    next if ($c < 32 || $c == 35); # Skip control characters and "#"
    print chr($c);
  }
}
##################################################
#
# Prints an error message and ends the program.
# The exit status is 1, so that a calling script
# or shell can tell that something went wrong.
#
# Usage: Abort(STRING)
#
sub Abort
{
  print "\nOops. $_[0]\nType perl newline.pl ? for help.\n";
  exit(1);
}
##################################################
|