comment on

When UTF-8 ability was introduced in Perl, I wrote a roman numeral module that would play with this ability. Below is my module, which uses the "\N" and "\X{}" string literals, UTF-8 in regular expressions, etc. I found Perl bugs at the time, but the latest Perl handles all this OK.

I found the Roman Numeral codes in Unicode rather interesting, so I played with this. It handles all the fancy letters, multiple forms, and numbers greater than 10000.


=head1 DESCRIPTION

See http://www2.inetdirect.net/~charta/Roman_numerals.html for primary
reference material on Roman numerals.

=over 4

choice of characters

input may be case insensitive, *all* insensitive, mixed forms, etc.  (
+suppliments?)

may use precomposed Unicode number forms for IV etc.

choice of IV or IIII forms of 4's on output

non-strict input takes IC, IIX, etc.

use overbars or apostrophic forms for 10000 and greater powers.
choice of I-with-2 or M-with-1 forms.

=back

=cut


use charnames ':full';
our %ivxlcdm= (i=>1, v=>2, x=>3, l=>4, c=>5, d=>6, m=>7, '('=>8, mid=>
+9, ')'=>10);
my $overbar= "\N{COMBINING OVERLINE}";  #or should it use macron?

package Roman2::parse;
use Carp;
use strict;
use utf8;
use warnings;
use fields qw (suppliments apposmatch_RE translation);
use vars qw (@parse_all %translation);

@parse_all= (\%ivxlcdm,     # all known/allowed forms for parsing.
   [ 'i', 'I', "\N{ROMAN NUMERAL ONE}", "\N{SMALL ROMAN NUMERAL ONE}" 
+],
   [ 'v', 'V', "\N{ROMAN NUMERAL FIVE}", "\N{SMALL ROMAN NUMERAL FIVE}
+" ],
   [ 'x', 'X', "\N{ROMAN NUMERAL TEN}", "\N{SMALL ROMAN NUMERAL TEN}" 
+],
   [ 'l', 'L', "\N{ROMAN NUMERAL FIFTY}", "\N{SMALL ROMAN NUMERAL FIFT
+Y}" ],
   [ 'c', 'C', "\N{ROMAN NUMERAL ONE HUNDRED}", "\N{SMALL ROMAN NUMERA
+L ONE HUNDRED}" ],
   [ 'd', 'D', "\N{ROMAN NUMERAL FIVE HUNDRED}", "\N{SMALL ROMAN NUMER
+AL FIVE HUNDRED}" ],
   [ 'm', 'M', "\N{ROMAN NUMERAL ONE THOUSAND}", "\N{SMALL ROMAN NUMER
+AL ONE THOUSAND}",
        "\N{GREEK CAPITAL LETTER PHI}", "\N{GREEK SMALL LETTER PHI}", 
+"\N{LATIN SMALL LETTER PHI}"],
   [ '(', 'C', 'c' ],
   [ 'i', 'I', "\N{ROMAN NUMERAL ONE}", "\N{SMALL ROMAN NUMERAL ONE}",
+  "\N{GREEK CAPITAL LETTER PHI}", "\N{GREEK SMALL LETTER PHI}", "\N{L
+ATIN SMALL LETTER PHI}"],
   [ ')', "\N{LATIN CAPITAL LETTER OPEN O}", "\N{LATIN SMALL LETTER OP
+EN O}", "\N{ROMAN NUMERAL REVERSED ONE HUNDRED}"]
   );


sub populate_translation (@$)
 {
 my ($list, $val)= @_;
 foreach (@$list) {  $translation{$_} = $val; }
 }

{ my $parse_all= \@parse_all;   #pseudo-hash symtax only works on refe
+rences.
populate_translation ($parse_all->{i}, 1);
populate_translation ($parse_all->{v}, 5);
populate_translation ($parse_all->{x}, 10);
populate_translation ($parse_all->{l}, 50);
populate_translation ($parse_all->{c}, 100);
populate_translation ($parse_all->{d}, 500);
populate_translation ($parse_all->{m}, 1000);
}

sub new
 {
 my $class= shift;
 my $self;
 { no strict 'refs';
   $self = bless [\%{"$class\::FIELDS"}], $class;
   }
 $self->{suppliments}= \@parse_all;   #various forms accepted for inpu
+t
 $self->{translation}= \%translation;  #the values of the various item
+s.
 return $self;
 }

 
sub get_apposmatch_RE()
 {
 my $self= shift;
 # cache the regexp, reuse until suppliments set changes.
 unless (exists $self->{apposmatch_RE}) {
    # the form is (open+)(i|m)(close+).
    my @list= @{$self->{suppliments}{'('}};
    my $open= join ('|', map { "\Q$_\E" }(@list));
    @list= @{$self->{suppliments}{mid}};
    my $mi= join ('|', map { "\Q$_\E" }(@list));
    @list= @{$self->{suppliments}{')'}};
    my $close= join ('|', map { "\Q$_\E" }(@list));
    my $total= "((?:$open)+)($mi)((?:$close)+)";
    $self->{apposmatch_RE}= qr{$total};
    }
 return $self->{apposmatch_RE};
 }

sub _process_appos
 {
 my ($self, $open, $mid, $close)= @_;
 print "found open:$open, mid:$mid, close:$close\n";
 my $count= length($open);
 croak "\"$open$mid$close\" is an unballanced appostrophic form."  if 
+$count != length($close);
 my $midval= $self->{translation}{$mid};
 if ($midval == 1) {
    # I with n parens becomes M with n-1 overbars.
    --$count;
    }
 elsif ($midval == 1000) {
    # Phi with n parens becomes M with n overbars.
    }
 else {
    die "Invalid mid-symbol $mid.  Stopped ";
    # die, not croak, because this is an internal error.  The translat
+ion table doesn't match the suppliments table.
    }
 return 'M' . $overbar x $count;
 }

sub appos2over
 {
 # I with n parens becomes M with n-1 overbars.
 # Phi with n parens becomes M with n overbars.
 my $self= shift;
 # modify value parameter in-place, so caller sees change.
 # >> ...........
 # (open)(i)(close) replaced with function to check $1 and $3 for equa
+l length.
 my $re= $self->get_apposmatch_RE();
 $_[0] =~ s/$re/$self->_process_appos($1,$2,$3)/eg;
 }

sub remove_composite
 {
# =COMMENT      remove due to bug in Perl ?
 my $self= shift;
 $_[0] =~ s/\x{2161}|\x{2171}/II/g;
 $_[0] =~ s/\x{2162}|\x{2172}/III/g;
 $_[0] =~ s/\x{2163}|\x{2173}/IV/g;
 $_[0] =~ s/\x{2165}|\x{2175}/VI/g;
 $_[0] =~ s/\x{2166}|\x{2176}/VII/g;
 $_[0] =~ s/\x{2167}|\x{2177}/VIII/g;
 $_[0] =~ s/\x{2168}|\x{2178}/IX/g;
 $_[0] =~ s/\x{216a}|\x{217a}/XI/g;
 $_[0] =~ s/\x{216b}|\x{217b}/XII/g;
# =cut 
 }

sub _make_list
 {
 my ($self, $value)= @_;
 # This function converts the string of characters into a list of numb
+ers, one number for each character.
 # Each number is the value of the corresponding roman letter.

 ##  while ($value =~ /
 # make sure that an overbar only follows legal characters.
 
 my @result;
 while ($value =~ /(.)/g) {
    if ($1 eq $overbar) { $result[-1] *= 10 }
    else {
       my $val= $self->{translation}{$1};
       croak "Unknown Roman Numeral character \"$1\""  unless $val;
       push @result, $val;
       }
    }
 return @result;
 }
 
sub fromRoman
 {
 my ($self, $value)= @_;
 # replace appos. forms with overbars.  (look for i and phi)
 $self->appos2over ($value);
 # replace composite forms.  Need to do this early so inversions work 
+as visually appear.
 $self->remove_composite ($value);
 # create a list of values
 my @values= $self->_make_list ($value);

 print join (',', @values), "\n";
 # finally, process that list instead of the string.
 #  search for inversions -- smaller preceeding larger.  Make those ne
+gative.
 # add the numbers.
 }



#######################################################

package Roman2;
use utf8;
use warnings;
use strict;
use fields qw (primary use_precomposed);
use vars qw (%known_sets);
use Carp;


%known_sets= (
   ASCII=> [\%ivxlcdm, qw [ I V X L C D M ( I ) ] ],
   ascii=> [\%ivxlcdm, qw [ i v x l c d m ( I ) ] ],
   Phi=> [\%ivxlcdm, qw [ I V X L C D ], "\N{GREEK CAPITAL LETTER PHI}
+", '(', "-\N{GREEK CAPITAL LETTER PHI}", ')' ],
   ROMAN=> [\%ivxlcdm, "\N{ROMAN NUMERAL ONE}", "\N{ROMAN NUMERAL FIVE
+}", "\N{ROMAN NUMERAL TEN}",  "\N{ROMAN NUMERAL FIFTY}", "\N{ROMAN NU
+MERAL ONE HUNDRED}", "\N{ROMAN NUMERAL FIVE HUNDRED}", "\N{ROMAN NUME
+RAL ONE THOUSAND}", '(', "-\N{GREEK CAPITAL LETTER PHI}", ')' ],
   );

sub new 
 {
 my $class= shift;
 my $self;
 { no strict 'refs';
   $self = bless [\%{"$class\::FIELDS"}], $class;
   }
 #start with the default arrangement.
 $self->{use_precomposed}= undef;  #if true, uses U+2161 etc.
 # parameters, if any, override that.
 my $primary_param= shift || 'ASCII';
 $self->set($primary_param);
 return $self;
 }


=head2 set

Sets the letters used for specific number positions.

Usage:

    r->set ('ASCII');  #single string sets a known scheme.
    r->set (M=>'(I)', X=>'*');  #pairs specify specific values
    
=cut

sub set
 {
 my $self= shift;
 my $argcount= scalar @_;
 if ($argcount == 1) {
    # it names a known scheme
    eval {
       my @set= @{ $known_sets{$_[0]} };
       $self->{primary}= \@set;
       }  or  croak "Unknown set '$_[0]'";
    }
 elsif (($argcount&1) != 0) {
    # odd number is error
    croak "Invalid number of arguments: must be 1 or even.";
    }
 else { # pairs 
    while (@_) {
       my $key= lc shift;
       my $value= shift;
       eval { $self->{primary}{$key}= $value; }
          or croak "Invalid argument '$key'";
       }
    }
 }


my @digitmap1= ('' , qw (i ii iii iv v vi vii viii ix));
my @digitmap2= ('' , qw (i ii iii iiii iiiii iiiiii iiiiiii iiiiiiii i
+x));

sub over2apos ($$$$)
 {
 # M with n overbars becomes I with n+1 parens.
 # no way to specify Phi with n parens instead.  Post-process with a s
+imple s/// if you need that.
 my ($input, $L, $R, $I)= @_;
# my $letter= substr ($input, 0, 1);  # first letter -- change prefix 
+to infix
 my $count= length($input);
 --$count  if $I =~ s/^-//;  #one fewer set of parens if marked with a
+ dash.
 return ($L x $count) . $I . ($R x $count);
 }

sub toRoman
 {
 my ($self, $value)= @_;
 die "value must be a non-negative integer"
    if $value < 0 || int($value) != $value;
 my $result= '';  #start with an empty string
 my @digitcodes= @{$self->{primary}};
 shift @digitcodes;  #get rid of pseudohash index.
 for (1..3) {  # units, tens, hundreds all the same logic
    my ($one, $five)= splice (@digitcodes, 0, 2);  #shift off two item
+s
    my $ten= $digitcodes[0];  #look at the third too, but don't remove
+ it.
    my $digit= $value % 10;  #least sig. digit
    $value /= 10;  #removed.
    my $s= $digitmap1[$digit];
    #don't use tr because a code may contain multiple letters.
    $s =~ s/i/$one/g;
    $s =~ s/v/$five/g;
    $s =~ s/x/$ten/g;
    $result= $s . $result;
    }
 if ($value) {  # handle higher-order digits: 1000's and up
    my $one= shift @digitcodes;
    my $ten= "$one$overbar";
    while ($value) {
       my $digit= $value % 10;  #least sig. digit
       $value /= 10;  #removed.
       my $s= $digitmap2[$digit];
       $s =~ s/i/$one/g;
       $s =~ s/x/$ten/g;
       $result= $s . $result;
       # promote everything for the next pass
       $one= $ten;
       $ten .= $overbar;
       }
    }
 # use apostrophic forms, if chars supplied.
 my ($Lapostrophus, $mid, $Rapostrophus)= splice (@digitcodes, 0, 3);
 if (defined $Lapostrophus && defined $Rapostrophus) {
    $result =~ s/$self->{primary}{m}$overbar+/over2apos($&,$Lapostroph
+us,$Rapostrophus,$mid)/eg;
    }
 return $result;   
 }



##########################################################


1;  # file loaded OK
[download]

And here is a test file to call it:

use Roman2;
use warnings;
use strict;

sub test {
   my ($r, $value)= @_;
   my $result= $r->toRoman ($value);
   print "$value produces $result\n";
   }


sub testset1 ($)
 {
   my $r1= shift;
   foreach (1..12) {
      test ($r1, $_);
      }
   test ($r1, 123);
   test ($r1, 2000);
   test ($r1, 1999);
   test ($r1, 4567);
   test ($r1, 9876);
   test ($r1, 1000001);
   test ($r1, 90001);
   }

sub test1
 {
 print "run with ASCII\n";
 my $r1= new Roman2:: ;
 testset1 ($r1);
 print "change X to *\n";
 $r1->set (X=>'*');
 testset1 ($r1);
 print "change to lower-case\n";
 $r1->set ('ascii');
 testset1 ($r1);
 print "change to Phi usage\n";
 $r1->set ("Phi");
 testset1 ($r1);
 print "change to Unicode ROMAN digits\n";
 $r1->set ("ROMAN");
 testset1 ($r1);
 }


sub test2
 {
 my $parser= new Roman2::parse:: ;
 $parser->fromRoman ('((I))(I)Mxvi');
 $parser->fromRoman ('xvii');
 }

 
#main program
test1;
test2;
[download]

In reply to Roman Numerals by John M. Dlugosz

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.