require bytes; if (length($s) > $MAX) { substr($s, $MAX) = ""; } $s =~ s/\X\z// until $MAX > bytes::length($s); #### #!/usr/bin/env perl use 5.12.0; use strict; use autodie; use warnings; use utf8; use open qw<:std :utf8>; use charnames qw< :full >; require bytes; my $MAX_BYTES = 25; my ($MIN_BPC, $MAX_BPC) = (1, 4); my $MAX_CHARS = $MAX_BYTES / $MIN_BPC; sub bytelen(_) { require bytes; return bytes::length($_[0]); } sub graphlen(_) { my $count = 0; $count++ while $_[0] =~ /\X/g; return $count; } sub charlen(_) { return length($_[0]); } sub shorten(_) { my $s = $_[0]; printf "\tstart string has graphlen %d, charlen %d, bytelen %d\n", graphlen($s), charlen($s), bytelen($s); if (charlen($s) > $MAX_CHARS) { printf "\tCHARLEN %d > %d, truncating to %d CHARS\n", length($s), $MAX_BYTES, $MAX_CHARS; substr($s, $MAX_CHARS) = ""; } while (bytelen($s) > $MAX_BYTES) { printf "\tbytelen %d still too long, chopping last grapheme\n", bytes::length($s); $s =~ s/(\X)\z//; printf "\tdeleted grapheme <%s> U+%v04X, charlen -%d, bytelen -%d\n", $1, $1, length($1), bytes::length($1); } printf "\tfinal string has graphlen %d, charlen %d, bytelen %d\n", graphlen($s), charlen($s), bytelen($s); return $s; } my @strings = ( "this lines starts a bit too long", "NFC: cr\N{LATIN SMALL LETTER E WITH GRAVE}me br\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}l\N{LATIN SMALL LETTER E WITH ACUTE}e et cr\N{LATIN SMALL LETTER E WITH GRAVE}me br\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}l\N{LATIN SMALL LETTER E WITH ACUTE}e", "NFC: t\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}te\N{HYPHEN}\N{LATIN SMALL LETTER A WITH GRAVE}\N{HYPHEN}t\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}te t\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}te\N{HYPHEN}\N{LATIN SMALL LETTER A WITH GRAVE}\N{HYPHEN}t\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}te", "NFD: cre\N{COMBINING GRAVE ACCENT}me bru\N{COMBINING CIRCUMFLEX ACCENT}le\N{COMBINING ACUTE ACCENT}e et cre\N{COMBINING GRAVE ACCENT}me bru\N{COMBINING CIRCUMFLEX ACCENT}le\N{COMBINING ACUTE ACCENT}e", "NFD: te\N{COMBINING CIRCUMFLEX ACCENT}te\N{HYPHEN}a\N{COMBINING GRAVE ACCENT}\N{HYPHEN}te\N{COMBINING CIRCUMFLEX ACCENT}te te\N{COMBINING CIRCUMFLEX ACCENT}te\N{HYPHEN}a\N{COMBINING GRAVE ACCENT}\N{HYPHEN}te\N{COMBINING CIRCUMFLEX ACCENT}te", "NFC \N{U+84DD} l\N{LATIN SMALL LETTER A WITH ACUTE}n and \N{U+7EFF} l\N{LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE}", "NFD \N{U+84DD} la\N{COMBINING ACUTE ACCENT}n and \N{U+7EFF} lu\N{COMBINING DIAERESIS}\N{COMBINING GRAVE ACCENT}", "XXX NFC q\N{LATIN SMALL LETTER I WITH MACRON}ng ti\N{LATIN SMALL LETTER A WITH MACRON}n, b\N{LATIN SMALL LETTER A WITH ACUTE}i r\N{LATIN SMALL LETTER I WITH GRAVE}, m\N{LATIN SMALL LETTER A WITH CARON}n d\N{LATIN SMALL LETTER I WITH GRAVE} h\N{LATIN SMALL LETTER O WITH ACUTE}ng", "XXX NFD qi\N{COMBINING MACRON}ng tia\N{COMBINING MACRON}n, ba\N{COMBINING ACUTE ACCENT}i ri\N{COMBINING GRAVE ACCENT}, ma\N{COMBINING CARON}n di\N{COMBINING GRAVE ACCENT} ho\N{COMBINING ACUTE ACCENT}ng", "Chinese: \N{U+9752}\N{U+5929}\N{FULLWIDTH COMMA}\N{U+767D}\N{U+65E5}\N{FULLWIDTH COMMA}\N{U+6EE1}\N{U+5730}\N{U+7EA2}", "normal \N{FULLWIDTH LATIN SMALL LETTER W}\N{FULLWIDTH LATIN SMALL LETTER I}\N{FULLWIDTH LATIN SMALL LETTER D}\N{FULLWIDTH LATIN SMALL LETTER E} normal \N{FULLWIDTH LATIN SMALL LETTER W}\N{FULLWIDTH LATIN SMALL LETTER I}\N{FULLWIDTH LATIN SMALL LETTER D}\N{FULLWIDTH LATIN SMALL LETTER E}", "NFC: h\N{LATIN SMALL LETTER A WITH TILDE}\N{COMBINING CIRCUMFLEX ACCENT}\N{LATIN SMALL LETTER C WITH CEDILLA}\N{COMBINING CARON}k ha\N{COMBINING TILDE}\N{COMBINING CIRCUMFLEX ACCENT}c\N{COMBINING CEDILLA}\N{COMBINING CARON}k h\N{LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE}\N{LATIN SMALL LETTER C WITH CEDILLA}\N{COMBINING CARON}k ha\N{COMBINING CIRCUMFLEX ACCENT}\N{COMBINING TILDE}c\N{COMBINING CEDILLA}\N{COMBINING CARON}k", "NFD: ha\N{COMBINING TILDE}\N{COMBINING CIRCUMFLEX ACCENT}c\N{COMBINING CEDILLA}\N{COMBINING CARON}k ha\N{COMBINING TILDE}\N{COMBINING CIRCUMFLEX ACCENT}c\N{COMBINING CEDILLA}\N{COMBINING CARON}k ha\N{COMBINING CIRCUMFLEX ACCENT}\N{COMBINING TILDE}c\N{COMBINING CEDILLA}\N{COMBINING CARON}k ha\N{COMBINING CIRCUMFLEX ACCENT}\N{COMBINING TILDE}c\N{COMBINING CEDILLA}\N{COMBINING CARON}k", "\N{MATHEMATICAL BOLD CAPITAL C}\N{COMBINING OVERLINE} = sqrt[\N{MATHEMATICAL BOLD CAPITAL A}\N{COMBINING OVERLINE}\N{SUPERSCRIPT TWO} + \N{MATHEMATICAL BOLD CAPITAL B}\N{COMBINING OVERLINE}\N{SUPERSCRIPT TWO}]", "4\N{FRACTION SLASH}3\N{INVISIBLE TIMES}\N{GREEK SMALL LETTER PI}\N{INVISIBLE TIMES}r\N{SUPERSCRIPT THREE} 4\N{FRACTION SLASH}3\N{INVISIBLE TIMES}\N{GREEK SMALL LETTER PI}\N{INVISIBLE TIMES}r\N{SUPERSCRIPT THREE} 4\N{FRACTION SLASH}3\N{INVISIBLE TIMES}\N{GREEK SMALL LETTER PI}\N{INVISIBLE TIMES}r\N{SUPERSCRIPT THREE} 4\N{FRACTION SLASH}3\N{INVISIBLE TIMES}\N{GREEK SMALL LETTER PI}\N{INVISIBLE TIMES}r\N{SUPERSCRIPT THREE}", ); printf "MAX byte length is %d\n\n", $MAX_BYTES; for my $line (@strings) { chomp $line; say "String was <$line>"; my $trunk = shorten($line); say "Trunc'd is <$trunk>\n"; } exit 0;