#!/usr/bin/perl
########################################################################
#
# Universal filter of Russian encodings version 2.1
# created by Serge Winitzki (1997-1999). This is free software.
# Home page: http://www.geocities.com/CapeCanaveral/Lab/5735/1/
#
# Features:
# supported encodings: alt, iso, koi, lat, mac, win;
# letters 'YO' and 'yo' correctly supported in all encodings;
# strict 'Russkaja Latinica' conformance for the 'lat' encoding which
# allows almost unambiguous repeated native<->latinized translations of text;
# faster operation on Russian input (caveat: loads the whole file into memory);
# determines the required encodings from invoked script's name (alt2koi etc.)
# or from option string.
#
# Command line options (all options are case-insensitive):
# -alt2koi or -mac2win or whatever select required encodings
#
# Options for lat -> ... conversion:
# -tex do not translate text inside $..$, $$..$$ and \command names
# -wisv translate w as v (default w is tshcha)
# -qisja translate q as ja (default q is tshcha)
# -usekh translate kh as h (default kh='k''h')
#
########################################################################
#
# Installation:
# if needed, edit the first line to reflect your perl location (`which perl`);
# put this script somewhere on the path with executable permission;
# optionally make links to this script named alt2win, win2koi etc.
# (The script can determine the source/target encoding from its *name*.)
# e.g. copy this file to /usr/local/bin/323 and then say
# cd /usr/local/bin; chmod 755 323
# ln -s 323 alt2koi; ln -s 323 koi2alt; and so on (optional)
# for all needed combinations of alt, iso, koi, mac, win, lat.
# After all this, use as a filter. For example, `lat2koi < file1 > file2`
# or else have to specify encoding as `323 -lat2koi < file1 > file2`
#
############################# start of script ##########################
#
# Direct native encodings:
#
$rusmac='\xDD\xDE\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xDF';
$rusalt='\xF0\xF1\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF';
$ruswin='\xA8\xB8\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF';
$ruskoi='\xB3\xA3\xE1\xE2\xF7\xE7\xE4\xE5\xF6\xFA\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE3\xFE\xFB\xFD\xFF\xF9\xF8\xFC\xE0\xF1\xC1\xC2\xD7\xC7\xC4\xC5\xD6\xDA\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC3\xDE\xDB\xDD\xDF\xD9\xD8\xDC\xC0\xD1';
$rusiso='\xA1\xF1\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF';
####################### main part of the script ########################
$from="nothing";
$to="nothing";
$lat_output="no"; #whether latinized output is requested. special flag.
$lat_input="no"; #same for input
$help='Universal converter of Russian encodings version 2.1
Created by Serge Winitzki, 1999. No warranty. This is free software.
http://www.geocities.com/CapeCanaveral/Lab/5735/1/
Supported encodings: alt, iso, koi, lat, mac, win
Example usage:
323 -alt2koi < inputfile > outputfile
Or rename to "xxx2xxx" where xxx is one of the supported encodings and e.g.
alt2koi < inputfile > outputfile
Note: latinized encoding "lat" is implemented according to the "Russkaja
Latinica" scheme. See http://www.geocities.com/Athens/Forum/5344/RL/ for
more details. Sample options for "lat" input:
323 -lat2koi -usekh -wisv -qisja -tex < inputfile > outputfile
See the script preamble for more information.
';
if ("@ARGV" =~ /-([aciklmnostw]{3})2([aciklmnostw]{3})/i) {
$a1=$1;
$a2=$2;
$error="Incorrect encoding '$a1 -> $a2' on command line.";
} else {
#decide the source and target encoding based on our name
$name=`basename $0`;
if ($name =~ /([aciklmnostw]{3})2([aciklmnostw]{3})/i) { #this should match koi2win etc.
$a1="$1";
$a2="$2";
}
$error="Incorrect usage of this script, see $0 for documentation.";
}
if ("@ARGV" =~ /help/i) {
print $help . "\n";
exit;
}
{
if ($a1 =~ /win/i) {
$from="$ruswin";
} elsif ($a1 =~ /koi/i) {
$from="$ruskoi";
} elsif ($a1 =~ /alt/i) {
$from="$rusalt";
} elsif ($a1 =~ /mac/i) {
$from="$rusmac";
} elsif ($a1 =~ /iso/i) {
$from="$rusiso";
} elsif ($a1 =~ /lat/i) {
$from="$ruskoi"; #this is because our latin table is for koi
$lat_input="yes";
}
if ($a2 =~ /win/i) {
$to="$ruswin";
} elsif ($a2 =~ /koi/i) {
$to="$ruskoi";
} elsif ($a2 =~ /alt/i) {
$to="$rusalt";
} elsif ($a2 =~ /mac/i) {
$to="$rusmac";
} elsif ($a2 =~ /iso/i) {
$to="$rusiso";
} elsif ($a2 =~ /lat/i) {
$to="$ruskoi"; #this is because our latin table is for koi
$lat_output="yes";
}
}
if ($to eq "nothing" or $from eq "nothing") { #wrong options
print "$error\n$0 -help for brief usage instructions.\n";
exit 1;
}
undef $/; #make it convert the whole file at once, usually much faster.
while() { #main loop
#effectively we want to do e.g.
# eval ("tr/$ruswin/$rusalt/"); #because tr requires constant strings
if ($lat_input eq "yes") {
&translate_lat_to_koi(); #call special procedure operating on $_
}
#now $_ contains all cyrillic text and we need to transform it
eval ("tr/$from/$to/"); #we need to do this now
#now $_ contains correctly transformed text
if ($lat_output eq "yes") {
&translate_koi_to_lat(); #call special procedure operating on $_
}
print;
}
#################### end of main part of the script ####################
sub translate_koi_to_lat {
#use this procedure to replace each character in $_
#using Russkaja Latinica standard (by Alexy Khabrov and Serge Winitzki, 1995)
#first, break digraphs Y-A, Y-U, Y-O - just in case we get them in the text although they are ungrammatical. Insert the canonical breaking char \\.
s/([\xF9\xD9])([\xE1\xEF\xF5\xC1\xCF\xD5])/$1\\$2/g;
#also break the sh-ch which should rarely happen but still
s/([\xFB\xDB])([\xFE\xDE])/$1\\$2/g;
#second, transform letters that require combinations. Using "x" for "kha", "j'" for "i kratkoe, "shch" for "tshcha", "e'" for "e oborotnoe".
s/\xB3/Yo/g;
s/\xF6/Zh/g;
s/\xEA/J'/g;
s/\xFE/Ch/g;
s/\xFB/Sh/g;
s/\xFD/Shch/g;
s/\xFC/E'/g;
s/\xE0/Yu/g;
s/\xF1/Ya/g;
s/\xA3/yo/g;
s/\xD6/zh/g;
s/\xCA/j'/g;
s/\xDE/ch/g;
s/\xDB/sh/g;
s/\xDD/shch/g;
s/\xDC/e'/g;
s/\xC0/yu/g;
s/\xD1/ya/g;
#then replace other letters
tr/\xE1\xE2\xF7\xE7\xE4\xE5\xFA\xE9\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE3\xFF\xF9\xF8\xC1\xC2\xD7\xC7\xC4\xC5\xDA\xC9\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC3\xDF\xD9\xD8/ABVGDEZIKLMNOPRSTUFXC~Y'abvgdeziklmnoprstufxc~y'/;
}
sub translate_lat_to_koi { #operate on $_ only
%translit=(
"Shch" => "\xFD",
"shch" => "\xDD",
"Yo" => "\xB3",
"yo" => "\xA3",
"Jo" => "\xB3",
"jo" => "\xA3",
"Zh" => "\xF6",
"zh" => "\xD6",
"J'" => "\xEA",
"j'" => "\xCA",
"J`" => "\xEA",
"j`" => "\xCA",
"Ch" => "\xFE",
"ch" => "\xDE",
"Sh" => "\xFB",
"sh" => "\xDB",
"E'" => "\xFC",
"e'" => "\xDC",
"E`" => "\xFC",
"e`" => "\xDC",
"`E" => "\xFC",
"`e" => "\xDC",
"Yu" => "\xE0",
"yu" => "\xC0",
"Ju" => "\xE0",
"ju" => "\xC0",
"Ya" => "\xF1",
"ya" => "\xD1",
"Ja" => "\xF1",
"ja" => "\xD1",
);
%malleable=( # lowercase
'~' => "\xDF",
'`' => "\xD8",
"'" => "\xD8",
'@' => "\xDC",
);
%malleable_uc=( # uppercase
'~' => "\xFF",
'`' => "\xF8",
"'" => "\xF8",
'@' => "\xFC",
);
$i=0; #pointer into the input string ($_)
$EnglishNow=0; #state flag for the digestion machine
#now need to set some options
$want_tex = ("@ARGV" =~ /-tex/i) ? 1 : 0;
$want_wisv = ("@ARGV" =~ /-wisv/i) ? 1 : 0;
$want_qisja = ("@ARGV" =~ /-qisja/i) ? 1 : 0;
$want_kh = ("@ARGV" =~ /-usekh/i) ? 1 : 0;
#need to modify the tables now
if ($want_kh) {
$translit{"Kh"} = "\xE8";
$translit{"kh"} = "\xC8";
}
$output=""; #to hold the output text
while ($i < length($_)) { #loop through the input
# The current char is substr($_,i,1).
# Note that $i will not always advance by 1 and sometimes be changed inside &digest_some()
my $doutput = &digest_some();
$i += length($doutput);
$output .= $doutput;
}
$_ = $output;
}
sub digest_some { # Return next output char(s), using $i as read-only position in $_ and using flags $want_tex and $want_wisv
# our state: $EnglishNow=2 if inside $$ or after '\ ', 1 if inside \command, 0 if in Russian.
# the '$' and \commands are all ignored unless $want_tex
my $thischar = substr($_, $i, 1); #just caching, aren't going to change it
my $nextchar = substr($_, $i+1, 1); #this may be changed
if ($EnglishNow == 2) {
if ($want_tex) {
if ($thischar . $nextchar eq '$$') {
$EnglishNow= 0;
return '$$';
}
if ($thischar eq '$') {
$EnglishNow= 0;
return '$';
}
}
# insert any additional switchers here
if ($thischar . $nextchar eq '\\ ') {
#switching back to Russian
$EnglishNow= 0;
$i += 2; #incrementing $i here since not returning anything
return "";
}
# ok, English is still going on
return $thischar;
} # case of $EnglishNow == 2 is done
if ($EnglishNow == 1 and $want_tex) {
if ($thischar eq ' ' or $thischar eq '\n') { # terminates \command
$EnglishNow= 0;
return $thischar;
}
if ($thischar . $nextchar eq '$$') {
$EnglishNow= 2;
return '$$';
}
if ($thischar eq '$') {
$EnglishNow= 2;
return '$';
}
if ($thischar eq '\\') {
if ($nextchar =~ /[0-9A-z@\\\"\':]/) { # starts another \command right after this one
$EnglishNow= 1;
return $thischar;
}
}
# didn't switch to Russian, continue without translation
return $thischar;
} # case of $EnglishNow == 1 is done
if ($EnglishNow == 0) {
if ($want_tex) {
if ($thischar . $nextchar eq '$$') {
$EnglishNow= 2;
return '$$';
}
if ($thischar eq '$') {
$EnglishNow= 2;
return '$';
}
}
if ($thischar eq '\\') {
if ($want_tex) {
if ($nextchar =~ /[0-9A-z@\\\"\':]/) { # starts \command
$EnglishNow= 1;
return $thischar;
}
}
if ($nextchar eq ' ') { # switch to English now
$EnglishNow = 2;
$i += 2;
return "";
}
if ($nextchar eq '\\') { # double backslash, skipping one
$i += 1;
return "\\";
}
#we get a backslash in Russian mode and not followed by space
#tex mode quirks and double backslashes are already done
#so we should swallow it and go on with the next char
$i += 1;
return "";
} # End of processing backslash char
# all switches have been processed, now do Russian stuff
# first, the 4-letter combination for "tshcha"
if (substr($_, $i, 4) eq 'shch') { #lowercase
$i += 3;
return $translit{'shch'};
}
if (substr($_, $i, 4) =~ /shch/i) { # uppercase: we now know it's not lowercase so any case combination works
$i += 3;
return $translit{'Shch'};
}
#now looking for digraphs
$digraph = $thischar . $nextchar;
$digraph =~ tr/A-Z/a-z/; #now it's all lowercase
if (defined($translit{$digraph})) { # Found a digraph!
if ($nextchar =~ /[A-Z]/ or $thischar =~ /[A-Z]/) { # uppercase
$thischar =~ tr/a-z/A-Z/; # Clobber, clobber
$nextchar =~ tr/A-Z/a-z/;
$digraph = $thischar . $nextchar;
}
$i += 1;
return $translit{$digraph};
}
# now search for malleables
if (defined($malleable{$thischar})) { # Found a malleable.
$prevchar = ($i>0) ? substr($_, $i-1, 1) : "";
if ($thischar eq '`' or $thischar eq "'") {
if (not ($prevchar =~ /[\@A-Za-z]/) and $nextchar =~ /[\@A-Za-z]/) { # ' and ` at beginning of word are not translated
return $thischar;
}
}
if ($prevchar eq '\\') {
return $thischar; # ' and ` prefixed by \ are not translated
}
if ($prevchar eq '^') { # Special cases.
return $malleable_uc{$thischar};
}
if ($prevchar eq '_') {
return $malleable{$thischar};
}
if (($prevchar =~ /[A-Z \n\t]/ or length($prevchar) == 0) and $nextchar =~ /[A-Z \n\t]/) {
return $malleable_uc{$thischar};
}
return $malleable{$thischar};
}
#if we are still here, we have a simple letter
if ($want_qisja) {
$thischar = ($thischar eq 'Q') ? $translit{'Ja'} : (($thischar eq 'q') ? $translit{'ja'} : $thischar);
}
if ($want_wisv) {
$thischar = ($thischar eq 'W') ? 'V' : (($thischar eq 'w') ? 'v' : $thischar);
}
$thischar =~ tr/ABVGDEZIKLMNOPRSTUFXHCYWQJabvgdeziklmnoprstufxhcywqj/\xE1\xE2\xF7\xE7\xE4\xE5\xFA\xE9\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE8\xE3\xF9\xFD\xFD\xEA\xC1\xC2\xD7\xC7\xC4\xC5\xDA\xC9\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC8\xC3\xD9\xDD\xDD\xCA/;
return $thischar;
} # case of EnglishNow == 0 is done
}
__END__