#!/usr/bin/env python3
# Preprocessor for serto font for the use with LaTeX.
# Copyright 2001-2020 by Johannes Heinecke
# you can use this and change it as you wish, under the condition
# that the original copyright line is not deleted
# Last changes:
# 2 Nov 2003: major reconstruction, use of three fontfiles
# 12 July 2003: correction on stretch
# 29 September 2007: Possibility to typeset two identical letters without a qusshaya
# 31 March 2013: add SERTOFONTDIR environment variable
# 18 April 2020: finally adapted to python3
# 15 December 2020: minor corrections
import re
import sys
import os
version = "1.3.2"

# Directory containing this script; the *.font tables are looked up next to it.
# os.path.dirname() gives "" when the script is addressed by its bare file
# name, in which case the current directory is used.
mydir = os.path.dirname(__file__) or "."

FONTFILESERTO = "%s/serto.font" % mydir  # specify absolute path
FONTFILECHALD = "%s/assyr.font" % mydir
#FONTFILEESTRA = mydir + "/estrangelo.font"

# Diagnostic: show on stderr which serto font table path was computed.
sys.stderr.write("%s\n" % FONTFILESERTO)
# use the environment variable SERTOFONTDIR to specify the directory of the *.font files
# translating syriac unicode points to serto codings
class Letter:
    """One row of a ``.font`` table: a letter and its contextual glyph codes.

    Every positional form (isolated, initial, medial, final) is given as a
    string of one or more integer character codes joined by ``+``.  A form
    whose first code is ``-1`` is stored as ``None``.
    """

    # Attribute that holds the glyph codes for context numbers 0, 1, 2, 3.
    _FORM_ATTRS = ("isolated", "initial", "medial", "final")

    def __init__(self,
                 coding,    # what coding to use in .ptex-file
                 name,      # name of the letter
                 isolated,  # what character to take in isolated usage
                 initial,   # what character to take in word-initial usage
                 medial,    # what character to take in word-medial usage
                 final,     # what character to take in word-final usage
                 link):     # does it link to the following letter (see below)
        """Parse the textual fields of one font-table row.

        ``link`` is converted with ``int()``.  Values named in the original
        comment: 0: next letter is initial, 1: next letter is medial,
        2: ignore, 8: character is a superscript symbol, 9: character is a
        subscript symbol.
        """
        self.coding = coding
        self.name = name
        self.isolated = self._parse_codes(isolated)
        self.initial = self._parse_codes(initial)
        self.medial = self._parse_codes(medial)
        self.final = self._parse_codes(final)
        self.link = int(link)

    @staticmethod
    def _parse_codes(field):
        """Turn ``"12+34"`` into ``[12, 34]``; a leading ``-1`` yields None."""
        codes = [int(part) for part in field.split('+')]
        if codes[0] == -1:
            return None
        return codes

    def _codes_for(self, ctx):
        """Return ``(known, codes)`` for context number ``ctx`` (0..3)."""
        for number, attr in enumerate(self._FORM_ATTRS):
            if ctx == number:
                return True, getattr(self, attr)
        return False, None

    def getcontext(self, ctx):
        """Return the first character code of the form selected by ``ctx``.

        ``ctx`` is 0 (isolated), 1 (initial), 2 (medial) or 3 (final); any
        other value returns None.  Raises TypeError when the selected form is
        missing (stored as None).
        """
        known, codes = self._codes_for(ctx)
        if not known:
            return None
        return codes[0]

    def getChar(self, ctx):
        """Return the characters of the form selected by ``ctx`` as a list.

        A missing form yields ``['']`` so that joining the result contributes
        nothing.  Any ``ctx`` other than 0..3 returns None.  The result is
        always a real list (never a one-shot iterator), so it can be iterated
        more than once.
        """
        known, codes = self._codes_for(ctx)
        if not known:
            return None
        if not codes:
            return ['']
        return ["%c" % code for code in codes]
class Serto:
    """Convert transcribed (or Unicode) Syriac text into TeX font codes.

    The per-style letter tables are read from ``.font`` files by
    ``readfont``.  ``texify`` is the main entry point for font-coded output,
    ``transliterate`` for Latin transliteration, and the ``inline*`` methods
    are callbacks for ``re.sub``.
    """

    _ASCII_LETTERS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    _DIGITS = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
    # Tokens that are never replaced by the doubling mark "Q" when repeated.
    _NO_DOUBLING = ("~",) + _DIGITS + ("--",)
    # Tokens after which the current letter is not joined to a following one.
    _WORD_BREAKS = ("~", "!", ",", ".", ";", "?")
    # Character codes written as \char<n>{} although they are below 127.
    _TEX_SPECIALS = (34, 35, 36, 37, 38, 95)

    def __init__(self, elatex=0):
        """Load both font tables and build the Unicode mapping tables.

        ``elatex`` true means the output is left in logical order; when it is
        false ``texify`` reverses the generated sequence.
        """
        self.elatex = elatex  # eLaTeX needs \TeXXeTstate=1
        self.tabelle = {}  # style: {"_d": Letter-Object}
        self.transtabelle = {}  # style: "_d": \d{d}
        # Number of malformed font-table lines seen by all readfont() calls.
        self.errct = 0

        # NOTE(review): all seven patterns have empty first and last groups in
        # this copy of the file, so they match at every position.  Presumably
        # they once held opening/closing delimiters that were lost; confirm
        # against the upstream source.
        self.inlineS = re.compile("()(.*?)()")
        self.inlineT = re.compile("()(.*?)()")
        self.inlineST = re.compile("()(.*?)()")
        self.inlineC = re.compile("()(.*?)()")
        self.inlineCT = re.compile("()(.*?)()")
        self.inlineE = re.compile("()(.*?)()")
        self.inlineET = re.compile("()(.*?)()")

        for style, fontfile in (("serto", FONTFILESERTO),
                                ("chaldean", FONTFILECHALD)):
            self.tabelle[style] = {}
            self.transtabelle[style] = {}
            self.readfont(fontfile,
                          self.tabelle[style],
                          self.transtabelle[style])

        # Syriac Unicode code point -> transcription code used in the tables.
        self.UnicodeTable = {
            0x0710: "'",
            0x0712: "b",
            0x0713: "g",
            0x0714: "G",  # gamal garshuni
            0x0715: "d",
            0x0717: "h",
            0x0718: "w",
            0x0719: "z",
            0x071a: ".h",
            0x071b: ".t",
            0x071c: ".T",  # teth garshuni
            0x071d: "y",
            0x071f: "k",
            0x0720: "l",
            0x0721: "m",
            0x0722: "n",
            0x0723: "s",
            0x0724: "s",  # final semkath
            0x0725: "`",
            0x0726: "p",
            0x0728: ".s",
            0x0729: "q",
            0x072a: "r",
            0x072b: "^s",
            0x072c: "t",
            0x0308: "P",  # syame
            0x0730: "a",
            0x0731: "A",
            0x0732: ":a",
            0x0733: "=a",
            0x0734: "=A",
            0x0735: ":=a",
            0x0736: "e",
            0x0737: "E",
            0x0738: ":e",
            0x0739: ":e",
            0x073a: "i",
            0x073b: "I",
            0x073c: ":i",
            0x073d: "u",
            0x073e: "U",
            0x073f: ":u",
            0x0740: ":=a",
            0x0741: "*",
            0x0742: "+",
            # punctuation listed in Unicode is not completely implemented
            # in serto
            0x0700: ".:.",
            0x0701: ".",
            0x0702: ".",
            0x0703: ":",
            0x0704: ":",
            0x0705: ":",
            0x0706: ":",
            0x0707: ":",
            0x0708: ":",
            0x0709: ":",
            0x070D: ".X.",
        }

        # Used for Unicode-encoded fonts (xelatex), which choose the
        # initial/medial/final forms automatically.
        self.code2Unicode = {
            "'": (0x710,),
            "b": (0x712,),
            "g": (0x713,),
            "G": (0x714,),  # gamal garshuni
            "d": (0x715,),
            "D": (0x716,),  # dotless dalath (ambiguous with dotless rish)
            "h": (0x717,),
            "w": (0x718,),
            "z": (0x719,),
            ".h": (0x71a,),
            ".t": (0x71b,),
            ".T": (0x71c,),  # teth garshuni
            "y": (0x71d,),
            "Y": (0x71e,),
            "k": (0x71f,),
            "l": (0x720,),
            "m": (0x721,),
            "n": (0x722,),
            "s": (0x723,),
            "S": (0x724,),  # semkath final
            "`": (0x725,),
            "p": (0x726,),
            ".s": (0x728,),
            "q": (0x729,),
            "r": (0x72a,),
            "R": (0x716,),  # dotless rish (ambiguous with dotless dalath)
            "^s": (0x72b,),
            "t": (0x72c,),
            # greek vowel symbols above
            "a": (0x730,),
            "e": (0x736,),
            "i": (0x73a,),
            "=a": (0x733,),
            "o": (0x718, 0x741),
            "u": (0x73d,),
            # greek vowel symbols below
            "A": (0x731,),
            "E": (0x737,),
            "I": (0x73b,),
            "=A": (0x734,),
            "U": (0x73e,),
            # NOTE(review): same code point as ":=a", while UnicodeTable maps
            # "P" (syame) from 0x0308 -- confirm this is intended.
            "P": (0x735,),
            # eastern vowels
            ":a": (0x732,),
            ":e": (0x739,),
            ":i": (0x73c,),
            ":iy": (0x71d, 0x73c),
            ":=a": (0x735,),
            ":o": (0x741,),
            ":ow": (0x718, 0x741),
            # NOTE(review): ":u"/":uw" use the same mark as ":i" (0x73c),
            # while UnicodeTable maps 0x073f to ":u" -- confirm.
            ":u": (0x73c,),
            ":uw": (0x718, 0x73c),
            # tatweel
            "-": (0x640,),
            # linea occultans (not in the font)
            "=": [],
            # punctuation
            ".": (0x701, ),
            ":": (0x703, ),
            ".X.": (0x70d, ),
            ".:.": (0x700, ),
        }

    def readfont(self, filename, tabelle, transtabelle):
        """Read one ``.font`` file into ``tabelle`` and ``transtabelle``.

        If the environment variable SERTOFONTDIR is set, the file is looked
        up there under the base name of ``filename``; otherwise ``filename``
        is opened as given.

        File format as handled here: lines shorter than two characters and
        whitespace-only lines are skipped; lines starting with ``#`` are
        comments, except that ``#FONT: name`` sets ``self.fontname`` and
        ``#TRANS:`` switches to the transliteration section.  Before
        ``#TRANS:`` every line needs at least seven whitespace-separated
        fields (the ``Letter`` constructor arguments); shorter lines are
        reported on stderr and counted in ``self.errct``.  After ``#TRANS:``
        each line maps its first field to its second (or to itself when there
        is only one field).

        ``self.errct`` accumulates over successive calls, so errors in an
        earlier font file are not lost when a later one is read.
        """
        dirname = os.environ.get("SERTOFONTDIR")
        if dirname:
            path = "%s/%s" % (dirname, os.path.basename(filename))
        else:
            # no environment variable
            path = filename
        with open(path, "r") as fp:
            lines = fp.readlines()

        self.fontname = ""
        self.errct = getattr(self, "errct", 0)
        status = "syriac"
        for z in lines:
            if len(z) < 2:
                continue
            if z[0] == "#":
                if z[:6] == "#FONT:":
                    a = z.split()
                    if len(a) > 1:
                        self.fontname = a[1].strip()
                elif z[:7] == "#TRANS:":
                    status = "transliterate"
                continue
            felder = z.split()
            if not felder:
                # whitespace-only line: nothing to store
                continue
            if status == "syriac":
                if len(felder) < 7:
                    self.errct = self.errct + 1
                    print("ERROR:", z, file=sys.stderr)
                else:
                    tabelle[felder[0]] = Letter(*felder[:7])
            elif len(felder) < 2:
                transtabelle[felder[0]] = felder[0]
            else:
                transtabelle[felder[0]] = felder[1]

    def tokenize(self, str, xlen, style="serto"):
        """Split ``str`` (of length ``xlen``) into ``self.tokens``.

        A backslash followed by ASCII letters becomes one command token and
        each brace becomes its own token.  Otherwise the longest slice of at
        most five characters that is a key of ``self.tabelle[style]`` becomes
        a token; characters that start no key are dropped.  A single vowel
        from "aeiou" at the start or after "~" is preceded by the token "'".
        A token equal to the previous one is replaced by "Q" (original
        comment: insert shadda) unless its ``link`` is 3 or it is "~", a
        digit or "--".
        """
        self.tokens = []
        self.digits = []
        tokens = self.tokens
        ix = 0
        while ix < xlen:
            current = str[ix]
            if current == "\\":
                end = ix + 1
                while end < xlen and str[end] in self._ASCII_LETTERS:
                    end += 1
                tokens.append(str[ix:end])
                ix = end
            elif current in "{}":
                tokens.append(current)
                ix += 1
            else:
                table = self.tabelle[style]
                for ll in range(5, 0, -1):
                    piece = str[ix:ix + ll]
                    if piece not in table:
                        continue
                    # NOTE(review): near the end of the input the slice is
                    # shorter than ll, so a lone final vowel matches with
                    # ll > 1 and gets no "'" -- confirm this is intended.
                    if ll == 1 and piece in "aeiou" \
                            and (not tokens or tokens[-1] == "~"):
                        tokens.extend(["'", piece])
                    elif tokens \
                            and piece == tokens[-1] \
                            and table[piece].link != 3 \
                            and piece not in self._NO_DOUBLING:
                        tokens.append("Q")
                    else:
                        tokens.append(piece)
                    ix += ll
                    break
                else:
                    # no table key starts here: skip the character
                    ix += 1

    def transtokenize(self, str, xlen, style="serto"):
        """Split ``str`` into ``self.tokens`` using the transliteration table.

        Works like ``tokenize`` but looks keys up in
        ``self.transtabelle[style]``, does not treat backslashes or braces
        specially, never inserts "Q", and fuses the "'" with a word-initial
        vowel into a single token such as "'a".
        """
        self.tokens = []
        self.digits = []
        tokens = self.tokens
        ix = 0
        while ix < xlen:
            table = self.transtabelle[style]
            for ll in range(5, 0, -1):
                piece = str[ix:ix + ll]
                if piece not in table:
                    continue
                if ll == 1 and piece in "aeiou" \
                        and (not tokens or tokens[-1] == "~"):
                    tokens.append("'" + piece)
                else:
                    tokens.append(piece)
                ix += ll
                break
            else:
                ix += 1

    def _from_unicode(self, text):
        """Replace characters found in ``UnicodeTable``; spaces become "~"."""
        coded = "".join(self.UnicodeTable.get(ord(c), c) for c in text)
        return coded.replace(" ", "~")

    def transliterate(self, syrisch, style="serto"):
        """Return the transliteration of ``syrisch`` for ``style``.

        Each token is replaced by its ``transtabelle`` entry (or kept when it
        has none); the token "~" becomes a blank.
        """
        syrisch = self._from_unicode(syrisch)
        self.transtokenize(syrisch, len(syrisch), style)
        table = self.transtabelle[style]
        return "".join(" " if tok == "~" else table.get(tok, tok)
                       for tok in self.tokens)

    def syriacise(self, style="serto"):
        """Replace ``self.tokens`` by font characters, honouring context.

        Command and brace tokens are copied unchanged.  Consecutive digit
        tokens are collected and emitted in reversed order; a pending digit
        run is emitted before whatever non-digit token ends it (including a
        command or brace token).  Forms without glyphs contribute empty
        strings, which vanish in the final join.
        """
        out = []
        digits = []
        self.maxlen = len(self.tokens)
        for i, tok in enumerate(self.tokens):
            if tok[0] in "\\{}":
                if digits:
                    out.extend(reversed(digits))
                    digits = []
                out.append(tok)
                continue
            letter = self.tabelle[style][tok]
            # NOTE(review): Letter stores a list or None in .medial, so this
            # comparison looks like it can never be true for Letter objects.
            if letter.medial == -1:
                continue
            form = self.context(i, style=style)
            if tok in self._DIGITS:
                digits.append(chr(letter.getcontext(form)))
                continue
            if digits:
                out.extend(reversed(digits))
                digits = []
            out.extend(letter.getChar(form))
        out.extend(reversed(digits))
        return "".join(out)

    def context(self, ix, style="serto"):
        """returns 0 if letter is isolated
        1 if letter is initial
        2 if letter is medial
        3 if letter is final"""
        joined_before = self.before(ix, style)
        joined_after = self.next(ix, style)
        if joined_before:
            return 2 if joined_after else 3
        return 1 if joined_after else 0

    def next(self, ix, style="serto"):
        """returns 1 if next token is a letter

        Tokens whose ``link`` is 2 or 3 are skipped; a command, a brace, "~"
        or one of "!,.;?" counts as no letter.
        """
        for i in range(ix + 1, self.maxlen):
            tok = self.tokens[i]
            if tok[0] in "\\{}":
                return 0
            if self.tabelle[style][tok].link in [2, 3]:
                continue
            return 1 if tok not in self._WORD_BREAKS else 0
        return 0

    def before(self, ix, style="serto"):
        """returns 1 if preceding token is a letter

        Tokens whose ``link`` is 2 are skipped; a command, a brace, "~" or a
        letter whose ``link`` is 0 gives 0.
        """
        for i in range(ix - 1, -1, -1):
            tok = self.tokens[i]
            if tok[0] in "\\{}":
                return 0
            letter = self.tabelle[style][tok]
            if letter.link == 2:
                continue
            if tok == "~" or letter.link == 0:
                return 0
            return 1
        return 0

    def convert(self, transcript, style="serto"):
        """Interface function: return the font-coded string for ``transcript``.

        Raises NotImplementedError for style "estra", which is handled by
        ``texifyUnicodefont`` instead of the font tables.
        """
        if style == "estra":
            raise NotImplementedError(
                "convert() does not handle style 'estra'; "
                "use texifyUnicodefont()")
        transcript = self._from_unicode(transcript)
        self.tokenize(transcript, len(transcript), style)
        return self.syriacise(style)

    def texify(self, word, style="serto"):
        """Return TeX source for ``word`` in the given ``style``.

        Style "estra" is delegated to ``texifyUnicodefont``.  Otherwise each
        character produced by ``convert`` is emitted as follows: codes below
        16 wrap the previously emitted item in the uppersyriac macro, codes
        16..31 in the lowersyriac macro, printable ASCII is copied, and TeX
        special characters and codes from 127 up are written with the char
        primitive.  Without ``elatex`` the resulting sequence is reversed.
        """
        if style == "estra":
            return self.texifyUnicodefont(word)
        res = []
        for ll in self.convert(word, style):
            # code: position of the current syriac character in the font table
            code = ord(ll)
            if code < 32:
                macro = "uppersyriac" if code < 16 else "lowersyriac"
                if res:
                    res[-1] = "\\%s{%d}{%s}" % (macro, code, res[-1])
                else:
                    # nothing emitted yet: put the mark on "A" (A: Olaf)
                    res.append("\\%s{%d}{A}" % (macro, code))
            elif code < 127 and code not in self._TEX_SPECIALS:
                res.append(ll)
            else:
                # special (active) TeX characters, characters > 127
                res.append("\\char%d{}" % code)
        if not self.elatex:
            res.reverse()
        return "".join(res)

    def inlineserto(self, matchobject):
        """``re.sub`` callback: group 2 typeset in the serto font."""
        return "{\\serto\\beginR %s\\endR}" % self.texify(matchobject.group(2))

    def inlinechaldean(self, matchobject):
        """``re.sub`` callback: group 2 typeset in the assyr font."""
        return "{\\assyr\\beginR %s\\endR}" \
            % self.texify(matchobject.group(2), "chaldean")

    def texifyUnicodefont(self, line):
        """Return ``line`` with transcription codes written as ^^^hex codes.

        At every position the longest key of ``self.code2Unicode`` (three,
        two, then one character) is replaced by its code points; characters
        that start no key are copied, except newlines, which are dropped.
        Every character of ``line`` is processed, including the last one.
        """
        table = self.code2Unicode
        parts = []
        i = 0
        total = len(line)
        while i < total:
            for width in (3, 2, 1):
                key = line[i:i + width]
                if key in table:
                    parts.extend("^^^%x" % x for x in table[key])
                    i += len(key)
                    break
            else:
                if line[i] != "\n":
                    parts.append(line[i])
                i += 1
        return "".join(parts)

    def inlineestra(self, matchobject):
        """``re.sub`` callback: group 2 for the Unicode-encoded estra font."""
        return "{\\beginR\\estra %s\\endR}" \
            % self.texifyUnicodefont(matchobject.group(2))

    def inlinetrans(self, matchobject):
        """``re.sub`` callback: group 2 as emphasised transliteration."""
        return "\\emph{%s}" % self.transliterate(matchobject.group(2))

    def inlinesertotrans(self, matchobject):
        """``re.sub`` callback: serto text followed by its transliteration."""
        text = matchobject.group(2)
        return "{\\serto\\beginR %s\\endR} \\emph{%s}" \
            % (self.texify(text), self.transliterate(text))

    def inlinechaldeantrans(self, matchobject):
        """``re.sub`` callback: assyr text followed by its transliteration."""
        text = matchobject.group(2)
        return "{\\assyr\\beginR %s\\endR} \\emph{%s}" \
            % (self.texify(text, "chaldean"),
               self.transliterate(text, "chaldean"))

    def inlineestratrans(self, matchobject):
        """``re.sub`` callback: estra text followed by its transliteration."""
        return "%s \\emph{%s}" \
            % (self.inlineestra(matchobject),
               self.transliterate(matchobject.group(2)))

    def err(self, s):
        """Write ``s`` and a newline to stderr."""
        sys.stderr.write(s + "\n")
#-------------------------------------------------------
if __name__ == "__main__":
    # Command-line driver: read the input file line by line, switch between
    # plain ("latin") and special modes on marker lines, and write the
    # converted LaTeX source to stdout.  The exit status is the number of
    # malformed font-table lines (serto.errct).
    import getopt

    sys.stderr.write("serto - LaTeX - preprocessor V %s\n(c) Johannes Heinecke\n" % version)
    if len(sys.argv) < 2:
        sys.stderr.write("usage:\n serto.py [-o] inputfile\n")
        sys.stderr.write(" -o: for usage with an older version of LaTeX which cannot typeset right-to-left scripts elatex\n\n")
        sys.exit(1)
    else:
        sys.stderr.write("\n")

    elatex = 1
    # "o" declares the -o switch from the usage text; with an empty option
    # string getopt rejects it with GetoptError.
    optlist, comargs = getopt.getopt(sys.argv[1:], "o")
    for (o, a) in optlist:
        if o == "-o":
            elatex = 0
    if not comargs:
        # only options were given, no input file
        sys.stderr.write("usage:\n serto.py [-o] inputfile\n")
        sys.exit(1)

    serto = Serto(elatex=elatex)
    # Stack of active modes; the innermost mode is mode[-1].
    mode = ["latin"]
    with open(comargs[0]) as fp:
        for z in fp:
            body = z[:-1]  # the line without its final character (newline)
            # NOTE(review): every marker compared against below is the empty
            # string in this copy of the file, so only the first two branches
            # can ever be taken.  Presumably each branch once tested for a
            # distinct opening/closing marker line; confirm against the
            # upstream source before relying on this chain.
            if body == "":
                # must be on a single line (will be deleted)
                if not elatex:
                    sys.stderr.write("using without the -e option (and elatex) may not work!\n")
                mode.append("serto")
                print('{\\serto\\beginR %')
            elif body.strip() == "":
                del mode[-1]
                #print '\\endR}%' # causes problems in last line
                print('}%')
            elif body == "":
                # must be on a single line (will be deleted)
                mode.append("estra")
                print('{\\beginR\\estra %')
            elif body.strip() == "":
                del mode[-1]
                print('}%')
            elif body == "":
                # must be on a single line (will be deleted)
                if not elatex:
                    sys.stderr.write("using without the -e option (and elatex) may not work!\n")
                mode.append("chaldean")
                print('{\\assyr\\beginR %')
            elif body.strip() == "":
                del mode[-1]
                #print '\\endR}%' # causes problems in last line
                print('}%')
            elif body == "":
                mode.append("trans")
                print('{\\it %')
            elif body.strip() == "":
                del mode[-1]
                print('}%')
            elif mode[-1] == "latin":
                # Plain text: apply the inline substitutions one after the
                # other, in the same order as before.
                converted = z
                for pattern, callback in (
                        (serto.inlineS, serto.inlineserto),
                        (serto.inlineT, serto.inlinetrans),
                        (serto.inlineST, serto.inlinesertotrans),
                        (serto.inlineC, serto.inlinechaldean),
                        (serto.inlineCT, serto.inlinechaldeantrans),
                        (serto.inlineE, serto.inlineestra),
                        (serto.inlineET, serto.inlineestratrans)):
                    converted = pattern.sub(callback, converted)
                sys.stdout.write(converted)
            elif mode[-1] == "trans":
                print(serto.transliterate(z))
            elif body == "":
                # empty line inside a font mode: start a new paragraph
                print("\n\\beginR ", end="")
            else:
                print(serto.texify(z, mode[-1]))
    sys.exit(serto.errct)