Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-40328: Add tool for generating cjk mapping headers #19602

Merged
merged 14 commits into from
Apr 29, 2020
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add tools for generating mapping headers for CJKCodecs.
4 changes: 1 addition & 3 deletions Modules/cjkcodecs/README
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
To generate or modify mapping headers
-------------------------------------
Mapping headers are imported from CJKCodecs as pre-generated form.
If you need to tweak or add something on it, please look at tools/
subdirectory of CJKCodecs' distribution.
Mapping headers are generated from Tools/unicode/genmap_*.py



Expand Down
1 change: 1 addition & 0 deletions Modules/cjkcodecs/mappings_cn.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// AUTO-GENERATED FILE FROM genmap_schinese.py: DO NOT EDIT
static const ucs2_t __gb2312_decmap[7482] = {
12288,12289,12290,12539,713,711,168,12291,12293,8213,65374,8214,8230,8216,
8217,8220,8221,12308,12309,12296,12297,12298,12299,12300,12301,12302,12303,
Expand Down
1 change: 1 addition & 0 deletions Modules/cjkcodecs/mappings_jisx0213_pair.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// AUTO-GENERATED FILE FROM genmap_japanese.py: DO NOT EDIT
#define JISX0213_ENCPAIRS 46
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
Expand Down
1 change: 1 addition & 0 deletions Modules/cjkcodecs/mappings_jp.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// AUTO-GENERATED FILE FROM genmap_japanese.py: DO NOT EDIT
static const ucs2_t __jisx0208_decmap[6956] = {
12288,12289,12290,65292,65294,12539,65306,65307,65311,65281,12443,12444,180,
65344,168,65342,65507,65343,12541,12542,12445,12446,12291,20189,12293,12294,
Expand Down
2 changes: 2 additions & 0 deletions Modules/cjkcodecs/mappings_kr.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// AUTO-GENERATED FILE FROM genmap_korean.py: DO NOT EDIT
static const ucs2_t __ksx1001_decmap[8264] = {
12288,12289,12290,183,8229,8230,168,12291,173,8213,8741,65340,8764,8216,8217,
8220,8221,12308,12309,12296,12297,12298,12299,12300,12301,12302,12303,12304,
Expand Down Expand Up @@ -3249,3 +3250,4 @@ __cp949_encmap+31959,0,255},{__cp949_encmap+32215,0,255},{__cp949_encmap+32471
__cp949_encmap+32891,0,11},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{__cp949_encmap+
32903,1,230},
};

251 changes: 251 additions & 0 deletions Tools/unicode/genmap_japanese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
#
# genmap_japanese.py: Japanese Codecs Map Generator
#
# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
# Modified Author: Dong-hee Na <donghee.na92@gmail.com>
#
import os

from genmap_support import *

# Two-element byte ranges handed to DecodeMapWriter.update_decode_map() as its
# (c1, c2) arguments: *_C1 is the range for the first byte of a code, *_C2 the
# range for the second byte.
# NOTE(review): presumably inclusive (first, last) bounds -- confirm against
# genmap_support.DecodeMapWriter.update_decode_map.
JISX0208_C1 = (0x21, 0x74)
JISX0208_C2 = (0x21, 0x7e)
JISX0212_C1 = (0x22, 0x6d)
JISX0212_C2 = (0x21, 0x7e)
JISX0213_C1 = (0x21, 0x7e)
JISX0213_C2 = (0x21, 0x7e)
CP932P0_C1 = (0x81, 0x81) # patches between shift-jis and cp932
CP932P0_C2 = (0x5f, 0xca)
CP932P1_C1 = (0x87, 0x87) # CP932 P1
CP932P1_C2 = (0x40, 0x9c)
CP932P2_C1 = (0xed, 0xfc) # CP932 P2
CP932P2_C2 = (0x40, 0xfc)

# Source URLs for the mapping tables; each one is passed to open_mapping_file()
# together with a local 'python-mappings/...' path.
# NOTE(review): presumably the URL is only fetched when the local copy is
# missing -- confirm against genmap_support.open_mapping_file.
MAPPINGS_JIS0208 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
MAPPINGS_JIS0212 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
MAPPINGS_CP932 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
MAPPINGS_JISX0213_2004 = 'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'


def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into nested decode maps.

    *fo* is any iterable of text lines.  Everything after the first ``#`` on
    a line is ignored, and lines with fewer than two whitespace-separated
    fields are skipped.  In the remaining lines:

    * field 0 starts with the level digit (``3`` or ``4``), followed by one
      separator character and the hexadecimal two-byte location;
    * field 1 is either a single code point (``U+XXXX`` / ``U+2XXXX``) or a
      pair ``U+XXXX+YYYY`` (body + modifier, level 3 only).

    Returns the tuple ``(decmap3, decmap4, decmap3_2, decmap4_2,
    decmap3_pair)``.  The first four are ``{c1: {c2: value}}`` dicts, where
    ``c1``/``c2`` are the high/low byte of the location; for the ``*_2`` maps
    the stored value is the code point minus 0x20000.  ``decmap3_pair`` is
    ``{body: {c1: {c2: modifier}}}``.

    Raises ValueError("invalid map") when an entry has a level other than
    3 or 4, a single code point outside the BMP and U+2xxxx ranges, or a pair
    on a level other than 3.  Malformed numeric fields also raise ValueError
    (from ``int()``).
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3

    bmp_maps = {3: decmap3, 4: decmap4}
    plane2_maps = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # Parse with int() rather than eval() so that the contents of the
        # mapping file are never executed as Python code.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])

        m = None
        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            if uni < 0x10000:
                m = bmp_maps.get(level)
            elif 0x20000 <= uni < 0x30000:
                m = plane2_maps.get(level)
                uni -= 0x20000
        else:  # pair
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)  # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        # Validate before touching the map, so that an unmapped level/plane
        # is reported as ValueError instead of failing on a None attribute.
        if m is None:
            raise ValueError("invalid map")
        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair


def main():
    """Generate mappings_jp.h and mappings_jisx0213_pair.h.

    Opens the JIS X 0208, JIS X 0212, CP932 and JIS X 0213 mapping tables,
    builds the decode and encode maps from them, and writes both C headers
    into the current working directory.

    Raises SystemExit when the JIS X 0213 table fails the sanity check on
    entry 0x2124, and ValueError when JIS X 0213 plane 1 disagrees with
    JIS X 0208 at a shared location.
    """
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    # JIS0208.TXT is read twice with different native columns: column 0 for
    # the Shift-JIS view and column 1 for the JIS X 0208 view.
    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    if jis3decmap[0x21][0x24] != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})
            sjisencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {})
            # Keep the first native code seen for each Unicode value.
            if (code & 0xff) not in cp932encmap[code >> 8]:
                cp932encmap[code >> 8][code & 0xff] = c1 << 8 | c2
    # Drop every CP932 encode entry that the Shift-JIS encode map already
    # maps to the same native code, so only the differences remain.
    # Copies are iterated because entries are deleted during the walk.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            jisx0208_0212encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            # A code point present in both tables is reported and then
            # overwritten by the JIS X 0212 value.
            if (code & 0xff) in jisx0208_0212encmap[code >> 8]:
                print("OOPS!!!", (code))
            jisx0208_0212encmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Copies are iterated because entries shared with JIS X 0208 are removed
    # from jis3decmap during the walk.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if code in jis3_pairdecmap:
                    # The row may not exist yet; create it first so this
                    # assignment cannot raise KeyError.
                    jisx0213bmpencmap.setdefault(code >> 8, {})
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
                elif jisx0208decmap[c1][c2] == code:
                    del jis3decmap[c1][c2]
                    if not jis3decmap[c1]:
                        del jis3decmap[c1]
                else:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
            else:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                if code not in jis3_pairdecmap:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2
                else:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))

    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})
            jisx0213bmpencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Entries are emitted sorted by (body, modifier, jis).
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")


if __name__ == '__main__':
    main()
62 changes: 62 additions & 0 deletions Tools/unicode/genmap_korean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#
# genmap_korean.py: Korean Codecs Map Generator
#
# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
# Modified Author: Dong-hee Na <donghee.na92@gmail.com>
#
import os

from genmap_support import *


# Two-element byte ranges handed to DecodeMapWriter.update_decode_map() as its
# (c1, c2) arguments: *_C1 is the range for the first byte of a code, *_C2 the
# range for the second byte.
# NOTE(review): presumably inclusive (first, last) bounds -- confirm against
# genmap_support.DecodeMapWriter.update_decode_map.
KSX1001_C1 = (0x21, 0x7e)
KSX1001_C2 = (0x21, 0x7e)
UHCL1_C1 = (0x81, 0xa0)
UHCL1_C2 = (0x41, 0xfe)
UHCL2_C1 = (0xa1, 0xfe)
UHCL2_C2 = (0x41, 0xa0)
# Source URL for the CP949 table; passed to open_mapping_file() together with
# the local 'python-mappings/CP949.TXT' path.
MAPPINGS_CP949 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT'


def main():
    """Generate mappings_kr.h from the CP949 mapping table.

    The loaded CP949 decode map is split in two: codes whose bytes are both
    >= 0xa1 go to the KS X 1001 decode map (with the high bit of each byte
    cleared), everything else goes to the UHC extension decode map.  A single
    CP949 encode map is built alongside, and all three are written to
    'mappings_kr.h' in the current working directory.
    """
    source = open_mapping_file('python-mappings/CP949.TXT', MAPPINGS_CP949)
    print("Loading Mapping File...")
    full_decmap = loadmap(source)

    uhcdecmap = {}
    ksx1001decmap = {}
    cp949encmap = {}
    for lead, trail_map in full_decmap.items():
        for trail, ucs in trail_map.items():
            enc_row = cp949encmap.setdefault(ucs >> 8, {})
            if lead < 0xa1 or trail < 0xa1:
                # UHC extension: the native code is stored unchanged (MSB set).
                uhcdecmap.setdefault(lead, {})[trail] = ucs
                enc_row[ucs & 0xFF] = lead << 8 | trail
            else:
                # KS X 1001: both bytes are stored with the high bit cleared.
                ksx1001decmap.setdefault(lead & 0x7f, {})[trail & 0x7f] = ucs
                enc_row[ucs & 0xFF] = (lead << 8 | trail) & 0x7f7f

    with open('mappings_kr.h', 'w') as header:
        print_autogen(header, os.path.basename(__file__))

        print("Generating KS X 1001 decode map...")
        ksx_writer = DecodeMapWriter(header, "ksx1001", ksx1001decmap)
        ksx_writer.update_decode_map(KSX1001_C1, KSX1001_C2)
        ksx_writer.generate()

        print("Generating UHC decode map...")
        uhc_writer = DecodeMapWriter(header, "cp949ext", uhcdecmap)
        for first_range, second_range in ((UHCL1_C1, UHCL1_C2), (UHCL2_C1, UHCL2_C2)):
            uhc_writer.update_decode_map(first_range, second_range)
        uhc_writer.generate()

        print("Generating CP949 (includes KS X 1001) encode map...")
        enc_writer = EncodeMapWriter(header, "cp949", cp949encmap)
        enc_writer.generate()

    print("Done!")


if __name__ == '__main__':
    main()
Loading