# scripts/subst_of_compose.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

# Create a substitution file for cdict from the various compose mappings.
# This is used when building the dictionaries and when making word suggestions
# disregarding case and diacritics.

import sys, os, json, glob, unicodedata

OUTPUT_FILE = "srcs/compose/substitutions.json"

def warn(msg):
    """Print *msg* on standard error, prefixed with "Warning: "."""
    text = "Warning: " + msg
    print(text, file=sys.stderr)

# From srcs/compose/compile.py
def strip_cstyle_comments(inp):
    """Remove `//` line comments from JSON-like text.

    inp is an iterable of lines (for example an open text file). Everything
    from the first `//` that is located outside of a double-quoted string up
    to the end of the line is replaced by a single newline. Lines without a
    comment are kept unchanged. Returns the concatenation of all the lines.

    A `//` inside a string literal (for example "http://example") is not a
    comment and is preserved, otherwise the string would be cut in half and
    the result would no longer be valid JSON.
    """
    def comment_start(line):
        # Index of the first `//` outside of a string literal, or -1.
        in_string = False
        escaped = False
        for i, ch in enumerate(line):
            if in_string:
                if escaped:
                    # Character following a backslash, never closes the string
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == "/" and line.startswith("//", i):
                return i
        return -1
    def strip_line(line):
        i = comment_start(line)
        return line[:i] + "\n" if i >= 0 else line
    return "".join(map(strip_line, inp))

def parse(fname):
    """Load the JSON file *fname*, ignoring `//` comments.

    Returns the decoded JSON value. Raises OSError if the file cannot be
    opened and json.JSONDecodeError if what remains after stripping the
    comments is not valid JSON.
    """
    # Read as UTF-8 explicitly: the compose files contain non-ASCII
    # characters and the default encoding depends on the platform's locale.
    with open(fname, "r", encoding="utf-8") as inp:
        return json.loads(strip_cstyle_comments(inp))

def is_char16(c):
    """Tell whether *c* is exactly one character whose code point is below
    65536, i.e. fits in a 16-bit char."""
    if len(c) != 1:
        return False
    return ord(c) <= 0xFFFF

def get_mappings(tree):
    """Yield the (key, value) pairs of *tree* that map one char to one char.

    Entries whose value is not a string (deeper compose sequences) are
    skipped, as well as entries whose key or value is not a single character
    fitting in a Java 16-bit char.
    """
    for key, target in tree.items():
        if not isinstance(target, str):
            # Nested sequence or other non-char result
            continue
        if is_char16(target) and is_char16(key):
            yield key, target

def mappings_from_compose_files():
    """Yield the char-to-char mappings of every srcs/compose/*.json file.

    The generated OUTPUT_FILE lives in the same directory and is skipped.
    Files are read in sorted order: glob returns them in arbitrary order and
    the order decides which mapping is kept when two files conflict, so
    sorting makes the generated output reproducible.
    """
    # Compare normalized paths: glob may join the directory and the file name
    # with a different separator than the one written in OUTPUT_FILE.
    output_path = os.path.normpath(OUTPUT_FILE)
    for f in sorted(glob.glob("srcs/compose/*.json")):
        if os.path.normpath(f) == output_path:
            continue
        yield from get_mappings(parse(f))

# The definition of shift doesn't contain any letters as shift is implemented
# using Java's API so we generate it using Python's API. It's not important if
# both are not equivalent.
def add_case_variants(mappings):
    """Extend *mappings* with case variants.

    First yields each ASCII lowercase letter paired with its uppercase form.
    Then, for every (c, r) of *mappings*, yields in this order:
    - (lowercase of c, r) when it differs from c and fits in a 16-bit char,
    - (c, uppercase of r) when it differs from r and fits in a 16-bit char,
    - (c, r) itself.
    """
    yield from ((letter, letter.upper())
                for letter in "abcdefghijklmnopqrstuvwxyz")
    for base, composed in mappings:
        lowered = base.lower()
        if lowered != base and is_char16(lowered):
            yield lowered, composed
        raised = composed.upper()
        if raised != composed and is_char16(raised):
            yield base, raised
        yield base, composed

# Remove unnecessary characters to reduce the lookup time
# Unicode general categories that are kept: lowercase, uppercase, titlecase
# and other letters.
ALLOWED_CAT = [ "Ll", "Lu", "Lt", "Lo" ]
def remove_non_letters(mappings):
    """Yield the (c, r) pairs of *mappings* whose first char *c* belongs to
    one of the ALLOWED_CAT categories. The second char is not checked."""
    for base, composed in mappings:
        if unicodedata.category(base) not in ALLOWED_CAT:
            continue
        yield base, composed

def resolve_mappings(mappings):
    """Turn (c, r) pairs into a dict from each r to its final replacement.

    Only one c is kept for each r: the first one met after ordering the
    pairs by r. A later pair with the same r and a different c is reported
    with warn() and ignored. Replacements are then followed transitively:
    when the c kept for an r is itself a key of the table, it is replaced by
    its own replacement, and so on.
    """
    # Visit the pairs ordered by their r. The sort is stable, pairs sharing
    # the same r stay in their input order.
    ordered = list(mappings)
    ordered.sort(key=lambda pair: pair[1])
    first_seen = {}
    for base, composed in ordered:
        if composed not in first_seen:
            first_seen[composed] = base
        elif first_seen[composed] != base:
            warn("Conflicting mapping '%s -> %s' and '%s -> %s'" %
                  (base, composed, first_seen[composed], composed))
    def follow(char):
        # Walk the chain of replacements starting at char. Stop on a char
        # that has no replacement or that was already visited (cycle).
        visited = set()
        while char in first_seen and char not in visited:
            visited.add(char)
            char = first_seen[char]
        return char
    resolved = {}
    for composed, base in first_seen.items():
        resolved[composed] = follow(base)
    return resolved


# Build the whole table before opening the output file, so that an error
# while reading the compose files doesn't leave a truncated output behind.
substitutions = resolve_mappings(
        add_case_variants(
            remove_non_letters(
                mappings_from_compose_files())))

# The table is written with ensure_ascii=False and contains non-ASCII
# characters: write it as UTF-8 instead of relying on the locale's encoding.
with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
    json.dump(substitutions, out, ensure_ascii=False, indent=2)

print("Generated " + OUTPUT_FILE)