diff options
Diffstat (limited to 'scripts/subst_of_compose.py')
| -rw-r--r-- | scripts/subst_of_compose.py | 90 |
1 files changed, 90 insertions, 0 deletions
# Create a substitution file for cdict from the various compose mappings.
# This is used when building the dictionaries and when making word suggestions
# disregarding case and diacritics.

import glob
import json
import os
import sys
import unicodedata

OUTPUT_FILE = "srcs/compose/substitutions.json"


def warn(msg):
    """Print a warning message on stderr."""
    print("Warning: " + msg, file=sys.stderr)


# From srcs/compose/compile.py
def strip_cstyle_comments(inp):
    """Return the lines of 'inp' joined together with '//' comments removed.

    'inp' is an iterable of lines (for example an open text file). Everything
    from the first '//' of a line up to its end is replaced by a newline.

    Note: the search does not understand JSON strings, a '//' located inside
    a string literal is also treated as the start of a comment.
    """
    def strip_line(line):
        i = line.find("//")
        return line[:i] + "\n" if i >= 0 else line
    return "".join(map(strip_line, inp))


def parse(fname):
    """Parse the JSON file 'fname', allowing '//' comments in it.

    Raises json.JSONDecodeError if the content is not valid JSON once the
    comments are removed.
    """
    # Compose files contain non-ASCII characters, don't rely on the locale's
    # default encoding to decode them.
    with open(fname, "r", encoding="utf-8") as inp:
        return json.loads(strip_cstyle_comments(inp))


def is_char16(c):
    """Whether 'c' is a single character that fits in a Java 16-bit char."""
    return len(c) == 1 and ord(c) < 65536


def get_mappings(tree):
    """Yield the '(c, r)' pairs of the top-level entries of 'tree' that map a
    single character to a single character."""
    for c, r in tree.items():
        # Remove deep compose sequences and remove mappings to non-char keys or
        # to characters that do not fit in a Java 16-bit char.
        if isinstance(r, str) and is_char16(r) and is_char16(c):
            yield c, r


def mappings_from_compose_files():
    """Yield the mappings of every compose file in 'srcs/compose', skipping
    the generated output file."""
    output = os.path.normpath(OUTPUT_FILE)
    # glob returns files in arbitrary order, sort them to make the order of
    # the mappings (and so the generated file) reproducible.
    for f in sorted(glob.glob("srcs/compose/*.json")):
        # Compare normalized paths as glob might use a different separator
        # than the one used in OUTPUT_FILE.
        if os.path.normpath(f) == output:
            continue
        yield from get_mappings(parse(f))


# The definition of shift doesn't contain any letters as shift is implemented
# using Java's API so we generate it using Python's API. It's not important if
# both are not equivalent.
def add_case_variants(mappings):
    """Yield 'mappings' extended with case variants.

    The ASCII lowercase to uppercase pairs are yielded first. Then, for each
    '(c, r)' mapping, the lowercase variant of 'c' and the uppercase variant of
    'r' are yielded before the mapping itself, when they differ from the
    original and fit in a 16-bit char.
    """
    for c in "abcdefghijklmnopqrstuvwxyz":
        yield c, c.upper()
    for c, r in mappings:
        c_low = c.lower()
        if c_low != c and is_char16(c_low):
            yield c_low, r
        r_up = r.upper()
        if r_up != r and is_char16(r_up):
            yield c, r_up
        yield c, r


# Remove unnecessary characters to reduce the lookup time
ALLOWED_CAT = ["Ll", "Lu", "Lt", "Lo"]


def remove_non_letters(mappings):
    """Yield the '(c, r)' mappings for which the Unicode category of 'c' is in
    ALLOWED_CAT."""
    for c, r in mappings:
        if unicodedata.category(c) in ALLOWED_CAT:
            yield c, r


def resolve_mappings(mappings):
    """Build the substitution table from '(c, r)' mappings.

    Returns a dict mapping each 'r' to the character found by following the
    substitutions starting from its 'c' until a character that has no
    substitution, or that was already visited, is reached.

    When several different 'c' map to the same 'r', the first one in the input
    is kept and a warning is printed for the others.
    """
    m = {}
    # Sort mappings to keep the lowest char in case of a conflict
    # The sort is stable: mappings to the same 'r' stay in input order.
    for c, r in sorted(mappings, key=lambda it: it[1]):
        if r in m:
            if m[r] != c:
                warn("Conflicting mapping '%s -> %s' and '%s -> %s'" %
                     (c, r, m[r], r))
            continue
        m[r] = c

    def resolve(c):
        # Follow the chain of substitutions, stopping on the first character
        # already visited to not loop forever on a cycle.
        seen = set()
        while c in m and c not in seen:
            seen.add(c)
            c = m[c]
        return c

    return {r: resolve(c) for r, c in m.items()}


def main():
    """Generate OUTPUT_FILE from the compose files."""
    # Compute the substitutions before opening the output file to not truncate
    # the previous output if reading the compose files fails.
    substitutions = resolve_mappings(
        add_case_variants(
            remove_non_letters(
                mappings_from_compose_files())))
    # Non-ASCII characters are written unescaped, don't rely on the locale's
    # default encoding to be able to encode them.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        json.dump(substitutions, out, ensure_ascii=False, indent=2)
    print("Generated " + OUTPUT_FILE)


if __name__ == "__main__":
    main()
