diff options
Diffstat (limited to 'srcs/compose/compile.py')
| -rw-r--r-- | srcs/compose/compile.py | 150 |
1 files changed, 134 insertions, 16 deletions
import textwrap, sys, re, string

# Names not defined in Compose.pre
xkb_char_extra_names = {
    "space": " ",
    "minus": "-",
    "asterisk": "*",
    "colon": ":",
    "equal": "=",
    "exclam": "!",
    "grave": "`",
    "parenleft": "(",
    "parenright": ")",
    "percent": "%",
    "period": ".",
    "plus": "+",
    "question": "?",
    "semicolon": ";",
    "underscore": "_",
}

# Number of sequences that were recognised but could not be represented.
dropped_sequences = 0


# Parse XKB's Compose.pre files
def parse_sequences_file_xkb(fname):
    """Parse an XKB ``Compose.pre`` file into a list of sequences.

    Each returned item is a ``(keys, result)`` pair: ``keys`` is the list of
    characters that follow ``<Multi_key>`` on the left-hand side, ``result``
    is the single character written between double quotes.

    Lines that do not start with ``<Multi_key>`` or that do not look like a
    sequence are ignored. Sequences that use an unknown key name, or a
    character that does not fit in 16 bits, are skipped and counted in the
    module-level ``dropped_sequences``.
    """
    global dropped_sequences
    # Matches a line of the form:
    #   <Multi_key> <minus> <space> : "~" asciitilde # TILDE
    # Groups: 1 = keys, 2 = quoted result, 3 = optional name of the result.
    # Every alternative inside the quotes consumes exactly one character or
    # one escape, so a malformed line cannot trigger exponential backtracking.
    line_re = re.compile(
        r'^((?:\s*<[^>]+>)+)\s*:\s*"((?:[^"\\]|\\.)+)"\s*(\S+)?\s*(?:#.+)?$')
    # Matches one <key>: group 1 for the "U0000" form, group 2 for a name.
    char_re = re.compile(r'\s*<(?:U([a-fA-F0-9]{4,6})|([^>]+))>')
    prefix = "<Multi_key>"
    char_names = dict(xkb_char_extra_names)

    def require_16bit(char):
        # The state machine can't represent characters that do not fit in a
        # 16-bit char. This breaks some sequences that output letters with
        # combined diacritics or emojis.
        if len(char) != 1 or ord(char) > 0xFFFF:
            raise ValueError("Char out of range: " + char)
        return char

    # Interpret character names of the form "U0000" or using [char_names].
    def parse_seq_char(c):
        uchar, named_char = c
        if uchar != "":
            # chr() itself raises ValueError above U+10FFFF.
            return require_16bit(chr(int(uchar, 16)))
        if len(named_char) == 1:
            return named_char
        if named_char not in char_names:
            raise ValueError("Unknown char: " + named_char)
        return char_names[named_char]

    # Interpret the left hand side of a sequence.
    def parse_seq_chars(def_):
        return [parse_seq_char(c) for c in char_re.findall(def_)]

    # Interpret the result of a sequence, as output by [line_re].
    def parse_seq_result(r):
        if len(r) == 2 and r[0] == "\\":
            r = r[1]
        return require_16bit(r)

    # The file holds non-ASCII literals; it is assumed to be UTF-8 encoded, so
    # do not depend on the locale's default encoding.
    with open(fname, "r", encoding="utf-8") as inp:
        lines = inp.readlines()

    # First pass: populate [char_names] with the names present in the file.
    for line in lines:
        m = line_re.match(line)
        if m is None or m.group(3) is None:
            continue
        try:
            char_names[m.group(3)] = parse_seq_result(m.group(2))
        except ValueError:
            # A result that can't be represented can't be used as a key either.
            pass

    # Second pass: parse the sequences.
    seqs = []
    for line in lines:
        if not line.startswith(prefix):
            continue
        m = line_re.match(line[len(prefix):])
        if m is None:
            continue
        try:
            seqs.append((parse_seq_chars(m.group(1)),
                         parse_seq_result(m.group(2))))
        except ValueError:
            dropped_sequences += 1
    return seqs


# Format of the sequences file is determined by its extension
def parse_sequences_file(fname):
    """Parse the sequences file [fname]; only ".pre" files are supported."""
    if fname.endswith(".pre"):
        return parse_sequences_file_xkb(fname)
    raise Exception(fname + ": Unsupported format")


# Turn a list of sequences into a trie.
# NOTE(review): add_sequences_to_trie(seqs, trie) and make_automata(tree_root)
# are defined at this point of the file, but the excerpt under review shows
# only their headers and a few interior lines, so they are deliberately not
# reproduced here. The one visible change inside make_automata is that the
# node header is stored as ("\0", s + 1) instead of (0, s + 1).


def batched(ar, n):
    """Yield successive slices of [ar] of length [n]; the last may be shorter."""
    for start in range(0, len(ar), n):
        yield ar[start:start + n]


# Print the state machine compiled by make_automata into java code that can be
# used by [ComposeKeyData.java].
def gen_java(machine):
    """Print the Java source of ``ComposeKeyData`` for a compiled machine.

    [machine] is a sequence of ``(state, edge)`` pairs. The first components
    are written into the ``states`` array and the second components into the
    ``edges`` array. Both arrays are emitted as Java string literals converted
    with ``toCharArray()``. State entries are one-character strings; edge
    entries are ints.

    Raises ValueError if an entry does not fit in a 16-bit Java ``char``.
    """
    # These characters cannot be used in unicode form (nor, for the line
    # terminators, literally) as Java's parser unescapes unicode sequences
    # before parsing.
    escapes = {
        "\"": "\\\"",
        "\\": "\\\\",
        "\n": "\\n",
        "\r": "\\r",
    }
    # Characters that are written as-is. Other whitespace goes through a
    # unicode escape to keep the generated file readable.
    literal_chars = (set(string.printable) - set(string.whitespace)) | {" "}

    def char_repr(c):
        is_int = isinstance(c, int)  # The edges array contains ints
        code = c if is_int else ord(c)
        if not 0 <= code <= 0xFFFF:
            # "%04x" would produce more than four digits, and Java would
            # read the extra ones as separate characters.
            raise ValueError("Value does not fit in a Java char: %r" % (c,))
        char = chr(code)
        if char in escapes:
            return escapes[char]
        if not is_int and char in literal_chars:
            return char
        return "\\u%04x" % code

    # Break the array every few characters using string concatenation.
    def gen_array(array):
        chars = [char_repr(c) for c in array]
        rows = ("".join(chars[i:i + 72]) for i in range(0, len(chars), 72))
        return "\" +\n    \"".join(rows)

    print("""package juloo.keyboard2;

/** This file is generated, see [srcs/compose/compile.py]. */

public final class ComposeKeyData
{
  public static final char[] states =
    ("%s").toCharArray();

  public static final char[] edges =
    ("%s").toCharArray();
}""" % (
        gen_array(s[0] for s in machine),
        gen_array(s[1] for s in machine),
    ))


if __name__ == "__main__":
    total_sequences = 0
    # NOTE(review): the initialisation of the trie lies outside the visible
    # part of the file; an empty dict is assumed here - confirm.
    trie = {}
    for fname in sys.argv[1:]:
        sequences = parse_sequences_file(fname)
        add_sequences_to_trie(sequences, trie)
        total_sequences += len(sequences)
    automata = make_automata(trie)
    gen_java(automata)
    print("Compiled %d sequences into %d states. Dropped %d sequences." % (total_sequences, len(automata), dropped_sequences), file=sys.stderr)
