abouttreesummaryrefslogcommitdiff
path: root/srcs/compose/compile.py
diff options
context:
space:
mode:
Diffstat (limited to 'srcs/compose/compile.py')
-rw-r--r--srcs/compose/compile.py150
1 files changed, 134 insertions, 16 deletions
diff --git a/srcs/compose/compile.py b/srcs/compose/compile.py
index 214d4b8..82c600e 100644
--- a/srcs/compose/compile.py
+++ b/srcs/compose/compile.py
@@ -1,8 +1,99 @@
-import textwrap, sys
+import textwrap, sys, re, string
-def parse_sequences_file(fname):
# Keysym names that are not defined in Compose.pre itself, mapped to the
# character each one stands for.
xkb_char_extra_names = dict(
    space=" ",
    minus="-",
    asterisk="*",
    colon=":",
    equal="=",
    exclam="!",
    grave="`",
    parenleft="(",
    parenright=")",
    percent="%",
    period=".",
    plus="+",
    question="?",
    semicolon=";",
    underscore="_",
)
+
+dropped_sequences = 0
+
# Parse XKB's Compose.pre files
def parse_sequences_file_xkb(fname):
    """Parse an XKB Compose.pre file into a list of compose sequences.

    Only lines starting with the Multi_key token produce sequences; every
    other line is ignored (apart from being used to learn keysym names).

    Args:
        fname: Path of the Compose.pre file. It is read as UTF-8.

    Returns:
        A list of (keys, result) tuples. keys is the list of characters
        that follow Multi_key and result is the single character produced.

    Sequences that use an unknown keysym name, or whose keys or result do
    not fit in a 16-bit char, are skipped and counted in the module-level
    dropped_sequences counter.
    """
    # Matches a line of the form:
    #   <Multi_key> <minus> <space> : "~" asciitilde # TILDE
    # Group 1 is the run of <key> tokens, group 2 the quoted result and
    # group 3 the optional keysym name that follows the result.
    line_re = re.compile(r'^((?:\s*<[^>]+>)+)\s*:\s*"((?:[^"\\]+|\\.)+)"\s*(\S+)?\s*(?:#.+)?$')
    # Matches one <key> token: either <Uxxxx> (group 1) or a name (group 2).
    char_re = re.compile(r'\s*<(?:U([a-fA-F0-9]{4,6})|([^>]+))>')
    prefix = "<Multi_key>"
    # The state machine can't represent characters that do not fit in a
    # 16-bit char. This breaks some sequences that output letters with
    # combined diacritics or emojis.
    max_char = 0xFFFF
    char_names = {**xkb_char_extra_names}

    class Unrepresentable(Exception):
        """A sequence that cannot be encoded; it is dropped, not fatal."""

    # Interpret the result of a sequence, as output by [line_re].
    def parse_seq_result(r):
        if len(r) == 2 and r[0] == '\\':
            r = r[1]
        if len(r) > 1 or ord(r) > max_char:
            raise Unrepresentable("Char out of range: " + r)
        return r

    # Interpret character names of the form "U0000" or using [char_names].
    def parse_seq_char(c):
        uchar, named_char = c
        if uchar != "":
            code = int(uchar, 16)
            # Checked before chr() so oversized code points are dropped too.
            if code > max_char:
                raise Unrepresentable("Char out of range: U" + uchar)
            return chr(code)
        # Otherwise it is a named char; one-letter names stand for themselves.
        if len(named_char) == 1:
            char = named_char
        elif named_char in char_names:
            char = char_names[named_char]
        else:
            raise Unrepresentable("Unknown char: " + named_char)
        if ord(char) > max_char:
            raise Unrepresentable("Char out of range: " + char)
        return char

    # Interpret the left hand side of a sequence.
    def parse_seq_chars(def_):
        return [parse_seq_char(c) for c in char_re.findall(def_)]

    # Sequences not starting with <Multi_key> are ignored.
    def parse_seq_line(line):
        global dropped_sequences
        if not line.startswith(prefix):
            return None
        m = line_re.match(line[len(prefix):])
        if m is None:
            return None
        try:
            def_ = parse_seq_chars(m.group(1))
            result = parse_seq_result(m.group(2))
        except Unrepresentable:
            dropped_sequences += 1
            return None
        return def_, result

    # Compose.pre is UTF-8 text; do not depend on the locale's default
    # encoding. The lines are kept because two passes are needed: a keysym
    # name may be used before the line that defines it.
    with open(fname, "r", encoding="utf-8") as inp:
        lines = inp.readlines()

    # Populate [char_names] with the information present in the file.
    for line in lines:
        m = line_re.match(line)
        if m is None or m.group(3) is None:
            continue
        try:
            char_names[m.group(3)] = parse_seq_result(m.group(2))
        except Unrepresentable:
            # The name stays undefined; sequences using it will be dropped.
            pass

    # Parse the sequences
    seqs = []
    for line in lines:
        s = parse_seq_line(line)
        if s is not None:
            seqs.append(s)
    return seqs
+
# Format of the sequences file is determined by its extension
def parse_sequences_file(fname):
    """Parse *fname* with the parser selected by its file extension.

    Names ending in ".pre" are handed to parse_sequences_file_xkb; any
    other name raises Exception.
    """
    if not fname.endswith(".pre"):
        raise Exception(fname + ": Unsupported format")
    return parse_sequences_file_xkb(fname)
# Turn a list of sequences into a trie.
def add_sequences_to_trie(seqs, trie):
@@ -26,7 +117,7 @@ def make_automata(tree_root):
i = len(states)
s = len(t.keys())
# Add node header
- states.append((0, s + 1))
+ states.append(("\0", s + 1))
i += 1
# Reserve space for the current node in both arrays
for c in range(s):
@@ -47,27 +138,53 @@ def make_automata(tree_root):
add_tree(tree_root)
return states
def batched(ar, n):
    """Yield consecutive slices of *ar* holding *n* items each.

    The last slice is shorter when len(ar) is not a multiple of *n*, and
    nothing is yielded for an empty *ar*.

    Args:
        ar: A sequence supporting len() and slicing.
        n: Number of items per slice; must be at least 1.

    Raises:
        ValueError: If *n* is smaller than 1 (raised on first iteration).
    """
    # A step of 0 would never advance and a negative one walks backwards.
    if n < 1:
        raise ValueError("batch size must be at least 1")
    for start in range(0, len(ar), n):
        yield ar[start:start + n]
+
# Print the state machine compiled by make_automata into java code that can be
# used by [ComposeKeyData.java].
def gen_java(machine):
    """Print the Java source of the ComposeKeyData class to stdout.

    Args:
        machine: Iterable of (state, edge) pairs. Each value is either a
            one-character string or an int and is written as one Java char.

    Raises:
        ValueError: If a value needs a unicode escape but does not fit in a
            16-bit Java char.
    """
    machine = list(machine)  # Iterated twice: once per generated array.
    # These characters cannot be used in unicode form as Java's parser
    # unescapes unicode sequences before parsing. A line terminator (LF or
    # CR) would then end up inside the string literal, and a quote or a
    # backslash would alter it.
    escapes = {
        "\"": "\\\"",
        "\\": "\\\\",
        "\n": "\\n",
        "\r": "\\r",
    }
    # The edges array contains ints; register the same escapes by code point.
    escapes.update({ord(char): esc for char, esc in list(escapes.items())})
    max_char = 0xFFFF

    def char_repr(c):
        if c in escapes:
            return escapes[c]
        if isinstance(c, int):
            code = c
        elif c in string.printable:
            return c
        else:
            code = ord(c)
        # Java reads exactly four hex digits after \u, so a wider value
        # would silently turn into two unrelated chars.
        if not 0 <= code <= max_char:
            raise ValueError("Value does not fit in a Java char: %r" % (c,))
        return "\\u%04x" % code

    def gen_array(array):
        chars = [char_repr(c) for c in array]
        # Break the array every few characters using string concatenation.
        # Splitting between items never cuts an escape sequence in half.
        chunks = ("".join(chars[i:i + 72]) for i in range(0, len(chars), 72))
        return "\" +\n    \"".join(chunks)

    print("""package juloo.keyboard2;
/** This file is generated, see [srcs/compose/compile.py]. */
public final class ComposeKeyData
{
  public static final char[] states =
    ("%s").toCharArray();
  public static final char[] edges =
    ("%s").toCharArray();
}""" % (
        gen_array(s[0] for s in machine),
        gen_array(s[1] for s in machine),
    ))
total_sequences = 0
@@ -76,5 +193,6 @@ for fname in sys.argv[1:]:
sequences = parse_sequences_file(fname)
add_sequences_to_trie(sequences, trie)
total_sequences += len(sequences)
-gen_java(make_automata(trie))
-print("Compiled %d sequences" % total_sequences, file=sys.stderr)
+automata = make_automata(trie)
+gen_java(automata)
+print("Compiled %d sequences into %d states. Dropped %d sequences." % (total_sequences, len(automata), dropped_sequences), file=sys.stderr)