diff options
| author | Jules Aguillon | 2024-06-09 10:35:38 +0200 |
|---|---|---|
| committer | Jules Aguillon | 2024-06-09 10:35:38 +0200 |
| commit | a886f6eedeafbe06c6de0cf68fbd0f81655af597 (patch) | |
| tree | 012ed3fa9f4b2e0ed171ce92bf98a58aff877eb0 /srcs/compose | |
| parent | 1197ce36b4b6f67ab7343a6dc4f2258b52b73358 (diff) | |
| download | unexpected-keyboard-a886f6eedeafbe06c6de0cf68fbd0f81655af597.tar.gz unexpected-keyboard-a886f6eedeafbe06c6de0cf68fbd0f81655af597.zip | |
compose: Fix misbehaviour caused by encoding errors
The compose data compiler produced encoding errors for two reasons:
- 'UTF-16' adds a BOM; use 'UTF-16-LE' instead.
- 'str.encode' returns a byte array; use 'array' to get an array of 16-bit
chars.
Diffstat (limited to 'srcs/compose')
| -rw-r--r-- | srcs/compose/compile.py | 39 |
1 files changed, 27 insertions, 12 deletions
diff --git a/srcs/compose/compile.py b/srcs/compose/compile.py index aa3b7f8..14dc410 100644 --- a/srcs/compose/compile.py +++ b/srcs/compose/compile.py @@ -1,4 +1,5 @@ import textwrap, sys, re, string, json, os +from array import array # Parse symbol names from keysymdef.h. Many compose sequences in # en_US_UTF_8_Compose.pre reference theses. For example, all the sequences on @@ -41,16 +42,21 @@ def parse_sequences_file_xkb(fname): return def_, result char_names = { **xkb_char_extra_names } # Interpret character names of the form "U0000" or using [char_names]. - def parse_seq_char(c): - uchar, named_char = c + def parse_seq_char(sc): + uchar, named_char = sc if uchar != "": - return chr(int(uchar, 16)) - # else is a named char - if len(named_char) == 1: - return named_char - if not named_char in char_names: - raise Exception("Unknown char: " + named_char) - return char_names[named_char] + c = chr(int(uchar, 16)) + elif len(named_char) == 1: + c = named_char + else: + if not named_char in char_names: + raise Exception("Unknown char: " + named_char) + c = char_names[named_char] + # The state machine can't represent sequence characters that do not fit + # in a 16-bit char. + if len(c) > 1 or ord(c[0]) > 65535: + raise Exception("Char out of range: " + r) + return c # Interpret the left hand side of a sequence. def parse_seq_chars(def_): return list(map(parse_seq_char, re.findall(char_re, def_))) @@ -138,9 +144,9 @@ def make_automata(tree_root): # There are two encoding for leafs: character final state for 15-bit # characters and string final state for the rest. 
if len(c) > 1 or ord(c[0]) > 32767: # String final state - cb = c.encode("UTF-16") - states.append((-1, len(cb) + 1)) - for c in cb: + javachars = array('H', c.encode("UTF-16-LE")) + states.append((-1, len(javachars) + 1)) + for c in javachars: states.append((c, 0)) else: # Character final state states.append((c, 1)) @@ -152,6 +158,14 @@ def make_automata(tree_root): add_tree(tree_root) return states +# Debug +def print_automata(automata): + i = 0 + for (s, e) in automata: + s = "%#06x" % s if isinstance(s, int) else '"%s"' % str(s) + print("%3d %8s %d" % (i, s, e), file=sys.stderr) + i += 1 + def batched(ar, n): i = 0 while i + n < len(ar): @@ -213,3 +227,4 @@ for fname in sys.argv[1:]: automata = make_automata(trie) gen_java(automata) print("Compiled %d sequences into %d states. Dropped %d sequences." % (total_sequences, len(automata), dropped_sequences), file=sys.stderr) +# print_automata(automata) |
