diff options
| author | Jules Aguillon | 2024-05-29 15:02:08 +0200 |
|---|---|---|
| committer | Jules Aguillon | 2024-05-29 15:02:08 +0200 |
| commit | 39b3f50aa31df9786e4a10a27633137a4369f6ae (patch) | |
| tree | 96f08b316d5774e06914395d55330e05e442b652 /srcs/compose | |
| parent | f7f1d85f80c9112192385b4b5294a5544ca009f3 (diff) | |
| download | unexpected-keyboard-39b3f50aa31df9786e4a10a27633137a4369f6ae.tar.gz unexpected-keyboard-39b3f50aa31df9786e4a10a27633137a4369f6ae.zip | |
Allow compose sequence ending with more symbols
Change the compose state machine definition to allow final states that
are wider than 16 bits.
This increases the number of sequences that can be used from
en_US_UTF_8_Compose.pre from 2013 to 2043 (of 3201).
Diffstat (limited to 'srcs/compose')
| -rw-r--r-- | srcs/compose/compile.py | 16 |
1 files changed, 10 insertions, 6 deletions
```diff
diff --git a/srcs/compose/compile.py b/srcs/compose/compile.py
index 1e60ae8..2cb060e 100644
--- a/srcs/compose/compile.py
+++ b/srcs/compose/compile.py
@@ -64,11 +64,6 @@ def parse_sequences_file_xkb(fname):
   def parse_seq_result(r):
     if len(r) == 2 and r[0] == '\\':
       return r[1]
-    # The state machine can't represent characters that do not fit in a
-    # 16-bit char. This breaks some sequences that output letters with
-    # combined diacritics or emojis.
-    if len(r) > 1 or ord(r[0]) > 65535:
-      raise Exception("Char out of range: " + r)
     return r
   # Populate [char_names] with the information present in the file.
   with open(fname, "r") as inp:
@@ -146,7 +141,15 @@ def make_automata(tree_root):
       states[i] = (c, node_i)
       i += 1
   def add_leaf(c):
-    states.append((c, 1))
+    # There are two encoding for leafs: character final state for 15-bit
+    # characters and string final state for the rest.
+    if len(c) > 1 or ord(c[0]) > 32767: # String final state
+      cb = c.encode("UTF-16")
+      states.append((-1, len(cb) + 1))
+      for c in cb:
+        states.append((c, 0))
+    else: # Character final state
+      states.append((c, 1))
   def add_node(n):
     if type(n) == str:
       add_leaf(n)
@@ -169,6 +172,7 @@ def gen_java(machine):
   chars_map = {
       # These characters cannot be used in unicode form as Java's parser
       # unescape unicode sequences before parsing.
+      -1: "\\uFFFF",
       "\"": "\\\"",
       "\\": "\\\\",
       "\n": "\\n",
```
