Allow compose sequence ending with more symbols

Change the compose state machine definition to allow final states that are wider than 16 bits. This increases the number of sequences that can be used from en_US_UTF_8_Compose.pre from 2013 to 2043 (of 3201).
author: Jules Aguillon 2024-05-29 15:02:08 +0200
committer: Jules Aguillon 2024-05-29 15:02:08 +0200
commit: 39b3f50aa31df9786e4a10a27633137a4369f6ae (patch)
tree: 96f08b316d5774e06914395d55330e05e442b652 /srcs/compose
parent: f7f1d85f80c9112192385b4b5294a5544ca009f3 (diff)
download: unexpected-keyboard-39b3f50aa31df9786e4a10a27633137a4369f6ae.tar.gz
unexpected-keyboard-39b3f50aa31df9786e4a10a27633137a4369f6ae.zip
1 files changed, 10 insertions, 6 deletions
diff --git a/srcs/compose/compile.py b/srcs/compose/compile.py
index 1e60ae8..2cb060e 100644
--- a/srcs/compose/compile.py
+++ b/srcs/compose/compile.py
@@ -64,11 +64,6 @@ def parse_sequences_file_xkb(fname):
     def parse_seq_result(r):
         if len(r) == 2 and r[0] == '\\':
             return r[1]
-        # The state machine can't represent characters that do not fit in a
-        # 16-bit char. This breaks some sequences that output letters with
-        # combined diacritics or emojis.
-        if len(r) > 1 or ord(r[0]) > 65535:
-            raise Exception("Char out of range: " + r)
         return r
     # Populate [char_names] with the information present in the file.
     with open(fname, "r") as inp:
@@ -146,7 +141,15 @@ def make_automata(tree_root):
             states[i] = (c, node_i)
             i += 1
     def add_leaf(c):
-        states.append((c, 1))
+        # There are two encoding for leafs: character final state for 15-bit
+        # characters and string final state for the rest.
+        if len(c) > 1 or ord(c[0]) > 32767: # String final state
+            cb = c.encode("UTF-16")
+            states.append((-1, len(cb) + 1))
+            for c in cb:
+                states.append((c, 0))
+        else: # Character final state
+            states.append((c, 1))
     def add_node(n):
         if type(n) == str:
             add_leaf(n)
@@ -169,6 +172,7 @@ def gen_java(machine):
     chars_map = {
             # These characters cannot be used in unicode form as Java's parser
             # unescape unicode sequences before parsing.
+            -1: "\\uFFFF",
             "\"": "\\\"",
             "\\": "\\\\",
             "\n": "\\n",
author	Jules Aguillon	2024-05-29 15:02:08 +0200
committer	Jules Aguillon	2024-05-29 15:02:08 +0200
commit	39b3f50aa31df9786e4a10a27633137a4369f6ae (patch)
tree	96f08b316d5774e06914395d55330e05e442b652 /srcs/compose
parent	f7f1d85f80c9112192385b4b5294a5544ca009f3 (diff)
download	unexpected-keyboard-39b3f50aa31df9786e4a10a27633137a4369f6ae.tar.gz unexpected-keyboard-39b3f50aa31df9786e4a10a27633137a4369f6ae.zip