Better suggestion with diacritics (#1223)

* Update cdict * scripts/subst_of_compose.py: Compute substitutions from compose mappings. They are used when building dictionaries. * Add substitutions compose data * Better suggestion with diacritics This improves the suggestions for words that contain diacritics and uppercase letters. This works by stripping diacritics both when building the dictionaries (using word aliases added in cdict: https://github.com/Julow/cdict/pull/3) and during lookup. Cdict then takes care of resolving the correct word. The substitutions are generated using mappings from `fn`, `shift` and all the `accent_*` modifiers into srcs/compose/substitutions.json This can be updated easily when more mappings are added.
author: Jules Aguillon 2026-04-10 19:08:23 +0200
committer: GitHub 2026-04-10 19:08:23 +0200
commit: 833b4a21549f7ea8067291f344f6acdeaff3f079 (patch)
tree: 81448d90ea6260e49a0b7e02595f8c4f5508eda0 /scripts/subst_of_compose.py
parent: 8921de7f02570ba03816ebb4025f2c95bc131707 (diff)
download: unexpected-keyboard-833b4a21549f7ea8067291f344f6acdeaff3f079.tar.gz
unexpected-keyboard-833b4a21549f7ea8067291f344f6acdeaff3f079.zip
1 files changed, 90 insertions, 0 deletions
diff --git a/scripts/subst_of_compose.py b/scripts/subst_of_compose.py
new file mode 100644
index 0000000..cb5fc18
--- /dev/null
+++ b/scripts/subst_of_compose.py
@@ -0,0 +1,90 @@
+# Create a substitution file for cdict from the various compose mappings.
+# This is used when building the dictionaries and when making word suggestions
+# disregarding case and diacritics.
+
+import sys, os, json, glob, unicodedata
+
+OUTPUT_FILE = "srcs/compose/substitutions.json"  # written by this script and skipped when reading the inputs
+
+def warn(msg):  # Report a non-fatal problem on stderr, prefixed with "Warning: ".
+    sys.stderr.write("Warning: " + msg + "\n")
+
+# From srcs/compose/compile.py
+def strip_cstyle_comments(inp):  # inp: iterable of lines; returns them joined, without "//" comments
+    def without_comment(text):  # NOTE: also cuts at a "//" that occurs inside a JSON string
+        head, sep, _ = text.partition("//")
+        return head + "\n" if sep else text
+    return "".join(without_comment(text) for text in inp)
+
+def parse(fname):  # Load one compose JSON file, ignoring its "//" comments; returns the decoded JSON value.
+    with open(fname, "r", encoding="utf-8") as inp:  # explicit: compose data is non-ASCII, don't rely on the locale
+        return json.loads(strip_cstyle_comments(inp))
+
+def is_char16(c):  # True when c is exactly one character whose code point fits in 16 bits
+    return len(c) == 1 and ord(c) <= 0xFFFF
+
+def get_mappings(tree):
+    # Yield the direct char -> char entries of a compose tree. Deep compose
+    # sequences (non-string values), mappings to non-char keys and characters
+    # that do not fit in a Java 16-bit char are skipped.
+    pairs = ((src, dst) for src, dst in tree.items() if isinstance(dst, str))
+    yield from (p for p in pairs if is_char16(p[1]) and is_char16(p[0]))
+
+def mappings_from_compose_files():  # Yield the (char, result) pairs of every compose file, in file-name order.
+    for f in sorted(glob.glob("srcs/compose/*.json")):  # glob order is arbitrary; sort for reproducible output
+        if os.path.normpath(f) == os.path.normpath(OUTPUT_FILE):  # separator-insensitive comparison
+            continue  # never feed a previously generated file back in
+        yield from get_mappings(parse(f))
+
+# The definition of shift doesn't contain any letters as shift is implemented
+# using Java's API so we generate it using Python's API. It's not important if
+# both are not equivalent.
+def add_case_variants(mappings):
+    yield from ((low, low.upper()) for low in "abcdefghijklmnopqrstuvwxyz")
+    for src, dst in mappings:
+        src_lower, dst_upper = src.lower(), dst.upper()
+        if src_lower != src and is_char16(src_lower):
+            yield src_lower, dst  # same result, paired with the lowercased source
+        if dst_upper != dst and is_char16(dst_upper):
+            yield src, dst_upper  # uppercased result, paired with the same source
+        yield src, dst
+
+# Remove unnecessary characters to reduce the lookup time
+ALLOWED_CAT = [ "Ll", "Lu", "Lt", "Lo" ]  # Unicode general categories that are kept
+def remove_non_letters(mappings):  # Keep the pairs whose first char is in an allowed category.
+    yield from (
+        (src, dst) for src, dst in mappings
+        if unicodedata.category(src) in ALLOWED_CAT
+    )
+
+def resolve_mappings(mappings):  # Turn (source, result) pairs into { result: fully resolved source }.
+    m = {}
+    # Sort by (result, source) so that the lowest source char is kept in case of a conflict
+    for c, r in sorted(mappings, key=lambda it: (it[1], it[0])):
+        if r in m:
+            if m[r] != c:
+                warn("Conflicting mapping '%s -> %s' and '%s -> %s'" %
+                      (c, r, m[r], r))
+            continue
+        m[r] = c
+    def resolve(c, trace=None):  # Follow m from c until reaching a char that maps no further
+        if c in m:
+            if trace is None:
+                trace = set()
+            elif c in trace:
+                return c  # cycle: stop here instead of recursing forever
+            trace.add(c)
+            return resolve(m[c], trace=trace)
+        return c
+    return { r: resolve(c) for r, c in m.items() }
+
+
+substitutions = resolve_mappings(
+    add_case_variants(
+        remove_non_letters(
+            mappings_from_compose_files())))
+# Computed before opening, so a failure above doesn't leave a truncated file.
+with open(OUTPUT_FILE, "w", encoding="utf-8") as out:  # ensure_ascii=False emits raw non-ASCII chars
+    json.dump(substitutions, out, ensure_ascii=False, indent=2)
+
+print("Generated " + OUTPUT_FILE)
author	Jules Aguillon	2026-04-10 19:08:23 +0200
committer	GitHub	2026-04-10 19:08:23 +0200
commit	833b4a21549f7ea8067291f344f6acdeaff3f079 (patch)
tree	81448d90ea6260e49a0b7e02595f8c4f5508eda0 /scripts/subst_of_compose.py
parent	8921de7f02570ba03816ebb4025f2c95bc131707 (diff)
download	unexpected-keyboard-833b4a21549f7ea8067291f344f6acdeaff3f079.tar.gz unexpected-keyboard-833b4a21549f7ea8067291f344f6acdeaff3f079.zip