From 833b4a21549f7ea8067291f344f6acdeaff3f079 Mon Sep 17 00:00:00 2001
From: Jules Aguillon
Date: Fri, 10 Apr 2026 19:08:23 +0200
Subject: Better suggestion with diacritics (#1223)

* Update cdict

* scripts/subst_of_compose.py: Compute substitutions

from compose mappings. They are used when building dictionaries.

* Add substitutions compose data

* Better suggestion with diacritics

This improves the suggestions for words that contain diacritics and
uppercase letters.

This works by stripping diacritics both when building the dictionaries
(using word aliases added in cdict: https://github.com/Julow/cdict/pull/3)
and during lookup. Cdict then takes care of resolving the correct word.

The substitutions are generated using mappings from `fn`, `shift` and
all the `accent_*` modifiers into srcs/compose/substitutions.json.
This can be updated easily when more mappings are added.
---
 scripts/subst_of_compose.py | 90 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 scripts/subst_of_compose.py

(limited to 'scripts')

diff --git a/scripts/subst_of_compose.py b/scripts/subst_of_compose.py
new file mode 100644
index 0000000..cb5fc18
--- /dev/null
+++ b/scripts/subst_of_compose.py
@@ -0,0 +1,90 @@
+# Create a substitution file for cdict from the various compose mappings.
+# This is used when building the dictionaries and when making word suggestions
+# disregarding case and diacritics.
+
+import sys, os, json, glob, unicodedata
+
+OUTPUT_FILE = "srcs/compose/substitutions.json"
+
+# Report a non-fatal problem: write 'msg' prefixed with "Warning: " and
+# followed by a newline to the standard error stream.
+def warn(msg):
+    sys.stderr.write("Warning: " + msg + "\n")
+
+# From srcs/compose/compile.py
+# Concatenate the lines of 'inp' (an iterable of lines, such as an open text
+# file) into a single string with the '//' comments removed. Everything from
+# the first '//' of a line onwards is replaced by a newline; lines without
+# '//' are kept unchanged. The search is purely textual, a '//' found inside
+# a JSON string is also treated as the start of a comment.
+def strip_cstyle_comments(inp):
+    def without_comment(line):
+        code, marker, _ = line.partition("//")
+        return code + "\n" if marker else line
+    return "".join(without_comment(line) for line in inp)
+
+# Load the compose file 'fname', which is JSON with '//' comments allowed, and
+# return the decoded JSON value.
+def parse(fname):
+    # Decode as UTF-8 explicitly: without 'encoding', open() uses the locale's
+    # preferred encoding, which is not UTF-8 on every platform, and non-ASCII
+    # characters would then be misread or rejected.
+    with open(fname, "r", encoding="utf-8") as inp:
+        return json.loads(strip_cstyle_comments(inp))
+
+# Tell whether the string 'c' is exactly one character whose code point is
+# below 0x10000, i.e. fits in 16 bits.
+def is_char16(c):
+    if len(c) != 1:
+        return False
+    return ord(c) <= 0xFFFF
+
+# Yield the '(key, result)' pairs of the dict 'tree' that are plain
+# character-to-character mappings.
+def get_mappings(tree):
+    for key, result in tree.items():
+        # Anything that is not a string, such as a deeper compose sequence,
+        # is dropped.
+        if not isinstance(result, str):
+            continue
+        # Drop mappings to non-char keys and characters that do not fit in a
+        # Java 16-bit char.
+        if is_char16(result) and is_char16(key):
+            yield key, result
+
+# Yield the character mappings found in every 'srcs/compose/*.json' file,
+# except the generated OUTPUT_FILE which lives in the same directory.
+def mappings_from_compose_files():
+    output = os.path.normpath(OUTPUT_FILE)
+    # glob returns the files in arbitrary order and, when mappings conflict,
+    # the first one seen is kept: sort the files so that the generated output
+    # is reproducible.
+    for f in sorted(glob.glob("srcs/compose/*.json")):
+        # Compare normalized paths, as glob may not use the same path
+        # separator as the one written in OUTPUT_FILE.
+        if os.path.normpath(f) == output:
+            continue
+        yield from get_mappings(parse(f))
+
+# The definition of shift doesn't contain any letters as shift is implemented
+# using Java's API so we generate it using Python's API. It's not important if
+# both are not equivalent.
+# Yields a mapping from each ASCII lowercase letter to its uppercase, then for
+# every '(source, target)' of 'mappings': the lowercased source to the target
+# (when lowercasing changes the source), the source to the uppercased target
+# (when uppercasing changes the target) and finally the mapping itself. Case
+# variants that are not a single 16-bit character are left out.
+def add_case_variants(mappings):
+    for code in range(ord("a"), ord("z") + 1):
+        letter = chr(code)
+        yield letter, letter.upper()
+    for src, dst in mappings:
+        src_lower = src.lower()
+        if src_lower != src and is_char16(src_lower):
+            yield src_lower, dst
+        dst_upper = dst.upper()
+        if dst_upper != dst and is_char16(dst_upper):
+            yield src, dst_upper
+        yield src, dst
+
+# Remove unnecessary characters to reduce the lookup time: only the mappings
+# whose source character has one of these Unicode general categories
+# (lowercase, uppercase, titlecase and other letters) are kept. The target
+# character is not checked.
+ALLOWED_CAT = [ "Ll", "Lu", "Lt", "Lo" ]
+def remove_non_letters(mappings):
+    for src, dst in mappings:
+        if unicodedata.category(src) not in ALLOWED_CAT:
+            continue
+        yield src, dst
+
+# Build the substitution table from '(source, target)' mappings: a dict that
+# maps every target character to the character reached by following the
+# mappings backwards (target to source) for as long as possible.
+def resolve_mappings(mappings):
+    def target_of(mapping):
+        return mapping[1]
+    sources = {}
+    # Mappings are processed ordered by target. The sort is stable and does
+    # not look at the source: when several sources map to the same target, the
+    # first one in input order is kept and the others only trigger a warning.
+    for src, dst in sorted(mappings, key=target_of):
+        if dst not in sources:
+            sources[dst] = src
+        elif sources[dst] != src:
+            warn("Conflicting mapping '%s -> %s' and '%s -> %s'" %
+                  (src, dst, sources[dst], dst))
+    def resolve(char):
+        # Follow the chain of sources, stopping at a character that has no
+        # source or that was already visited (the mappings contain a cycle).
+        visited = set()
+        while char in sources and char not in visited:
+            visited.add(char)
+            char = sources[char]
+        return char
+    return { dst: resolve(src) for dst, src in sources.items() }
+
+
+# Generate the substitution file: collect the mappings of the compose files,
+# keep the ones starting from a letter, add the case variants and resolve them
+# into the final table.
+# ensure_ascii=False makes json.dump write non-ASCII characters unescaped, so
+# the encoding of the file is set explicitly instead of being left to the
+# locale, which might not be able to encode them.
+with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
+    substitutions = resolve_mappings(
+        add_case_variants(
+            remove_non_letters(
+                mappings_from_compose_files())))
+    json.dump(substitutions, out, ensure_ascii=False, indent=2)
+
+print("Generated " + OUTPUT_FILE)
-- 
cgit v1.2.3