abouttreesummaryrefslogcommitdiff
path: root/scripts/subst_of_compose.py
diff options
context:
space:
mode:
authorJules Aguillon2026-04-10 19:08:23 +0200
committerGitHub2026-04-10 19:08:23 +0200
commit833b4a21549f7ea8067291f344f6acdeaff3f079 (patch)
tree81448d90ea6260e49a0b7e02595f8c4f5508eda0 /scripts/subst_of_compose.py
parent8921de7f02570ba03816ebb4025f2c95bc131707 (diff)
downloadunexpected-keyboard-833b4a21549f7ea8067291f344f6acdeaff3f079.tar.gz
unexpected-keyboard-833b4a21549f7ea8067291f344f6acdeaff3f079.zip
Better suggestion with diacritics (#1223)
* Update cdict * scripts/subst_of_compose.py: Compute substitutions from compose mappings. They are used when building dictionaries. * Add substitutions compose data * Better suggestion with diacritics This improves the suggestions for words that contain diacritics and uppercase letters. This works by stripping diacritics both when building the dictionaries (using word aliases added in cdict: https://github.com/Julow/cdict/pull/3) and during lookup. Cdict then takes care of resolving the correct word. The substitutions are generated using mappings from `fn`, `shift` and all the `accent_*` modifiers into srcs/compose/substitutions.json This can be updated easily when more mappings are added.
Diffstat (limited to 'scripts/subst_of_compose.py')
-rw-r--r--scripts/subst_of_compose.py90
1 files changed, 90 insertions, 0 deletions
diff --git a/scripts/subst_of_compose.py b/scripts/subst_of_compose.py
new file mode 100644
index 0000000..cb5fc18
--- /dev/null
+++ b/scripts/subst_of_compose.py
@@ -0,0 +1,90 @@
+# Create a substitution file for cdict from the various compose mappings.
+# This is used when building the dictionaries and when making word suggestions
+# disregarding case and diacritics.
+
+import sys, os, json, glob, unicodedata
+
+OUTPUT_FILE = "srcs/compose/substitutions.json"
+
def warn(msg):
    """Write *msg* to standard error, prefixed with ``Warning: ``."""
    text = "Warning: " + msg
    print(text, file=sys.stderr)
+
+# From srcs/compose/compile.py
def strip_cstyle_comments(inp):
    """Remove ``//`` line comments from an iterable of text lines.

    *inp* is iterated line by line (a text file object works). Every line is
    cut at the first ``//`` that is not inside a double-quoted JSON string,
    and a newline is appended to the cut line. Lines without a comment are
    kept unchanged. The lines are returned joined into a single string.
    """
    def comment_start(line):
        # Index of the first "//" outside a string literal, or -1.
        # JSON strings cannot span lines, so per-line state is sufficient.
        in_string = False
        escaped = False
        for i, ch in enumerate(line):
            if in_string:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == "/" and line.startswith("//", i):
                return i
        return -1

    def strip_line(line):
        i = comment_start(line)
        return line[:i] + "\n" if i >= 0 else line

    return "".join(map(strip_line, inp))
+
def parse(fname):
    """Load the JSON file *fname* after stripping its ``//`` comments.

    The file is read as UTF-8 explicitly: compose files hold non-ASCII
    characters, and the locale's default encoding is not UTF-8 everywhere.
    Returns the decoded JSON value.
    """
    with open(fname, "r", encoding="utf-8") as inp:
        return json.loads(strip_cstyle_comments(inp))
+
def is_char16(c):
    """Return True when *c* is a single character with a code point below 65536."""
    if len(c) != 1:
        return False
    return ord(c) <= 0xFFFF
+
def get_mappings(tree):
    """Yield the ``(key, result)`` pairs of *tree* that are plain char mappings.

    Entries whose result is not a string (deep compose sequences), and
    entries where either side is not a single character fitting in a Java
    16-bit char, are dropped.
    """
    for key, result in tree.items():
        if not isinstance(result, str):
            continue
        if not is_char16(result):
            continue
        if is_char16(key):
            yield key, result
+
def mappings_from_compose_files():
    """Yield the char mappings of every JSON file in ``srcs/compose``.

    The generated output file lives in the same directory and is skipped.
    Paths are normalized before comparing because glob may return
    OS-specific separators. Files are visited in sorted order so the result
    does not depend on the order in which the filesystem lists them.
    """
    output_path = os.path.normpath(OUTPUT_FILE)
    for f in sorted(glob.glob("srcs/compose/*.json")):
        if os.path.normpath(f) == output_path:
            continue
        yield from get_mappings(parse(f))
+
+# The definition of shift doesn't contain any letters as shift is implemented
+# using Java's API so we generate it using Python's API. It's not important if
+# both are not equivalent.
def add_case_variants(mappings):
    """Yield *mappings* extended with case variants.

    First yields ``(letter, LETTER)`` for the 26 ASCII lowercase letters.
    Then, for each input pair ``(c, r)``, yields in this order:
    ``(c.lower(), r)`` when lowering changes ``c`` and the result is a
    16-bit char, ``(c, r.upper())`` under the same conditions for ``r``,
    and finally the pair ``(c, r)`` itself.
    """
    for code in range(ord("a"), ord("z") + 1):
        letter = chr(code)
        yield letter, letter.upper()
    for src, dst in mappings:
        lowered = src.lower()
        if lowered != src and is_char16(lowered):
            yield lowered, dst
        uppered = dst.upper()
        if uppered != dst and is_char16(uppered):
            yield src, uppered
        yield src, dst
+
+# Remove unnecessary characters to reduce the lookup time
# Unicode general categories kept by remove_non_letters.
ALLOWED_CAT = ["Ll", "Lu", "Lt", "Lo"]


def remove_non_letters(mappings):
    """Yield only the pairs whose first character is in an allowed category.

    The category of the first element of each pair is looked up with
    ``unicodedata.category`` and checked against ``ALLOWED_CAT``. The second
    element is not inspected.
    """
    for src, dst in mappings:
        if unicodedata.category(src) not in ALLOWED_CAT:
            continue
        yield src, dst
+
def resolve_mappings(mappings):
    """Build the substitution table from ``(source, target)`` pairs.

    The returned dict maps each target character to the root of its chain
    of sources: if ``a -> b`` and ``b -> c`` are both present, ``c``
    resolves to ``a``. When several sources map to the same target, the
    lowest source character is kept and a warning is printed for the others.
    """
    m = {}
    # Sort on (target, source) so that, for a given target, the lowest source
    # is seen first and wins, whatever the order of the input.
    for c, r in sorted(mappings, key=lambda it: (it[1], it[0])):
        if r in m:
            if m[r] != c:
                warn("Conflicting mapping '%s -> %s' and '%s -> %s'" %
                        (c, r, m[r], r))
            continue
        m[r] = c

    def resolve(c):
        # Follow the chain of sources until a character with no source is
        # reached; `seen` stops the walk if the mappings form a cycle.
        seen = set()
        while c in m and c not in seen:
            seen.add(c)
            c = m[c]
        return c

    return {r: resolve(c) for r, c in m.items()}
+
+
# Pipeline: read the compose mappings, keep those whose source is a letter,
# add case variants, then resolve every target to its root character.
# The file is written as UTF-8 explicitly because ensure_ascii=False emits
# raw non-ASCII characters, which the locale's default encoding may not
# be able to represent.
with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
    json.dump(
        resolve_mappings(
            add_case_variants(
                remove_non_letters(
                    mappings_from_compose_files()))),
        out, ensure_ascii=False, indent=2)

print("Generated " + OUTPUT_FILE)