From 4a429357ef9faa409617f867e224bc8c6814d919 Mon Sep 17 00:00:00 2001 From: Jules Aguillon Date: Thu, 19 Dec 2024 00:34:24 +0100 Subject: compose: Fix parsing of long sequences from json files Sequences longer than two characters were not read correctly from json files, creating conflicts and causing dropped sequences. The detection of collision in sequences is also improved. Two colliding sequences are removed. --- srcs/compose/compile.py | 49 ++++++++++++++++++---------- srcs/compose/compose/cyrillic.json | 3 -- srcs/compose/compose/en_US_UTF_8_Compose.pre | 2 +- 3 files changed, 32 insertions(+), 22 deletions(-) (limited to 'srcs/compose') diff --git a/srcs/compose/compile.py b/srcs/compose/compile.py index 125e18c..e8feba3 100644 --- a/srcs/compose/compile.py +++ b/srcs/compose/compile.py @@ -99,10 +99,16 @@ def strip_cstyle_comments(inp): # Parse from a json file containing a dictionary sequence → result string. def parse_sequences_file_json(fname): + def tree_to_seqs(tree, prefix): + for c, r in tree.items(): + if isinstance(r, str): + yield prefix + [c], r + else: + yield from tree_to_seqs(r, prefix + [c]) try: with open(fname, "r") as inp: - seqs = json.loads(strip_cstyle_comments(inp)) - return list(seqs.items()) + tree = json.loads(strip_cstyle_comments(inp)) + return list(tree_to_seqs(tree, [])) except Exception as e: print("Failed parsing '%s': %s" % (fname, str(e)), file=sys.stderr) @@ -133,26 +139,33 @@ def parse_sequences_dir(dname): # Turn a list of sequences into a trie. def add_sequences_to_trie(seqs, trie): - def add_seq_to_trie(t_, seq, result): + global dropped_sequences + def add_seq_to_trie(seq, result): t_ = trie + for c in seq[:-1]: + t_ = t_.setdefault(c, {}) + if isinstance(t_, str): + return False + c = seq[-1] + if c in t_: + return False + t_[c] = result + return True + def existing_sequence_to_str(seq): # Used in error message i = 0 - while i < len(seq) - 1: - c = seq[i] - if c not in t_: - t_[c] = {} - if isinstance(t_[c], str): - global dropped_sequences - dropped_sequences += 1 - print("Sequence collide: '%s = %s' '%s = %s'" % ( - seq[:i+1], t_[c], seq, result), - file=sys.stderr) - return - t_ = t_[c] + t_ = trie + while i < len(seq): + if seq[i] not in t_: break # No collision ? + t_ = t_[seq[i]] i += 1 - c = seq[i] - t_[c] = result + if isinstance(t_, str): break + return "".join(seq[:i]) + " = " + str(t_) for seq, result in seqs: - add_seq_to_trie(trie, seq, result) + if not add_seq_to_trie(seq, result): + dropped_sequences += 1 + print("Sequence collide: '%s' and '%s = %s'" % ( + existing_sequence_to_str(seq), + "".join(seq), result), file=sys.stderr) # Compile the trie into a state machine. def make_automata(tries): diff --git a/srcs/compose/compose/cyrillic.json b/srcs/compose/compose/cyrillic.json index 61a8807..6a349aa 100644 --- a/srcs/compose/compose/cyrillic.json +++ b/srcs/compose/compose/cyrillic.json @@ -1,7 +1,4 @@ { - "\"": { - "і": "ї" - }, ",": { "г": "ӻ", "к": "ӄ", diff --git a/srcs/compose/compose/en_US_UTF_8_Compose.pre b/srcs/compose/compose/en_US_UTF_8_Compose.pre index 680f4fa..484d6d2 100644 --- a/srcs/compose/compose/en_US_UTF_8_Compose.pre +++ b/srcs/compose/compose/en_US_UTF_8_Compose.pre @@ -4016,7 +4016,7 @@ XCOMM Mathematical Operators : "∦" U2226 # NOT PARALLEL TO : "≁" U2241 # NOT TILDE : "≄" U2244 # NOT ASYMPTOTICALLY EQUAL TO - : "≇" U2247 # NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO +XCOMM : "≇" U2247 # NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO : "≉" U2249 # NOT ALMOST EQUAL TO : "≠" U2260 # NOT EQUAL TO : "≠" U2260 # NOT EQUAL TO -- cgit v1.2.3