Use a multistage lookup table for grapheme segmentation

2026-07-03 11:12:30 +08:00 · 2025-03-22 11:50:04 +05:30
parent 272045287a
commit 583a858769
9 changed files with 33798 additions and 7730 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,5 +1,5 @@
 kitty/wcwidth-std.h linguist-generated=true
-kitty/grapheme-segmentation-data.h linguist-generated=true
+kitty/char-props-data.h linguist-generated=true
 kitty_tests/GraphemeBreakTest.json linguist-generated=true
 kitty/emoji.h linguist-generated=true
 kitty/charsets.c linguist-generated=true
@@ -22,7 +22,7 @@ glfw/*.h linguist-vendored=true
 3rdparty/** linguist-vendored=true
 kittens/unicode_input/names.h linguist-generated=true
 tools/wcswidth/std.go linguist-generated=true
-tools/wcswidth/grapheme-segmentation-data.go linguist-generated=true
+tools/wcswidth/char-props-data.go linguist-generated=true
 tools/unicode_names/names.txt linguist-generated=true
 terminfo/kitty.term* linguist-generated=true
 terminfo/x/* linguist-generated=true
--- a/gen/wcwidth.py
+++ b/gen/wcwidth.py
@@ -7,7 +7,7 @@ import re
 import subprocess
 import sys
 from collections import defaultdict
-from collections.abc import Generator, Iterable
+from collections.abc import Generator, Hashable, Iterable
 from contextlib import contextmanager
 from functools import lru_cache, partial
 from html.entities import html5
@@ -16,7 +16,11 @@ from operator import itemgetter
 from typing import (
    Callable,
    DefaultDict,
+    Literal,
+    NamedTuple,
    Optional,
+    Protocol,
+    Sequence,
    Union,
 )
 from urllib.request import urlopen
@@ -464,85 +468,6 @@ def gofmt(*files: str) -> None:
    subprocess.check_call(['gofmt', '-w', '-s'] + list(files))


-def gen_grapheme_segmentation() -> None:
-    with create_header('kitty/grapheme-segmentation-data.h') as p, open('tools/wcswidth/grapheme-segmentation-data.go', 'w') as gof:
-        gp = partial(print, file=gof)
-        gp('package wcswidth\n\n')
-        def enum(name: str, *items: str, prefix: str = '') -> None:
-            p(f'typedef enum {name} {{')  # }}
-            gp(f'type {name} uint8\n')
-            gp('const (')  # )
-            for i, x in enumerate(items):
-                x = prefix + x
-                p(f'\t{x},')
-                if i == 0:
-                    gp(f'{x} {name} = iota')
-                else:
-                    gp(x)
-            p(f'}} {name};')
-            gp(')')
-            p('')
-            gp('')
-
-        enum('GraphemeBreakProperty', 'AtStart', 'None', *grapheme_segmentation_maps, prefix='GBP_')
-        enum('IndicConjunctBreak', 'None', *incb_map, prefix='ICB_')
-
-        def get_cat(name: str, c_func_name: str, go_func_name: str, prefix: str, m: dict[str, set[int]]) -> None:
-            p(f'static inline {name}')
-            p(f'{c_func_name}(const char_type c) {{')  # }}
-            p('\tswitch(c) {')  # }
-            gp(f'func {go_func_name}(code rune) {name} {{')  # }}
-            gp('\tswitch code {')  # }
-            for category, codepoints in m.items():
-                p(f'\t\t // {category} ({len(codepoints)} codepoints ''{{''{')
-                gp(f'\t\t // {category} ({len(codepoints)} codepoints ''{{''{')
-                category = prefix + category
-                for spec in get_ranges(list(codepoints)):
-                    write_case(spec, p)
-                    p(f'\t\t\treturn {category};')
-                    write_case(spec, gp, for_go=True)
-                    gp(f'\t\t\treturn {category}')
-                p('\t\t // }}''}')
-                p('')
-                gp('\t\t // }}''}')
-                gp('')
-            p('\t}')  # }
-            gp('\t}')  # }
-            p(f'\treturn {prefix + "None"};')  # }
-            gp(f'\treturn {prefix + "None"}')  # }
-            p('}')
-            gp('}')
-        get_cat('GraphemeBreakProperty', 'grapheme_break_property', 'GraphemeBreakPropertyFor', 'GBP_', grapheme_segmentation_maps)
-        p('')
-        gp('')
-        get_cat('IndicConjunctBreak', 'indic_conjunct_break', 'IndicConjunctBreakFor', 'ICB_', incb_map)
-
-        p('''
-static inline bool
-is_extended_pictographic(char_type c) {
-    switch (c) {
-        default: return false;
-''')
-        gp('''
-func IsExtendedPictographic(c rune) bool {
-    switch c {
-        default: return false;
-''')
-
-        for spec in get_ranges(list(extended_pictographic)):
-            write_case(spec, p)
-            p('\t\t\treturn true;')
-            write_case(spec, gp, for_go=True)
-            gp('\t\t\treturn true')
-        p('''
-    }
-}''')
-        gp('''
-    }
-}''')
-    gofmt(gof.name)
-
-
 def gen_wcwidth() -> None:
    seen: set[int] = set()
    non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']
@@ -680,6 +605,157 @@ def gen_test_data() -> None:
        f.write(json.dumps(tests, indent=2, ensure_ascii=False).encode())


+def getsize(data: Iterable[int]) -> Literal[1, 2, 4]:
+    """Return how many bytes (1, 2 or 4) are needed to store the largest value in data."""
+    largest = max(data)
+    if largest >= 65536:
+        return 4
+    return 2 if largest >= 256 else 1
+
+
+def splitbins[T: Hashable](t: tuple[T, ...], property_size: int, use_fixed_shift: int = 0) -> tuple[list[int], list[T], int, int, int]:
+    """
+    Compress t into a two stage lookup table.
+
+    t is cut into bins of 2**shift items. Each distinct bin is stored once in t2 and t1 records,
+    for every bin of t, which stored bin to use, so that for any valid index i:
+
+        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+
+    Every shift from 0 up to maxshift (computed below) is tried, or only use_fixed_shift when
+    that is non-zero, and the candidate with the smallest estimated size is kept. The estimate
+    counts getsize(t1) bytes per t1 entry and property_size bytes per t2 entry.
+
+    Returns (t1, t2, shift, mask, estimated size in bytes).
+    """
+    if use_fixed_shift:
+        # the caller has chosen the shift, so there is exactly one candidate
+        candidates = range(use_fixed_shift, use_fixed_shift + 1)
+    else:
+        n = len(t)-1    # last valid index
+        maxshift = 0    # the most we can shift n and still have something left
+        if n > 0:
+            while n >> 1:
+                n >>= 1
+                maxshift += 1
+        candidates = range(maxshift + 1)
+    bytesz = sys.maxsize
+    for shift in candidates:
+        t1: list[int] = []
+        t2: list[T] = []
+        size = 2**shift
+        # maps the contents of a bin to the offset in t2 where it was first stored
+        bincache: dict[tuple[T, ...], int] = {}
+        for i in range(0, len(t), size):
+            bin = t[i:i+size]
+            index = bincache.get(bin)
+            if index is None:
+                # a bin not seen before, append it to the second stage
+                index = len(t2)
+                bincache[bin] = index
+                t2.extend(bin)
+            # store the bin number rather than the offset, the lookup shifts it back up
+            t1.append(index >> shift)
+        # determine memory size
+        b = len(t1)*getsize(t1) + len(t2)*property_size
+        # strict comparison: on a tie the candidate tried first is kept
+        if b < bytesz:
+            best = t1, t2, shift
+            bytesz = b
+    t1, t2, shift = best
+    # the low shift bits set, selects the position of an item inside its bin
+    mask = ~((~0) << shift)
+    return t1, t2, shift, mask, bytesz
+
+
+class Property(Protocol):
+    """Anything exposing an as_c string, which gen_multistage_table() writes out as one element of the generated C array."""
+    @property
+    def as_c(self) -> str:
+        # the text emitted for this entry in the second stage array, without the trailing comma
+        return ''
+
+
+def gen_multistage_table(
+    c: Callable[..., None], g: Callable[..., None], t1: Sequence[int], t2: Sequence[Property], shift: int, mask: int
+) -> None:
+    """
+    Write the two stage lookup table produced by splitbins() as C source using the writer c.
+
+    The emitted identifiers are prefixed with the class name of the first entry of t2. Four
+    definitions are written: <name>_mask, <name>_shift, the first stage array <name>_t1 and the
+    second stage array <name>_t2, whose elements are the as_c text of each entry of t2.
+    The writer g is accepted but nothing is written to it here.
+    """
+    # narrowest C integer type that can hold every first stage entry
+    ctype = {1: 'unsigned char', 2: 'unsigned short', 4: 'uint32_t'}[getsize(t1)]
+    name = t2[0].__class__.__name__
+    c(f'static const unsigned {name}_mask = {mask}u;')
+    c(f'static const unsigned {name}_shift = {shift}u;')
+    c(f'static const {ctype} {name}_t1[{len(t1)}] = ''{')
+    stage1 = ', '.join(str(entry) for entry in t1)
+    c(f'\t{stage1}')
+    c('};')
+    # one entry per line, each terminated by a comma
+    stage2 = '\n\t'.join(f'{entry.as_c},' for entry in t2)
+    c(f'static const {name} {name}_t2[{len(t2)}] = ''{')
+    c(f'\t{stage2}')
+    c('};')
+
+
+class CharProps(NamedTuple):
+    """The properties of a single code point, gen_char_props() builds one of these per code point."""
+
+    # gen_char_props() uses 2, 1, 0 and the codes -1 (non printing), -2 (ambiguous),
+    # -3 (private use) and -4 (not assigned). Emitted as width + 4, see as_c
+    width: int  # 3 bits
+    # name of the grapheme break property without the GBP_ prefix, 'None' when there is none
+    grapheme_break: str  # 4 bits
+    # name of the indic conjunct break value without the ICB_ prefix, 'None' when there is none
+    indic_conjunct_break: str # 2 bits
+    is_invalid: bool
+    is_extended_pictographic: bool
+    is_non_rendered: bool
+
+    @property
+    def as_c(self) -> str:
+        """The C designated initializer for this entry."""
+        # width is written as width + 4 into the shifted_width member so that the
+        # negative width codes become non-negative
+        return ('{'
+            f' .shifted_width={self.width + 4}, .grapheme_break=GBP_{self.grapheme_break},'
+            f' .indic_conjunct_break=ICB_{self.indic_conjunct_break},'
+            f' .is_invalid={int(self.is_invalid)}, .is_extended_pictographic={int(self.is_extended_pictographic)},'
+            f' .is_non_rendered={int(self.is_non_rendered)},'
+        ' }')
+
+
+def generate_enum(p: Callable[..., None], gp: Callable[..., None], name: str, *items: str, prefix: str = '') -> None:
+    """
+    Write the same enumeration twice: as a C typedef enum via p and as a Go uint8 type
+    with an iota based const block via gp. Every item has prefix prepended to it.
+    """
+    p(f'typedef enum {name} {{')  # }}
+    gp(f'type {name} uint8\n')
+    gp('const (')  # )
+    for index, item in enumerate(items):
+        member = f'{prefix}{item}'
+        p(f'\t{member},')
+        # only the first Go constant carries the type and iota, the rest repeat it implicitly
+        gp(member if index else f'{member} {name} = iota')
+    p(f'}} {name};')
+    gp(')')
+    p('')
+    gp('')
+
+
+def gen_char_props() -> None:
+    """
+    Generate kitty/char-props-data.h and tools/wcswidth/char-props-data.go.
+
+    A CharProps entry is computed for every code point up to sys.maxunicode, the entries are
+    compressed into a two stage table with splitbins() and the table is written to the C
+    header. The enumerations used by the entries are written to both the C header and the Go
+    file. The Go file is run through gofmt once it has been closed.
+    """
+    invalid = class_maps['Cc'] | class_maps['Cs']
+    non_printing = invalid | class_maps['Cf']
+    width_map: dict[int, int] = {}
+
+    def aw(s: Iterable[int], width: int) -> None:
+        # the first width recorded for a code point wins, so the order of the
+        # calls below is the priority order
+        for cp in s:
+            width_map.setdefault(cp, width)
+
+    aw(flag_codepoints, 2)
+    aw(doublewidth, 2)
+    aw(wide_emoji, 2)
+    aw(marks | {0}, 0)
+    # negative widths are codes rather than real widths
+    aw(non_printing, -1)
+    aw(ambiguous, -2)
+    aw(class_maps['Co'], -3)  # Private use
+    aw(not_assigned, -4)
+
+    gs_map: dict[int, str] = {}
+    icb_map: dict[int, str] = {}
+    for name, cps in grapheme_segmentation_maps.items():
+        gs_map.update(dict.fromkeys(cps, name))
+    for name, cps in incb_map.items():
+        icb_map.update(dict.fromkeys(cps, name))
+    # code points missing from the maps get width 1 and the 'None' property names
+    prop_array = tuple(
+        CharProps(
+            width=width_map.get(ch, 1), grapheme_break=gs_map.get(ch, 'None'), indic_conjunct_break=icb_map.get(ch, 'None'),
+            is_invalid=ch in invalid, is_non_rendered=ch in non_printing,
+            is_extended_pictographic=ch in extended_pictographic
+        ) for ch in range(sys.maxunicode + 1))
+    # 2 is the size in bytes of one table entry, it is only used for the size estimate
+    t1, t2, shift, mask, bytesz = splitbins(prop_array, 2)
+    print(f'Size of character properties table: {bytesz/1024:.1f}KB')
+    with create_header('kitty/char-props-data.h', include_data_types=False) as c, open('tools/wcswidth/char-props-data.go', 'w') as gof:
+        gp = partial(print, file=gof)
+        gp('package wcswidth\n\n')
+        generate_enum(c, gp, 'GraphemeBreakProperty', 'AtStart', 'None', *grapheme_segmentation_maps, prefix='GBP_')
+        generate_enum(c, gp, 'IndicConjunctBreak', 'None', *incb_map, prefix='ICB_')
+        gen_multistage_table(c, gp, t1, t2, shift, mask)
+    # the Go source is written without indentation, let gofmt format it now that the file is closed
+    gofmt(gof.name)
+
+
 def main(args: list[str]=sys.argv) -> None:
    parse_ucd()
    parse_prop_list()
@@ -691,8 +767,8 @@ def main(args: list[str]=sys.argv) -> None:
    gen_emoji()
    gen_names()
    gen_rowcolumn_diacritics()
-    gen_grapheme_segmentation()
    gen_test_data()
+    gen_char_props()


 if __name__ == '__main__':
--- a/kitty/char-props-data.h
+++ b/kitty/char-props-data.h
--- a/kitty/grapheme-segmentation.c
+++ b/kitty/grapheme-segmentation.c
@@ -1,17 +1,20 @@
 /*
- * grapheme-segmentation.c
+ * char-props.c
 * Copyright (C) 2025 Kovid Goyal <kovid at kovidgoyal.net>
 *
 * Distributed under terms of the GPL3 license.
 */

-#include "text-cache.h"
-#include "grapheme-segmentation-data.h"
+#include "char-props.h"
+#include "char-props-data.h"
+

 #define is_linker_or_extend(incb) ((incb) == ICB_Linker || (incb) == ICB_Extend)

-#define GSS_IMPLEMENTATION
-#include "grapheme-segmentation.h"
+CharProps
+char_props_for(char_type ch) {
+    // Two stage lookup: CharProps_t1 maps the high bits of ch to a block number, that
+    // number shifted back up plus the low bits of ch is the index into CharProps_t2.
+    // The generated tables only cover U+0000 to U+10FFFF, a larger value would index
+    // past the end of CharProps_t1, so the entry for U+0000 is returned for it instead.
+    if (ch > 0x10FFFFu) ch = 0;
+    return CharProps_t2[(CharProps_t1[ch >> CharProps_shift] << CharProps_shift) + (ch & CharProps_mask)];
+}

 void
 grapheme_segmentation_reset(GraphemeSegmentationState *s) {
@@ -19,12 +22,12 @@ grapheme_segmentation_reset(GraphemeSegmentationState *s) {
 }

 bool
-grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
+grapheme_segmentation_step(GraphemeSegmentationState *s, CharProps ch) {
    // Grapheme segmentation as per UAX29-C1-1 as defined in https://www.unicode.org/reports/tr29/
    // Returns true iff ch should be added to the current cell based on s which
-    // must reflect the state of the current cell.
-    GraphemeBreakProperty prop = grapheme_break_property(ch);
-    IndicConjunctBreak incb = indic_conjunct_break(ch);
+    // must reflect the state of the current cell. s is updated by ch.
+    GraphemeBreakProperty prop = ch.grapheme_break;
+    IndicConjunctBreak incb = ch.indic_conjunct_break;
    bool add_to_cell = false;
    if (s->last_char_prop == GBP_AtStart) {
        add_to_cell = true;
@@ -46,7 +49,7 @@ grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
         * Between consonant {extend|linker}* linker {extend|linker}* and consonant (GB9c).  */
        else if (s->incb_consonant_extended_linker_extended && incb == ICB_Consonant) add_to_cell = true;
        /* No break within emoji modifier sequences or emoji zwj sequences (GB11).  */
-        else if (s->last_char_prop == GBP_ZWJ && s->emoji_modifier_sequence_before_last_char && is_extended_pictographic(ch)) add_to_cell = true;
+        else if (s->last_char_prop == GBP_ZWJ && s->emoji_modifier_sequence_before_last_char && ch.is_extended_pictographic) add_to_cell = true;
        /* No break between RI if there is an odd number of RI characters before (GB12, GB13).  */
        else if (prop == GBP_Regional_Indicator && (s->ri_count % 2) != 0) add_to_cell = true;
        /* Break everywhere else */
@@ -59,7 +62,7 @@ grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
    s->incb_consonant_extended = (incb == ICB_Consonant || (
        s->incb_consonant_extended && is_linker_or_extend(incb)));
    s->emoji_modifier_sequence_before_last_char = s->emoji_modifier_sequence;
-    s->emoji_modifier_sequence = (s->emoji_modifier_sequence && prop == GBP_Extend) || is_extended_pictographic(ch);
+    s->emoji_modifier_sequence = (s->emoji_modifier_sequence && prop == GBP_Extend) || ch.is_extended_pictographic;
    s->last_char_prop = prop;

    if (prop == GBP_Regional_Indicator) s->ri_count++; else s->ri_count = 0;
--- a/kitty/grapheme-segmentation.h
+++ b/kitty/grapheme-segmentation.h
@@ -1,5 +1,5 @@
 /*
- * grapheme-segmentation.h
+ * char-props.h
 * Copyright (C) 2025 Kovid Goyal <kovid at kovidgoyal.net>
 *
 * Distributed under terms of the GPL3 license.
@@ -9,6 +9,20 @@

 #include "data-types.h"

+// The properties of a single code point packed into 16 bits. The entries of the
+// lookup table in char-props-data.h are of this type.
+typedef union CharProps {
+    struct {
+        // a GraphemeBreakProperty value
+        uint8_t grapheme_break : 4;
+        // an IndicConjunctBreak value
+        uint8_t indic_conjunct_break : 2;
+        uint8_t is_extended_pictographic: 1;
+        uint8_t is_invalid: 1;
+        // the width with 4 added to it by the generator so that it is never negative
+        uint8_t shifted_width : 3;
+        uint8_t is_non_rendered : 1;
+        // unused bits
+        uint8_t : 4;
+    };
+    // all of the above as a single integer
+    uint16_t val;
+} CharProps;
+// fails to compile if the bit fields do not pack into exactly 16 bits
+static_assert(sizeof(CharProps) == sizeof(uint16_t), "Fix the ordering of CharProps");
+
 typedef struct GraphemeSegmentationState {
    int last_char_prop;

@@ -34,5 +48,6 @@ values:  consonant {extend|linker}* linker {extend|linker}*  */
    size_t ri_count;
 } GraphemeSegmentationState;

+CharProps char_props_for(char_type ch);
 void grapheme_segmentation_reset(GraphemeSegmentationState *s);
-bool grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch);
+bool grapheme_segmentation_step(GraphemeSegmentationState *s, CharProps ch);
--- a/kitty/data-types.c
+++ b/kitty/data-types.c
@@ -12,7 +12,7 @@
 #undef _DARWIN_C_SOURCE
 #endif

-#include "grapheme-segmentation.h"
+#include "char-props.h"
 #include "line.h"
 #include "charsets.h"
 #include "base64.h"
@@ -144,7 +144,7 @@ split_into_graphemes(PyObject UNUSED *self, PyObject *src) {
    Py_ssize_t pos = 0;
    for (Py_ssize_t i = 0; i < PyUnicode_GET_LENGTH(src); i++) {
        char_type ch = PyUnicode_READ(kind, data, i);
-        if (!grapheme_segmentation_step(&s, ch)) {
+        if (!grapheme_segmentation_step(&s, char_props_for(ch))) {
            RAII_PyObject(u, PyUnicode_FromKindAndData(kind, data + kind * pos, i - pos));
            if (!u || PyList_Append(ans, u) != 0) return NULL;
            pos = i;
--- a/kitty/grapheme-segmentation-data.h
+++ b/kitty/grapheme-segmentation-data.h
--- a/tools/wcswidth/char-props-data.go
+++ b/tools/wcswidth/char-props-data.go
@@ -0,0 +1,32 @@
+package wcswidth
+
+
+type GraphemeBreakProperty uint8
+
+const (
+GBP_AtStart GraphemeBreakProperty = iota
+GBP_None
+GBP_Prepend
+GBP_CR
+GBP_LF
+GBP_Control
+GBP_Extend
+GBP_Regional_Indicator
+GBP_SpacingMark
+GBP_L
+GBP_V
+GBP_T
+GBP_LV
+GBP_LVT
+GBP_ZWJ
+)
+
+type IndicConjunctBreak uint8
+
+const (
+ICB_None IndicConjunctBreak = iota
+ICB_Linker
+ICB_Consonant
+ICB_Extend
+)
+
--- a/tools/wcswidth/grapheme-segmentation-data.go
+++ b/tools/wcswidth/grapheme-segmentation-data.go