mirror of
https://github.com/kovidgoyal/kitty.git
synced 2026-07-03 11:12:30 +08:00
Use a multistage lookup table for grapheme segmentation
This commit is contained in:
4
.gitattributes
vendored
4
.gitattributes
vendored
@@ -1,5 +1,5 @@
|
||||
kitty/wcwidth-std.h linguist-generated=true
|
||||
kitty/grapheme-segmentation-data.h linguist-generated=true
|
||||
kitty/char-props-data.h linguist-generated=true
|
||||
kitty_tests/GraphemeBreakTest.json linguist-generated=true
|
||||
kitty/emoji.h linguist-generated=true
|
||||
kitty/charsets.c linguist-generated=true
|
||||
@@ -22,7 +22,7 @@ glfw/*.h linguist-vendored=true
|
||||
3rdparty/** linguist-vendored=true
|
||||
kittens/unicode_input/names.h linguist-generated=true
|
||||
tools/wcswidth/std.go linguist-generated=true
|
||||
tools/wcswidth/grapheme-segmentation-data.go linguist-generated=true
|
||||
tools/wcswidth/char-props-data.go linguist-generated=true
|
||||
tools/unicode_names/names.txt linguist-generated=true
|
||||
terminfo/kitty.term* linguist-generated=true
|
||||
terminfo/x/* linguist-generated=true
|
||||
|
||||
238
gen/wcwidth.py
238
gen/wcwidth.py
@@ -7,7 +7,7 @@ import re
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from collections.abc import Generator, Iterable
|
||||
from collections.abc import Generator, Hashable, Iterable
|
||||
from contextlib import contextmanager
|
||||
from functools import lru_cache, partial
|
||||
from html.entities import html5
|
||||
@@ -16,7 +16,11 @@ from operator import itemgetter
|
||||
from typing import (
|
||||
Callable,
|
||||
DefaultDict,
|
||||
Literal,
|
||||
NamedTuple,
|
||||
Optional,
|
||||
Protocol,
|
||||
Sequence,
|
||||
Union,
|
||||
)
|
||||
from urllib.request import urlopen
|
||||
@@ -464,85 +468,6 @@ def gofmt(*files: str) -> None:
|
||||
subprocess.check_call(['gofmt', '-w', '-s'] + list(files))
|
||||
|
||||
|
||||
def gen_grapheme_segmentation() -> None:
|
||||
with create_header('kitty/grapheme-segmentation-data.h') as p, open('tools/wcswidth/grapheme-segmentation-data.go', 'w') as gof:
|
||||
gp = partial(print, file=gof)
|
||||
gp('package wcswidth\n\n')
|
||||
def enum(name: str, *items: str, prefix: str = '') -> None:
|
||||
p(f'typedef enum {name} {{') # }}
|
||||
gp(f'type {name} uint8\n')
|
||||
gp('const (') # )
|
||||
for i, x in enumerate(items):
|
||||
x = prefix + x
|
||||
p(f'\t{x},')
|
||||
if i == 0:
|
||||
gp(f'{x} {name} = iota')
|
||||
else:
|
||||
gp(x)
|
||||
p(f'}} {name};')
|
||||
gp(')')
|
||||
p('')
|
||||
gp('')
|
||||
|
||||
enum('GraphemeBreakProperty', 'AtStart', 'None', *grapheme_segmentation_maps, prefix='GBP_')
|
||||
enum('IndicConjunctBreak', 'None', *incb_map, prefix='ICB_')
|
||||
|
||||
def get_cat(name: str, c_func_name: str, go_func_name: str, prefix: str, m: dict[str, set[int]]) -> None:
|
||||
p(f'static inline {name}')
|
||||
p(f'{c_func_name}(const char_type c) {{') # }}
|
||||
p('\tswitch(c) {') # }
|
||||
gp(f'func {go_func_name}(code rune) {name} {{') # }}
|
||||
gp('\tswitch code {') # }
|
||||
for category, codepoints in m.items():
|
||||
p(f'\t\t // {category} ({len(codepoints)} codepoints ''{{''{')
|
||||
gp(f'\t\t // {category} ({len(codepoints)} codepoints ''{{''{')
|
||||
category = prefix + category
|
||||
for spec in get_ranges(list(codepoints)):
|
||||
write_case(spec, p)
|
||||
p(f'\t\t\treturn {category};')
|
||||
write_case(spec, gp, for_go=True)
|
||||
gp(f'\t\t\treturn {category}')
|
||||
p('\t\t // }}''}')
|
||||
p('')
|
||||
gp('\t\t // }}''}')
|
||||
gp('')
|
||||
p('\t}') # }
|
||||
gp('\t}') # }
|
||||
p(f'\treturn {prefix + "None"};') # }
|
||||
gp(f'\treturn {prefix + "None"}') # }
|
||||
p('}')
|
||||
gp('}')
|
||||
get_cat('GraphemeBreakProperty', 'grapheme_break_property', 'GraphemeBreakPropertyFor', 'GBP_', grapheme_segmentation_maps)
|
||||
p('')
|
||||
gp('')
|
||||
get_cat('IndicConjunctBreak', 'indic_conjunct_break', 'IndicConjunctBreakFor', 'ICB_', incb_map)
|
||||
|
||||
p('''
|
||||
static inline bool
|
||||
is_extended_pictographic(char_type c) {
|
||||
switch (c) {
|
||||
default: return false;
|
||||
''')
|
||||
gp('''
|
||||
func IsExtendedPictographic(c rune) bool {
|
||||
switch c {
|
||||
default: return false;
|
||||
''')
|
||||
|
||||
for spec in get_ranges(list(extended_pictographic)):
|
||||
write_case(spec, p)
|
||||
p('\t\t\treturn true;')
|
||||
write_case(spec, gp, for_go=True)
|
||||
gp('\t\t\treturn true')
|
||||
p('''
|
||||
}
|
||||
}''')
|
||||
gp('''
|
||||
}
|
||||
}''')
|
||||
gofmt(gof.name)
|
||||
|
||||
|
||||
def gen_wcwidth() -> None:
|
||||
seen: set[int] = set()
|
||||
non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']
|
||||
@@ -680,6 +605,157 @@ def gen_test_data() -> None:
|
||||
f.write(json.dumps(tests, indent=2, ensure_ascii=False).encode())
|
||||
|
||||
|
||||
def getsize(data: Iterable[int]) -> Literal[1, 2, 4]:
|
||||
# return smallest possible integer size for the given array
|
||||
maxdata = max(data)
|
||||
if maxdata < 256:
|
||||
return 1
|
||||
if maxdata < 65536:
|
||||
return 2
|
||||
return 4
|
||||
|
||||
|
||||
def splitbins[T: Hashable](t: tuple[T, ...], property_size: int, use_fixed_shift: int = 0) -> tuple[list[int], list[T], int, int, int]:
|
||||
if use_fixed_shift:
|
||||
candidates = range(use_fixed_shift, use_fixed_shift + 1)
|
||||
else:
|
||||
n = len(t)-1 # last valid index
|
||||
maxshift = 0 # the most we can shift n and still have something left
|
||||
if n > 0:
|
||||
while n >> 1:
|
||||
n >>= 1
|
||||
maxshift += 1
|
||||
candidates = range(maxshift + 1)
|
||||
bytesz = sys.maxsize
|
||||
for shift in candidates:
|
||||
t1: list[int] = []
|
||||
t2: list[T] = []
|
||||
size = 2**shift
|
||||
bincache: dict[tuple[T, ...], int] = {}
|
||||
for i in range(0, len(t), size):
|
||||
bin = t[i:i+size]
|
||||
index = bincache.get(bin)
|
||||
if index is None:
|
||||
index = len(t2)
|
||||
bincache[bin] = index
|
||||
t2.extend(bin)
|
||||
t1.append(index >> shift)
|
||||
# determine memory size
|
||||
b = len(t1)*getsize(t1) + len(t2)*property_size
|
||||
if b < bytesz:
|
||||
best = t1, t2, shift
|
||||
bytesz = b
|
||||
t1, t2, shift = best
|
||||
mask = ~((~0) << shift)
|
||||
return t1, t2, shift, mask, bytesz
|
||||
|
||||
|
||||
class Property(Protocol):
|
||||
@property
|
||||
def as_c(self) -> str:
|
||||
return ''
|
||||
|
||||
|
||||
def gen_multistage_table(
|
||||
c: Callable[..., None], g: Callable[..., None], t1: Sequence[int], t2: Sequence[Property], shift: int, mask: int
|
||||
) -> None:
|
||||
sz = getsize(t1)
|
||||
name = t2[0].__class__.__name__
|
||||
match sz:
|
||||
case 1:
|
||||
ctype = 'unsigned char'
|
||||
case 2:
|
||||
ctype = 'unsigned short'
|
||||
case 4:
|
||||
ctype = 'uint32_t'
|
||||
c(f'static const unsigned {name}_mask = {mask}u;')
|
||||
c(f'static const unsigned {name}_shift = {shift}u;')
|
||||
c(f'static const {ctype} {name}_t1[{len(t1)}] = ''{')
|
||||
c(f'\t{", ".join(map(str, t1))}')
|
||||
c('};')
|
||||
items = '\n\t'.join(x.as_c + ',' for x in t2)
|
||||
c(f'static const {name} {name}_t2[{len(t2)}] = ''{')
|
||||
c(f'\t{items}')
|
||||
c('};')
|
||||
|
||||
|
||||
class CharProps(NamedTuple):
|
||||
|
||||
width: int # 3 bits
|
||||
grapheme_break: str # 4 bits
|
||||
indic_conjunct_break: str # 2 bits
|
||||
is_invalid: bool
|
||||
is_extended_pictographic: bool
|
||||
is_non_rendered: bool
|
||||
|
||||
@property
|
||||
def as_c(self) -> str:
|
||||
return ('{'
|
||||
f' .shifted_width={self.width + 4}, .grapheme_break=GBP_{self.grapheme_break},'
|
||||
f' .indic_conjunct_break=ICB_{self.indic_conjunct_break},'
|
||||
f' .is_invalid={int(self.is_invalid)}, .is_extended_pictographic={int(self.is_extended_pictographic)},'
|
||||
f' .is_non_rendered={int(self.is_non_rendered)},'
|
||||
' }')
|
||||
|
||||
|
||||
def generate_enum(p: Callable[..., None], gp: Callable[..., None], name: str, *items: str, prefix: str = '') -> None:
|
||||
p(f'typedef enum {name} {{') # }}
|
||||
gp(f'type {name} uint8\n')
|
||||
gp('const (') # )
|
||||
for i, x in enumerate(items):
|
||||
x = prefix + x
|
||||
p(f'\t{x},')
|
||||
if i == 0:
|
||||
gp(f'{x} {name} = iota')
|
||||
else:
|
||||
gp(x)
|
||||
p(f'}} {name};')
|
||||
gp(')')
|
||||
p('')
|
||||
gp('')
|
||||
|
||||
|
||||
def gen_char_props() -> None:
|
||||
invalid = class_maps['Cc'] | class_maps['Cs']
|
||||
non_printing = invalid | class_maps['Cf']
|
||||
width_map: dict[int, int] = {}
|
||||
def aw(s: Iterable[int], width: int) -> None:
|
||||
nonlocal width_map
|
||||
d = dict.fromkeys(s, width)
|
||||
d.update(width_map)
|
||||
width_map = d
|
||||
|
||||
aw(flag_codepoints, 2)
|
||||
aw(doublewidth, 2)
|
||||
aw(wide_emoji, 2)
|
||||
aw(marks | {0}, 0)
|
||||
aw(non_printing, -1)
|
||||
aw(ambiguous, -2)
|
||||
aw(class_maps['Co'], -3) # Private use
|
||||
aw(not_assigned, -4)
|
||||
|
||||
gs_map: dict[int, str] = {}
|
||||
icb_map: dict[int, str] = {}
|
||||
for name, cps in grapheme_segmentation_maps.items():
|
||||
gs_map.update(dict.fromkeys(cps, name))
|
||||
for name, cps in incb_map.items():
|
||||
icb_map.update(dict.fromkeys(cps, name))
|
||||
prop_array = tuple(
|
||||
CharProps(
|
||||
width=width_map.get(ch, 1), grapheme_break=gs_map.get(ch, 'None'), indic_conjunct_break=icb_map.get(ch, 'None'),
|
||||
is_invalid=ch in invalid, is_non_rendered=ch in non_printing,
|
||||
is_extended_pictographic=ch in extended_pictographic
|
||||
) for ch in range(sys.maxunicode + 1))
|
||||
t1, t2, shift, mask, bytesz = splitbins(prop_array, 2)
|
||||
print(f'Size of character properties table: {bytesz/1024:.1f}KB')
|
||||
with create_header('kitty/char-props-data.h', include_data_types=False) as c, open('tools/wcswidth/char-props-data.go', 'w') as gof:
|
||||
gp = partial(print, file=gof)
|
||||
gp('package wcswidth\n\n')
|
||||
generate_enum(c, gp, 'GraphemeBreakProperty', 'AtStart', 'None', *grapheme_segmentation_maps, prefix='GBP_')
|
||||
generate_enum(c, gp, 'IndicConjunctBreak', 'None', *incb_map, prefix='ICB_')
|
||||
gen_multistage_table(c, gp, t1, t2, shift, mask)
|
||||
|
||||
|
||||
def main(args: list[str]=sys.argv) -> None:
|
||||
parse_ucd()
|
||||
parse_prop_list()
|
||||
@@ -691,8 +767,8 @@ def main(args: list[str]=sys.argv) -> None:
|
||||
gen_emoji()
|
||||
gen_names()
|
||||
gen_rowcolumn_diacritics()
|
||||
gen_grapheme_segmentation()
|
||||
gen_test_data()
|
||||
gen_char_props()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
33574
kitty/char-props-data.h
generated
Normal file
33574
kitty/char-props-data.h
generated
Normal file
File diff suppressed because one or more lines are too long
@@ -1,17 +1,20 @@
|
||||
/*
|
||||
* grapheme-segmentation.c
|
||||
* char-props.c
|
||||
* Copyright (C) 2025 Kovid Goyal <kovid at kovidgoyal.net>
|
||||
*
|
||||
* Distributed under terms of the GPL3 license.
|
||||
*/
|
||||
|
||||
#include "text-cache.h"
|
||||
#include "grapheme-segmentation-data.h"
|
||||
#include "char-props.h"
|
||||
#include "char-props-data.h"
|
||||
|
||||
|
||||
#define is_linker_or_extend(incb) ((incb) == ICB_Linker || (incb) == ICB_Extend)
|
||||
|
||||
#define GSS_IMPLEMENTATION
|
||||
#include "grapheme-segmentation.h"
|
||||
CharProps
|
||||
char_props_for(char_type ch) {
|
||||
return CharProps_t2[(CharProps_t1[ch >> CharProps_shift] << CharProps_shift) + (ch & CharProps_mask)];
|
||||
}
|
||||
|
||||
void
|
||||
grapheme_segmentation_reset(GraphemeSegmentationState *s) {
|
||||
@@ -19,12 +22,12 @@ grapheme_segmentation_reset(GraphemeSegmentationState *s) {
|
||||
}
|
||||
|
||||
bool
|
||||
grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
|
||||
grapheme_segmentation_step(GraphemeSegmentationState *s, CharProps ch) {
|
||||
// Grapheme segmentation as per UAX29-C1-1 as defined in https://www.unicode.org/reports/tr29/
|
||||
// Returns true iff ch should be added to the current cell based on s which
|
||||
// must reflect the state of the current cell.
|
||||
GraphemeBreakProperty prop = grapheme_break_property(ch);
|
||||
IndicConjunctBreak incb = indic_conjunct_break(ch);
|
||||
// must reflect the state of the current cell. s is updated by ch.
|
||||
GraphemeBreakProperty prop = ch.grapheme_break;
|
||||
IndicConjunctBreak incb = ch.indic_conjunct_break;
|
||||
bool add_to_cell = false;
|
||||
if (s->last_char_prop == GBP_AtStart) {
|
||||
add_to_cell = true;
|
||||
@@ -46,7 +49,7 @@ grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
|
||||
* Between consonant {extend|linker}* linker {extend|linker}* and consonant (GB9c). */
|
||||
else if (s->incb_consonant_extended_linker_extended && incb == ICB_Consonant) add_to_cell = true;
|
||||
/* No break within emoji modifier sequences or emoji zwj sequences (GB11). */
|
||||
else if (s->last_char_prop == GBP_ZWJ && s->emoji_modifier_sequence_before_last_char && is_extended_pictographic(ch)) add_to_cell = true;
|
||||
else if (s->last_char_prop == GBP_ZWJ && s->emoji_modifier_sequence_before_last_char && ch.is_extended_pictographic) add_to_cell = true;
|
||||
/* No break between RI if there is an odd number of RI characters before (GB12, GB13). */
|
||||
else if (prop == GBP_Regional_Indicator && (s->ri_count % 2) != 0) add_to_cell = true;
|
||||
/* Break everywhere else */
|
||||
@@ -59,7 +62,7 @@ grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
|
||||
s->incb_consonant_extended = (incb == ICB_Consonant || (
|
||||
s->incb_consonant_extended && is_linker_or_extend(incb)));
|
||||
s->emoji_modifier_sequence_before_last_char = s->emoji_modifier_sequence;
|
||||
s->emoji_modifier_sequence = (s->emoji_modifier_sequence && prop == GBP_Extend) || is_extended_pictographic(ch);
|
||||
s->emoji_modifier_sequence = (s->emoji_modifier_sequence && prop == GBP_Extend) || ch.is_extended_pictographic;
|
||||
s->last_char_prop = prop;
|
||||
|
||||
if (prop == GBP_Regional_Indicator) s->ri_count++; else s->ri_count = 0;
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* grapheme-segmentation.h
|
||||
* char-props.h
|
||||
* Copyright (C) 2025 Kovid Goyal <kovid at kovidgoyal.net>
|
||||
*
|
||||
* Distributed under terms of the GPL3 license.
|
||||
@@ -9,6 +9,20 @@
|
||||
|
||||
#include "data-types.h"
|
||||
|
||||
typedef union CharProps {
|
||||
struct {
|
||||
uint8_t grapheme_break : 4;
|
||||
uint8_t indic_conjunct_break : 2;
|
||||
uint8_t is_extended_pictographic: 1;
|
||||
uint8_t is_invalid: 1;
|
||||
uint8_t shifted_width : 3;
|
||||
uint8_t is_non_rendered : 1;
|
||||
uint8_t : 4;
|
||||
};
|
||||
uint16_t val;
|
||||
} CharProps;
|
||||
static_assert(sizeof(CharProps) == sizeof(uint16_t), "Fix the ordering of CharProps");
|
||||
|
||||
typedef struct GraphemeSegmentationState {
|
||||
int last_char_prop;
|
||||
|
||||
@@ -34,5 +48,6 @@ values: consonant {extend|linker}* linker {extend|linker}* */
|
||||
size_t ri_count;
|
||||
} GraphemeSegmentationState;
|
||||
|
||||
CharProps char_props_for(char_type ch);
|
||||
void grapheme_segmentation_reset(GraphemeSegmentationState *s);
|
||||
bool grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch);
|
||||
bool grapheme_segmentation_step(GraphemeSegmentationState *s, CharProps ch);
|
||||
@@ -12,7 +12,7 @@
|
||||
#undef _DARWIN_C_SOURCE
|
||||
#endif
|
||||
|
||||
#include "grapheme-segmentation.h"
|
||||
#include "char-props.h"
|
||||
#include "line.h"
|
||||
#include "charsets.h"
|
||||
#include "base64.h"
|
||||
@@ -144,7 +144,7 @@ split_into_graphemes(PyObject UNUSED *self, PyObject *src) {
|
||||
Py_ssize_t pos = 0;
|
||||
for (Py_ssize_t i = 0; i < PyUnicode_GET_LENGTH(src); i++) {
|
||||
char_type ch = PyUnicode_READ(kind, data, i);
|
||||
if (!grapheme_segmentation_step(&s, ch)) {
|
||||
if (!grapheme_segmentation_step(&s, char_props_for(ch))) {
|
||||
RAII_PyObject(u, PyUnicode_FromKindAndData(kind, data + kind * pos, i - pos));
|
||||
if (!u || PyList_Append(ans, u) != 0) return NULL;
|
||||
pos = i;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
32
tools/wcswidth/char-props-data.go
generated
Normal file
32
tools/wcswidth/char-props-data.go
generated
Normal file
@@ -0,0 +1,32 @@
|
||||
package wcswidth
|
||||
|
||||
|
||||
type GraphemeBreakProperty uint8
|
||||
|
||||
const (
|
||||
GBP_AtStart GraphemeBreakProperty = iota
|
||||
GBP_None
|
||||
GBP_Prepend
|
||||
GBP_CR
|
||||
GBP_LF
|
||||
GBP_Control
|
||||
GBP_Extend
|
||||
GBP_Regional_Indicator
|
||||
GBP_SpacingMark
|
||||
GBP_L
|
||||
GBP_V
|
||||
GBP_T
|
||||
GBP_LV
|
||||
GBP_LVT
|
||||
GBP_ZWJ
|
||||
)
|
||||
|
||||
type IndicConjunctBreak uint8
|
||||
|
||||
const (
|
||||
ICB_None IndicConjunctBreak = iota
|
||||
ICB_Linker
|
||||
ICB_Consonant
|
||||
ICB_Extend
|
||||
)
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user