From ea2d44bdefef7ded28cc33dbb6ac85842dc6ca44 Mon Sep 17 00:00:00 2001 From: Kartik Agaram Date: Sun, 2 Aug 2020 20:14:50 -0700 Subject: [PATCH] snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. --- 403code-point.mu | 178 +++++++++++++++++++++++++++++++++++++++++++++++ 405screen.mu | 5 ++ 2 files changed, 183 insertions(+) create mode 100644 403code-point.mu diff --git a/403code-point.mu b/403code-point.mu new file mode 100644 index 00000000..3b2b7205 --- /dev/null +++ b/403code-point.mu @@ -0,0 +1,178 @@ +# Helpers for Unicode "code points". +# https://en.wikipedia.org/wiki/Code_point +# +# Mu has no characters, only code points and graphemes. +# Code points are the indivisible atoms of text streams. +# Graphemes are the smallest self-contained unit of text. +# Graphemes may consist of multiple code points. +# +# Mu graphemes are always represented in utf-8, and they are required to fit +# in 4 bytes. +# +# Mu doesn't currently support combining code points, or graphemes made of +# multiple code points. 
+
+# transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
+# https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
+# Encode 'in' as utf-8, packed into 'out' with the lead byte in the lowest-order
+# byte and each following trailer byte one byte higher.
+fn to-grapheme in: code-point -> out/eax: grapheme {
+$to-grapheme:body: {
+  var c/eax: int <- copy in
+  var num-trailers/ecx: int <- copy 0
+  var first/edx: int <- copy 0
+  $to-grapheme:compute-length: {
+    # single byte: just return it
+    compare c, 0x7f
+    {
+      break-if->
+      out <- copy c
+      break $to-grapheme:body
+    }
+    # 2 bytes
+    compare c, 0x7ff
+    {
+      break-if->
+      num-trailers <- copy 1
+      first <- copy 0xc0
+      break $to-grapheme:compute-length
+    }
+    # 3 bytes
+    compare c, 0xffff
+    {
+      break-if->
+      num-trailers <- copy 2
+      first <- copy 0xe0
+      break $to-grapheme:compute-length
+    }
+    # 4 bytes
+    compare c, 0x1fffff
+    {
+      break-if->
+      num-trailers <- copy 3
+      first <- copy 0xf0
+      break $to-grapheme:compute-length
+    }
+    # more than 4 bytes: unsupported (skip the abort only if c somehow fits)
+    compare c, 0x1fffff
+    {
+      break-if-<=
+      print-string-to-real-screen "unsupported code point "
+      print-int32-hex-to-real-screen c
+      print-string-to-real-screen "\n"
+      var exit-status/ebx: int <- copy 1
+      syscall_exit
+    }
+  }
+  # emit trailer bytes, 6 bits from 'in', first two bits '10'
+  # the last trailer is emitted first, so each new byte pushes earlier ones higher
+  var result/edi: int <- copy 0
+  {
+    compare num-trailers, 0
+    break-if-<=
+    var tmp/esi: int <- copy c
+    tmp <- and 0x3f
+    tmp <- or 0x80
+    result <- shift-left 8
+    result <- or tmp
+    # update loop state
+    c <- shift-right 6
+    num-trailers <- decrement
+    loop
+  }
+  # emit engine (the lead byte): remaining bits of 'in' plus the length marker
+  result <- shift-left 8
+  result <- or c
+  result <- or first
+  out <- copy result
+}
+}
+
+# single-byte code points have identical graphemes
+fn test-to-grapheme-single-byte {
+  var in-int/ecx: int <- copy 0
+  {
+    compare in-int, 0x7f
+    break-if->
+    var in/eax: code-point <- copy in-int
+    var out/eax: grapheme <- to-grapheme in
+    var out-int/eax: int <- copy out
+    check-ints-equal out-int, in-int, "F - test-to-grapheme-single-byte"
+    in-int <- increment
+    loop
+  }
+}
+
+# smallest 2-byte utf-8
+fn test-to-grapheme-two-bytes-min {
+  var in/eax: code-point <- copy 0x80  # 10 000000
+  var out/eax: grapheme <- to-grapheme in
+  var out-int/eax: int <- copy out
+  check-ints-equal out-int, 0x80c2, "F - test-to-grapheme-two-bytes-min"  # trailer 10 000000, lead 110 00010
+}
+
+# largest 2-byte utf-8
+fn test-to-grapheme-two-bytes-max {
+  var in/eax: code-point <- copy 0x7ff  # 11111 111111
+  var out/eax: grapheme <- to-grapheme in
+  var out-int/eax: int <- copy out
+  check-ints-equal out-int, 0xbfdf, "F - test-to-grapheme-two-bytes-max"  # trailer 10 111111, lead 110 11111
+}
+
+# smallest 3-byte utf-8
+fn test-to-grapheme-three-bytes-min {
+  var in/eax: code-point <- copy 0x800  # 100000 000000
+  var out/eax: grapheme <- to-grapheme in
+  var out-int/eax: int <- copy out
+  check-ints-equal out-int, 0x80a0e0, "F - test-to-grapheme-three-bytes-min"  # trailers 10 000000, 10 100000, lead 1110 0000
+}
+
+# shift n left by k bytes; stops after 4 shifts, so any k >= 4 yields 0
+# needed because available primitives only shift by a literal/constant number of bits
+fn shift-left-bytes n: int, k: int -> result/esi: int {
+  var i/eax: int <- copy 0
+  result <- copy n
+  {
+    compare i, k
+    break-if->=
+    compare i, 4  # only 4 bytes in 32 bits
+    break-if->=
+    result <- shift-left 8
+    i <- increment
+    loop
+  }
+}
+
+fn test-shift-left-bytes-0 {
+  var result/esi: int <- shift-left-bytes 1, 0
+  check-ints-equal result, 1, "F - shift-left-bytes 0"
+}
+
+fn test-shift-left-bytes-1 {
+  var result/esi: int <- shift-left-bytes 1, 1
+  check-ints-equal result, 0x100, "F - shift-left-bytes 1"
+}
+
+fn test-shift-left-bytes-2 {
+  var result/esi: int <- shift-left-bytes 1, 2
+  check-ints-equal result, 0x10000, "F - shift-left-bytes 2"
+}
+
+fn test-shift-left-bytes-3 {
+  var result/esi: int <- shift-left-bytes 1, 3
+  check-ints-equal result, 0x1000000, "F - shift-left-bytes 3"
+}
+
+fn test-shift-left-bytes-4 {
+  var result/esi: int <- shift-left-bytes 1, 4
+  check-ints-equal result, 0, "F - shift-left-bytes 4"
+}
+
+fn test-shift-left-bytes-5 {
+  var result/esi: int <- shift-left-bytes 1, 5
+  check-ints-equal result, 0, "F - shift-left-bytes >4"
+}
+
+#? fn main {
+#?   run-tests
+#? }
diff --git a/405screen.mu b/405screen.mu
index 8165e50f..99eae4f4 100644
--- a/405screen.mu
+++ b/405screen.mu
@@ -152,6 +152,11 @@ $print-grapheme:body: {
 }
 }
 
+fn print-code-point screen: (addr screen), c: code-point {
+  var g/eax: grapheme <- to-grapheme c
+  print-grapheme screen, g
+}
+
 fn print-int32-hex screen: (addr screen), n: int {
 $print-int32-hex:body: {
   compare screen, 0