6733 - read utf-8 'grapheme' from byte stream

No support for combining characters. Graphemes are currently just utf-8
encodings of a single Unicode code-point. No support for code-points that
require more than 32 bits in utf-8.
This commit is contained in:
Kartik Agaram 2020-08-28 23:24:04 -07:00
parent 392ebcce80
commit cd94852dbc
16 changed files with 189 additions and 1 deletions

View File

@ -33,7 +33,7 @@ $Stdin->buffer:
# . op subop mod rm32 base index scale r32
# . 1-3 bytes 3 bits 2 bits 3 bits 3 bits 3 bits 2 bits 2 bits 0/1/2/4 bytes 0/1/2/4 bytes
# return next byte value in eax, with top 3 bytes cleared.
# Return next byte value in eax, with top 3 bytes cleared.
# On reaching end of file, return 0xffffffff (Eof).
read-byte-buffered: # f: (addr buffered-file) -> byte-or-Eof/eax: byte
# . prologue
@ -268,6 +268,50 @@ test-read-byte-buffered-refills-buffer:
# . end
c3/return
# Return the next unread byte of stream 's' in eax, with top 3 bytes cleared.
# Abort if the stream has no unread bytes left (s->read >= s->write): print an
# error to stderr and exit with status 1.
# Offsets used below: s->write at *s, s->read at *(s+4), s->data starting at s+12.
read-byte: # s: (addr stream byte) -> result/eax: byte
# . prologue
55/push-ebp
89/copy 3/mod/direct 5/rm32/ebp . . . 4/r32/esp . . # copy esp to ebp
# . save registers
51/push-ecx
56/push-esi
# esi = s
8b/copy 1/mod/*+disp8 5/rm32/ebp . . . 6/r32/esi 8/disp8 . # copy *(ebp+8) to esi
# ecx = s->read
8b/copy 1/mod/*+disp8 6/rm32/esi . . . 1/r32/ecx 4/disp8 . # copy *(esi+4) to ecx
# if (s->read >= s->write) abort
3b/compare 0/mod/indirect 6/rm32/esi . . . 1/r32/ecx . . # compare ecx with *esi
0f 8d/jump-if->= $read-byte:abort/disp32
# result = s->data[s->read]
# (eax is zeroed first so that only its lowest byte ends up non-zero)
31/xor 3/mod/direct 0/rm32/eax . . . 0/r32/eax . . # clear eax
8a/copy-byte 1/mod/*+disp8 4/rm32/sib 6/base/esi 1/index/ecx . 0/r32/AL 0xc/disp8 . # copy byte at *(esi+ecx+12) to AL
# ++s->read
ff 0/subop/increment 1/mod/*+disp8 6/rm32/esi . . . . 4/disp8 . # increment *(esi+4)
$read-byte:end:
# . restore registers
5e/pop-to-esi
59/pop-to-ecx
# . epilogue
89/copy 3/mod/direct 4/rm32/esp . . . 5/r32/ebp . . # copy ebp to esp
5d/pop-to-ebp
c3/return
# error path: registers are not restored since the process exits
$read-byte:abort:
# . _write(2/stderr, error)
# . . push args
68/push "read-byte: empty stream\n"/imm32
68/push 2/imm32/stderr
# . . call
e8/call _write/disp32
# . . discard args
81 0/subop/add 3/mod/direct 4/rm32/esp . . . . . 8/imm32 # add to esp
# . syscall(exit, 1)
bb/copy-to-ebx 1/imm32
e8/call syscall_exit/disp32
# never gets here
== data
# a test buffered file for _test-stream

1
400.mu
View File

@ -51,6 +51,7 @@ sig tailor-exit-descriptor ed: (addr exit-descriptor), nbytes: int
sig stop ed: (addr exit-descriptor), value: int
#sig read f: fd or (addr stream byte), s: (addr stream byte) -> num-bytes-read/eax: int
sig read-byte-buffered f: (addr buffered-file) -> byte-or-Eof/eax: byte
sig read-byte s: (addr stream byte) -> result/eax: byte
#sig write-stream f: fd or (addr stream byte), s: (addr stream byte)
#sig error ed: (addr exit-descriptor), out: fd or (addr stream byte), msg: (addr array byte)
sig write-byte-buffered f: (addr buffered-file), n: int

View File

@ -55,6 +55,7 @@ $to-grapheme:body: {
break $to-grapheme:compute-length
}
# more than 4 bytes: unsupported
# TODO: print to stderr
compare c, 0x1fffff
{
break-if->
@ -153,6 +154,148 @@ fn test-to-grapheme-four-bytes-max {
check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111
}
# Read one utf-8 encoded code-point from 'in' and return its bytes packed into
# a grapheme: the lead byte lands in the lowest 8 bits, and each following
# trailer byte is placed one byte higher than the previous one.
# Lead bytes that announce more than 4 bytes print a message and exit(1).
fn read-grapheme in: (addr stream byte) -> out/eax: grapheme {
$read-grapheme:body: {
  var lead/eax: byte <- read-byte in
  var trailers-left/ecx: int <- copy 0
  $read-grapheme:compute-length: {
    # 0xfe and 0xff are handed back unchanged as a single byte
    compare lead, 0xfe
    {
      break-if-<
      out <- copy lead
      break $read-grapheme:body
    }
    # anything below 0xc0 is likewise returned as a single byte
    compare lead, 0xc0
    {
      break-if->=
      out <- copy lead
      break $read-grapheme:body
    }
    # 0xc0..0xdf: one trailer follows
    compare lead, 0xe0
    {
      break-if->=
      trailers-left <- copy 1
      break $read-grapheme:compute-length
    }
    # 0xe0..0xef: two trailers follow
    compare lead, 0xf0
    {
      break-if->=
      trailers-left <- copy 2
      break $read-grapheme:compute-length
    }
    # 0xf0..0xf7: three trailers follow
    compare lead, 0xf8
    {
      break-if->=
      trailers-left <- copy 3
      break $read-grapheme:compute-length
    }
    # 0xf8..0xfd: longer encodings are rejected
    $read-grapheme:abort: {
      # TODO: print to stderr
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var lead-as-int/eax: int <- copy lead
      print-int32-hex-to-real-screen lead-as-int
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # fold each trailer into the accumulator, one byte position higher each time
  var packed/edi: int <- copy lead
  var shift-count/edx: int <- copy 1
  {
    compare trailers-left, 0
    break-if-<=
    var trailer/eax: byte <- read-byte in
    var widened/eax: int <- copy trailer
    widened <- shift-left-bytes widened, shift-count
    packed <- or widened
    # advance to the next trailer
    trailers-left <- decrement
    shift-count <- increment
    loop
  }
  out <- copy packed
}
}
fn test-read-grapheme {
var s: (stream byte 0x30)
var s2/ecx: (addr stream byte) <- address s
write s2, "aΒcde"
var c/eax: grapheme <- read-grapheme s2
var n/eax: int <- copy c
check-ints-equal n, 0x61, "F - test grapheme/0"
var c/eax: grapheme <- read-grapheme s2
var n/eax: int <- copy c
check-ints-equal n, 0x92ce, "F - test grapheme/1" # greek capital letter beta
var c/eax: grapheme <- read-grapheme s2
var n/eax: int <- copy c
check-ints-equal n, 0x63, "F - test grapheme/2"
var c/eax: grapheme <- read-grapheme s2
var n/eax: int <- copy c
check-ints-equal n, 0x96b8e4, "F - test grapheme/3"
var c/eax: grapheme <- read-grapheme s2
var n/eax: int <- copy c
check-ints-equal n, 0x64, "F - test grapheme/4"
var c/eax: grapheme <- read-grapheme s2
var n/eax: int <- copy c
check-ints-equal n, 0x8c95e7, "F - test grapheme/5"
var c/eax: grapheme <- read-grapheme s2
var n/eax: int <- copy c
check-ints-equal n, 0x65, "F - test grapheme/6"
}
# Shift 'n' left by 'k' whole bytes.
# Exists because the available shift primitives only accept a literal number of bits.
# No more than 4 single-byte shifts are ever performed: 32 bits hold only 4 bytes.
fn shift-left-bytes n: int, k: int -> result/eax: int {
  result <- copy n
  # number of single-byte shifts still to perform: k, capped at 4
  var remaining/ecx: int <- copy k
  {
    compare remaining, 4
    break-if-<=
    remaining <- copy 4
  }
  {
    compare remaining, 0
    break-if-<=
    result <- shift-left 8
    remaining <- decrement
    loop
  }
}
# shifting by zero bytes leaves the value untouched
fn test-shift-left-bytes-0 {
  var shifted/eax: int <- shift-left-bytes 1, 0
  check-ints-equal shifted, 1, "F - shift-left-bytes 0"
}
# one byte = 8 bits to the left
fn test-shift-left-bytes-1 {
  var shifted/eax: int <- shift-left-bytes 1, 1
  check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
}
# two bytes = 16 bits to the left
fn test-shift-left-bytes-2 {
  var shifted/eax: int <- shift-left-bytes 1, 2
  check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
}
# three bytes = 24 bits to the left; the bit lands in the top byte
fn test-shift-left-bytes-3 {
  var shifted/eax: int <- shift-left-bytes 1, 3
  check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
}
# four byte-shifts push the single set bit out of the 32-bit result
fn test-shift-left-bytes-4 {
  var shifted/eax: int <- shift-left-bytes 1, 4
  check-ints-equal shifted, 0, "F - shift-left-bytes 4"
}
# shift counts above 4 are expected to yield 0, same as a count of 4
fn test-shift-left-bytes-5 {
  var shifted/eax: int <- shift-left-bytes 1, 5
  check-ints-equal shifted, 0, "F - shift-left-bytes >4"
}
# To run all tests, uncomment this and run:
# $ ./translate_mu && ./a.elf
#? fn main -> r/ebx: int {

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
apps/hex

Binary file not shown.

BIN
apps/mu

Binary file not shown.

BIN
apps/pack

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.