6733 - read utf-8 'grapheme' from byte stream
No support for combining characters: a grapheme is currently just the UTF-8 encoding of a single Unicode code point. No support for code points whose UTF-8 encoding needs more than 4 bytes (32 bits).
This commit is contained in:
parent
392ebcce80
commit
cd94852dbc
|
@ -33,7 +33,7 @@ $Stdin->buffer:
|
|||
# . op subop mod rm32 base index scale r32
|
||||
# . 1-3 bytes 3 bits 2 bits 3 bits 3 bits 3 bits 2 bits 2 bits 0/1/2/4 bytes 0/1/2/4 bytes
|
||||
|
||||
# return next byte value in eax, with top 3 bytes cleared.
|
||||
# Return next byte value in eax, with top 3 bytes cleared.
|
||||
# On reaching end of file, return 0xffffffff (Eof).
|
||||
read-byte-buffered: # f: (addr buffered-file) -> byte-or-Eof/eax: byte
|
||||
# . prologue
|
||||
|
@ -268,6 +268,50 @@ test-read-byte-buffered-refills-buffer:
|
|||
# . end
|
||||
c3/return
|
||||
|
||||
# Read the next unread byte of stream 's' and return it in eax, with the top 3 bytes cleared.
|
||||
# Abort (write an error to stderr, then exit with status 1) if the stream has no unread bytes.
# Stream layout as accessed below: write index at *s, read index at *(s+4), data starting at *(s+12).
# ecx and esi are saved and restored; eax holds the result.
|
||||
read-byte: # s: (addr stream byte) -> result/eax: byte
|
||||
# . prologue
|
||||
55/push-ebp
|
||||
89/copy 3/mod/direct 5/rm32/ebp . . . 4/r32/esp . . # copy esp to ebp
|
||||
# . save registers
|
||||
51/push-ecx
|
||||
56/push-esi
|
||||
# esi = s
|
||||
8b/copy 1/mod/*+disp8 5/rm32/ebp . . . 6/r32/esi 8/disp8 . # copy *(ebp+8) to esi
|
||||
# ecx = s->read
|
||||
8b/copy 1/mod/*+disp8 6/rm32/esi . . . 1/r32/ecx 4/disp8 . # copy *(esi+4) to ecx
|
||||
# if (s->read >= s->write) abort
|
||||
3b/compare 0/mod/indirect 6/rm32/esi . . . 1/r32/ecx . . # compare ecx with *esi
|
||||
0f 8d/jump-if->= $read-byte:abort/disp32
|
||||
# result = s->data[s->read] (eax is cleared first because the byte copy only writes AL)
|
||||
31/xor 3/mod/direct 0/rm32/eax . . . 0/r32/eax . . # clear eax
|
||||
8a/copy-byte 1/mod/*+disp8 4/rm32/sib 6/base/esi 1/index/ecx . 0/r32/AL 0xc/disp8 . # copy byte at *(esi+ecx+12) to AL
|
||||
# ++s->read
|
||||
ff 0/subop/increment 1/mod/*+disp8 6/rm32/esi . . . . 4/disp8 . # increment *(esi+4)
|
||||
$read-byte:end:
|
||||
# . restore registers
|
||||
5e/pop-to-esi
|
||||
59/pop-to-ecx
|
||||
# . epilogue
|
||||
89/copy 3/mod/direct 4/rm32/esp . . . 5/r32/ebp . . # copy ebp to esp
|
||||
5d/pop-to-ebp
|
||||
c3/return
|
||||
|
||||
$read-byte:abort:
|
||||
# . _write(2/stderr, error)
|
||||
# . . push args
|
||||
68/push "read-byte: empty stream\n"/imm32
|
||||
68/push 2/imm32/stderr
|
||||
# . . call
|
||||
e8/call _write/disp32
|
||||
# . . discard args
|
||||
81 0/subop/add 3/mod/direct 4/rm32/esp . . . . . 8/imm32 # add to esp
|
||||
# . syscall(exit, 1)
|
||||
bb/copy-to-ebx 1/imm32
|
||||
e8/call syscall_exit/disp32
|
||||
# never gets here (the exit call above is expected not to return)
|
||||
|
||||
== data
|
||||
|
||||
# a test buffered file for _test-stream
|
||||
|
|
1
400.mu
1
400.mu
|
@ -51,6 +51,7 @@ sig tailor-exit-descriptor ed: (addr exit-descriptor), nbytes: int
|
|||
sig stop ed: (addr exit-descriptor), value: int
|
||||
#sig read f: fd or (addr stream byte), s: (addr stream byte) -> num-bytes-read/eax: int
|
||||
sig read-byte-buffered f: (addr buffered-file) -> byte-or-Eof/eax: byte
|
||||
sig read-byte s: (addr stream byte) -> result/eax: byte
|
||||
#sig write-stream f: fd or (addr stream byte), s: (addr stream byte)
|
||||
#sig error ed: (addr exit-descriptor), out: fd or (addr stream byte), msg: (addr array byte)
|
||||
sig write-byte-buffered f: (addr buffered-file), n: int
|
||||
|
|
143
403unicode.mu
143
403unicode.mu
|
@ -55,6 +55,7 @@ $to-grapheme:body: {
|
|||
break $to-grapheme:compute-length
|
||||
}
|
||||
# more than 4 bytes: unsupported
|
||||
# TODO: print to stderr
|
||||
compare c, 0x1fffff
|
||||
{
|
||||
break-if->
|
||||
|
@ -153,6 +154,148 @@ fn test-to-grapheme-four-bytes-max {
|
|||
check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111
|
||||
}
|
||||
|
||||
# Pull one utf-8 encoded code-point off the front of a byte stream.
#
# The result packs the raw utf-8 bytes into a single word: the first byte read
# lands in the lowest-order byte, and each following byte one position higher.
#
# Special cases, decided by the first byte alone:
#   - below 0xc0, or 0xfe/0xff: returned unchanged; nothing else is read
#   - 0xf8 to 0xfd (sequences longer than 4 bytes): print an error and exit(1)
fn read-grapheme in: (addr stream byte) -> out/eax: grapheme {
$read-grapheme:body: {
  var lead/eax: byte <- read-byte in
  var trailers-left/ecx: int <- copy 0
  $read-grapheme:compute-length: {
    # lead < 0xc0: a grapheme all by itself
    compare lead, 0xc0
    {
      break-if->=
      out <- copy lead
      break $read-grapheme:body
    }
    # lead >= 0xfe: also handed back untouched
    compare lead, 0xfe
    {
      break-if-<
      out <- copy lead
      break $read-grapheme:body
    }
    # lead in [0xc0, 0xe0): one more byte follows
    compare lead, 0xe0
    {
      break-if->=
      trailers-left <- copy 1
      break $read-grapheme:compute-length
    }
    # lead in [0xe0, 0xf0): two more bytes follow
    compare lead, 0xf0
    {
      break-if->=
      trailers-left <- copy 2
      break $read-grapheme:compute-length
    }
    # lead in [0xf0, 0xf8): three more bytes follow
    compare lead, 0xf8
    {
      break-if->=
      trailers-left <- copy 3
      break $read-grapheme:compute-length
    }
    # anything left over (0xf8 to 0xfd) is unsupported
    $read-grapheme:abort: {
      # TODO: print to stderr
      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
      var lead-int/eax: int <- copy lead
      print-int32-hex-to-real-screen lead-int
      print-string-to-real-screen "\n"
      var exit-status/ebx: int <- copy 1
      syscall_exit
    }
  }
  # fold each remaining byte in, one byte position higher than the last
  var packed/edi: int <- copy lead
  var byte-position/edx: int <- copy 1
  {
    compare trailers-left, 0
    break-if-<=
    var next/eax: byte <- read-byte in
    var next-int/eax: int <- copy next
    next-int <- shift-left-bytes next-int, byte-position
    packed <- or next-int
    # advance to the next byte
    byte-position <- increment
    trailers-left <- decrement
    loop
  }
  out <- copy packed
}
}
|
||||
|
||||
# Read seven graphemes back out of a stream holding a mix of 1-, 2- and 3-byte
# utf-8 sequences, checking the packed bytes of each one.
fn test-read-grapheme {
  var s: (stream byte 0x30)
  var input/ecx: (addr stream byte) <- address s
  write input, "aΒc世d界e"
  # 'a': single byte
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x61, "F - test grapheme/0"
  # greek capital letter beta: two bytes
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x92ce, "F - test grapheme/1"
  # 'c': single byte
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x63, "F - test grapheme/2"
  # three bytes
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x96b8e4, "F - test grapheme/3"
  # 'd': single byte
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x64, "F - test grapheme/4"
  # three bytes
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x8c95e7, "F - test grapheme/5"
  # 'e': single byte
  var g/eax: grapheme <- read-grapheme input
  var actual/eax: int <- copy g
  check-ints-equal actual, 0x65, "F - test grapheme/6"
}
|
||||
|
||||
# Shift 'n' left by 'k' whole bytes.
# Exists because the shift primitive used here only takes a literal bit count,
# so a variable shift has to be built out of repeated 8-bit shifts.
# 'k' above 4 behaves like 4 (a 32-bit word only has 4 bytes); 'k' of zero or
# less leaves 'n' unchanged.
fn shift-left-bytes n: int, k: int -> result/eax: int {
  result <- copy n
  var remaining/ecx: int <- copy k
  # never shift more than 4 times
  {
    compare remaining, 4
    break-if-<=
    remaining <- copy 4
  }
  # one byte per iteration
  {
    compare remaining, 0
    break-if-<=
    result <- shift-left 8
    remaining <- decrement
    loop
  }
}
|
||||
|
||||
# shifting by zero bytes leaves the value alone
fn test-shift-left-bytes-0 {
  var shifted/eax: int <- shift-left-bytes 1, 0
  check-ints-equal shifted, 1, "F - shift-left-bytes 0"
}
|
||||
|
||||
# one byte = 8 bits
fn test-shift-left-bytes-1 {
  var shifted/eax: int <- shift-left-bytes 1, 1
  check-ints-equal shifted, 0x100, "F - shift-left-bytes 1"
}
|
||||
|
||||
# two bytes = 16 bits
fn test-shift-left-bytes-2 {
  var shifted/eax: int <- shift-left-bytes 1, 2
  check-ints-equal shifted, 0x10000, "F - shift-left-bytes 2"
}
|
||||
|
||||
# three bytes = 24 bits
fn test-shift-left-bytes-3 {
  var shifted/eax: int <- shift-left-bytes 1, 3
  check-ints-equal shifted, 0x1000000, "F - shift-left-bytes 3"
}
|
||||
|
||||
# four bytes pushes the only set bit out of a 32-bit word
fn test-shift-left-bytes-4 {
  var shifted/eax: int <- shift-left-bytes 1, 4
  check-ints-equal shifted, 0, "F - shift-left-bytes 4"
}
|
||||
|
||||
# requests beyond four bytes give the same result as four
fn test-shift-left-bytes-5 {
  var shifted/eax: int <- shift-left-bytes 1, 5
  check-ints-equal shifted, 0, "F - shift-left-bytes >4"
}
|
||||
|
||||
# To run all tests, uncomment this and run:
|
||||
# $ ./translate_mu && ./a.elf
|
||||
#? fn main -> r/ebx: int {
|
||||
|
|
BIN
apps/assort
BIN
apps/assort
Binary file not shown.
BIN
apps/braces
BIN
apps/braces
Binary file not shown.
BIN
apps/calls
BIN
apps/calls
Binary file not shown.
BIN
apps/crenshaw2-1
BIN
apps/crenshaw2-1
Binary file not shown.
Binary file not shown.
BIN
apps/dquotes
BIN
apps/dquotes
Binary file not shown.
BIN
apps/factorial
BIN
apps/factorial
Binary file not shown.
BIN
apps/sigils
BIN
apps/sigils
Binary file not shown.
BIN
apps/survey
BIN
apps/survey
Binary file not shown.
BIN
apps/tests
BIN
apps/tests
Binary file not shown.
Loading…
Reference in New Issue