diff --git a/112read-byte.subx b/112read-byte.subx
index 32f89647..06da3a64 100644
--- a/112read-byte.subx
+++ b/112read-byte.subx
@@ -269,7 +269,7 @@ test-read-byte-buffered-refills-buffer:
     c3/return
 
 # Return next byte value in eax, with top 3 bytes cleared.
-# Abort on reaching end of file.
+# Abort on reaching end of stream.
 read-byte:  # s: (addr stream byte) -> result/eax: byte
     # . prologue
     55/push-ebp
diff --git a/403unicode.mu b/403unicode.mu
index c1a4d748..23f14d8f 100644
--- a/403unicode.mu
+++ b/403unicode.mu
@@ -253,6 +253,90 @@ fn test-read-grapheme {
   check-ints-equal n, 0x65, "F - test grapheme/6"
 }
 
+# Read one utf-8-encoded grapheme from 'in'.
+# The first byte read stays in the lowest byte of the result; each trailer byte
+# is moved further left than the previous one (via shift-left-bytes) and or'd in.
+# A first byte below 0xc0 or at/above 0xfe is returned unchanged. Callers rely on
+# this to see the 0xffffffff EOF marker (NOTE(review): assumes break-if->= is signed).
+# A stream that ends in the middle of a multi-byte grapheme yields the EOF marker.
+# A first byte in [0xf8, 0xfe) prints an error and exits with status 1.
+fn read-grapheme-buffered in: (addr buffered-file) -> out/eax: grapheme {
+$read-grapheme-buffered:body: {
+  var c/eax: byte <- read-byte-buffered in
+  var num-trailers/ecx: int <- copy 0
+  $read-grapheme-buffered:compute-length: {
+    # single byte: just return it
+    compare c, 0xc0
+    {
+      break-if->=
+      out <- copy c
+      num-trailers <- copy 0
+      break $read-grapheme-buffered:body
+    }
+    compare c, 0xfe  # 0xfe and above: returned unchanged as well
+    {
+      break-if-<
+      out <- copy c
+      break $read-grapheme-buffered:body
+    }
+    # 2 bytes
+    compare c, 0xe0
+    {
+      break-if->=
+      num-trailers <- copy 1
+      break $read-grapheme-buffered:compute-length
+    }
+    # 3 bytes
+    compare c, 0xf0
+    {
+      break-if->=
+      num-trailers <- copy 2
+      break $read-grapheme-buffered:compute-length
+    }
+    # 4 bytes
+    compare c, 0xf8
+    {
+      break-if->=
+      num-trailers <- copy 3
+      break $read-grapheme-buffered:compute-length
+    }
+$read-grapheme-buffered:abort: {
+      # TODO: print to stderr
+      print-string-to-real-screen "utf-8 encodings larger than 4 bytes are not supported. First byte seen: "
+      var n/eax: int <- copy c
+      print-int32-hex-to-real-screen n
+      print-string-to-real-screen "\n"
+      var exit-status/ebx: int <- copy 1
+      syscall_exit
+    }
+  }
+  # prepend trailer bytes
+  var result/edi: int <- copy c
+  var num-byte-shifts/edx: int <- copy 1  # shift amount for the first trailer
+  {
+    compare num-trailers, 0
+    break-if-<=
+    var tmp/eax: byte <- read-byte-buffered in
+    var tmp2/eax: int <- copy tmp
+    # stream ended mid-grapheme: hand the EOF marker to the caller instead of
+    # or-ing it into the result
+    compare tmp2, 0xffffffff  # EOF marker
+    {
+      break-if-!=
+      out <- copy tmp2
+      break $read-grapheme-buffered:body
+    }
+    tmp2 <- shift-left-bytes tmp2, num-byte-shifts
+    result <- or tmp2
+    # update loop state
+    num-byte-shifts <- increment
+    num-trailers <- decrement
+    loop
+  }
+  out <- copy result
+}
+}
+
 # needed because available primitives only shift by a literal/constant number of bits
 fn shift-left-bytes n: int, k: int -> result/eax: int {
   var i/ecx: int <- copy 0
diff --git a/apps/browse/main.mu b/apps/browse/main.mu
index d710c1a6..8467f01f 100644
--- a/apps/browse/main.mu
+++ b/apps/browse/main.mu
@@ -33,13 +33,13 @@ fn render screen: (addr screen), fs: (addr buffered-file), state: (addr screen-p
 fn render-normal screen: (addr screen), fs: (addr buffered-file), state: (addr screen-position-state) {
   var newline-seen?/esi: boolean <- copy 0  # false
   var start-of-paragraph?/edi: boolean <- copy 1  # true
-  var previous-char/ebx: byte <- copy 0
+  var previous-grapheme/ebx: grapheme <- copy 0
 $render-normal:loop: {
     # if done-drawing?(state) break
     var done?/eax: boolean <- done-drawing? state
     compare done?, 0  # false
     break-if-!=
-    var c/eax: byte <- read-byte-buffered fs
+    var c/eax: grapheme <- read-grapheme-buffered fs
 $render-normal:loop-body: {
       # if (c == EOF) break
       compare c, 0xffffffff  # EOF marker
@@ -59,8 +59,8 @@ $render-normal:loop-body: {
         # otherwise render two newlines
         {
           break-if-=
-          add-char state, 0xa  # newline
-          add-char state, 0xa  # newline
+          add-grapheme state, 0xa  # newline
+          add-grapheme state, 0xa  # newline
           newline-seen? <- copy 0  # false
           start-of-paragraph? <- copy 1  # true
           break $render-normal:loop-body
@@ -94,20 +94,20 @@ $render-normal:flush-buffered-newline: {
         {
           compare c, 0x20
           break-if-!=
-          add-char state, 0xa  # newline
+          add-grapheme state, 0xa  # newline
           break $render-normal:flush-buffered-newline
         }
-        add-char state, 0x20  # space
+        add-grapheme state, 0x20  # space
         # fall through to print c
       }
       ## end soft newline support
 
 $render-normal:whitespace-separated-regions: {
-        # if previous-char wasn't whitespace, skip this block
+        # if previous-grapheme wasn't whitespace, skip this block
         {
-          compare previous-char, 0x20  # space
+          compare previous-grapheme, 0x20  # space
           break-if-=
-          compare previous-char, 0xa  # newline
+          compare previous-grapheme, 0xa  # newline
           break-if-=
           break $render-normal:whitespace-separated-regions
         }
@@ -133,9 +133,9 @@ $render-normal:whitespace-separated-regions: {
         }
       }
       #
-      add-char state, c
+      add-grapheme state, c
     }  # $render-normal:loop-body
-    previous-char <- copy c
+    previous-grapheme <- copy c
     loop
   }  # $render-normal:loop
 }
@@ -144,7 +144,7 @@ fn render-header-line screen: (addr screen), fs: (addr buffered-file), state: (a
 $render-header-line:body: {
   # compute color based on number of '#'s
   var header-level/esi: int <- copy 1  # caller already grabbed one
-  var c/eax: byte <- copy 0
+  var c/eax: grapheme <- copy 0
   {
     # if done-drawing?(state) return
     {
@@ -153,7 +153,7 @@ $render-header-line:body: {
       break-if-!= $render-header-line:body
     }
     #
-    c <- read-byte-buffered fs
+    c <- read-grapheme-buffered fs
     # if (c != '#') break
     compare c, 0x23  # '#'
     break-if-!=
@@ -171,7 +171,7 @@ $render-header-line:body: {
       break-if-!=
     }
     #
-    c <- read-byte-buffered fs
+    c <- read-grapheme-buffered fs
     # if (c == EOF) break
     compare c, 0xffffffff  # EOF marker
     break-if-=
@@ -179,7 +179,7 @@ $render-header-line:body: {
     compare c, 0xa  # newline
     break-if-=
     #
-    add-char state, c
+    add-grapheme state, c
     #
     loop
   }
@@ -226,7 +226,7 @@ fn render-until-asterisk fs: (addr buffered-file), state: (addr screen-position-
     compare done?, 0  # false
     break-if-!=
     #
-    var c/eax: byte <- read-byte-buffered fs
+    var c/eax: grapheme <- read-grapheme-buffered fs
     # if (c == EOF) break
     compare c, 0xffffffff  # EOF marker
     break-if-=
@@ -234,7 +234,7 @@ fn render-until-asterisk fs: (addr buffered-file), state: (addr screen-position-
     compare c, 0x2a  # '*'
     break-if-=
     #
-    add-char state, c
+    add-grapheme state, c
     #
     loop
   }
@@ -247,7 +247,7 @@ fn render-until-underscore fs: (addr buffered-file), state: (addr screen-positio
     compare done?, 0  # false
     break-if-!=
     #
-    var c/eax: byte <- read-byte-buffered fs
+    var c/eax: grapheme <- read-grapheme-buffered fs
     # if (c == EOF) break
     compare c, 0xffffffff  # EOF marker
     break-if-=
@@ -255,7 +255,7 @@ fn render-until-underscore fs: (addr buffered-file), state: (addr screen-positio
     compare c, 0x5f  # '_'
     break-if-=
     #
-    add-char state, c
+    add-grapheme state, c
     #
     loop
   }
diff --git a/apps/browse/screen-position-state.mu b/apps/browse/screen-position-state.mu
index 7b53ae07..f342faab 100644
--- a/apps/browse/screen-position-state.mu
+++ b/apps/browse/screen-position-state.mu
@@ -61,19 +61,18 @@ fn start-drawing _self: (addr screen-position-state) {
   reposition-cursor self
 }
 
-fn add-char _self: (addr screen-position-state), c: byte {
-$add-char:body: {
+fn add-grapheme _self: (addr screen-position-state), c: grapheme {
+$add-grapheme:body: {
   var self/esi: (addr screen-position-state) <- copy _self
   {
     compare c, 0xa  # newline
     break-if-!=
     next-line self
     reposition-cursor self
-    break $add-char:body
+    break $add-grapheme:body
   }
   # print c
-  var g/eax: grapheme <- copy c
-  print-grapheme 0, g
+  print-grapheme 0, c
   # self->col++
   var tmp/eax: (addr int) <- get self, col
   increment *tmp