mu/403unicode.mu

# Helpers for Unicode.
#
# Mu has no characters, only code points and graphemes.
# Code points are the indivisible atoms of text streams.
#   https://en.wikipedia.org/wiki/Code_point
# Graphemes are the smallest self-contained unit of text.
# Graphemes may consist of multiple code points.
#
# Mu graphemes are always represented in utf-8, and they are required to fit
# in 4 bytes. (This can be confusing if you focus just on ASCII, where Mu's
# graphemes and code-points are identical.)
#
# Mu doesn't currently support combining code points, or graphemes made of
# multiple code points. One day we will.
#   https://en.wikipedia.org/wiki/Combining_character

# Convert a grapheme to a code point.
# For now the value is passed through unchanged, which is only right for
# ASCII, where graphemes and code points are identical.
fn to-code-point in: grapheme -> _/eax: code-point {
  var src/eax: grapheme <- copy in
  var cp/eax: code-point <- copy src  # TODO: support non-ASCII
  return cp
}

# Encode a code point as a grapheme: its utf-8 bytes packed into 32 bits.
# The lead byte ends up in the lowest byte of the result and each trailer
# byte one byte higher (e.g. code point 0x80 becomes 0x80c2).
# Code points up to 0x7f are returned unchanged.
# Code points above 0x1fffff don't fit in 4 bytes of utf-8; the result is 0.
#
# transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
# https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
fn to-grapheme in: code-point -> _/eax: grapheme {
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0
  var first/edx: int <- copy 0  # length-marker bits for the lead byte
  $to-grapheme:compute-length: {
    # single byte: just return it
    # NOTE(review): if 'break-if->' compares as signed, a value with the top
    # bit set also takes this path -- confirm callers never pass one
    compare c, 0x7f
    {
      break-if->
      var g/eax: grapheme <- copy c
      return g
    }
    # 2 bytes
    compare c, 0x7ff
    {
      break-if->
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-grapheme:compute-length
    }
    # 3 bytes
    compare c, 0xffff
    {
      break-if->
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-grapheme:compute-length
    }
    # 4 bytes
    compare c, 0x1fffff
    {
      break-if->
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-grapheme:compute-length
    }
    # more than 4 bytes: unsupported
    # Every supported range above has already returned or left this block, so
    # reaching this point means c > 0x1fffff. Return unconditionally; a
    # further comparison here could only skip the return.
    # TODO: print error message to stderr
    return 0
  }
  # emit trailer bytes, 6 bits from 'in', first two bits '10'
  # The lowest 6 bits are emitted first; every later iteration pushes the
  # bytes emitted so far one byte higher.
  var result/edi: grapheme <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    result <- shift-left 8
    result <- or tmp
    # update loop state
    c <- shift-right 6
    num-trailers <- decrement
    loop
  }
  # emit engine: the lead byte, made of the bits still left in 'c' plus the
  # length marker in 'first'; it goes into the lowest byte of the result
  result <- shift-left 8
  result <- or c
  result <- or first
  #
  return result
}

# single-byte code points have identical graphemes
fn test-to-grapheme-single-byte {
  # walk every value from 0 to 0x7f; each must come back unchanged
  var expected/ecx: int <- copy 0
  {
    compare expected, 0x7f
    break-if->
    var cp/eax: code-point <- copy expected
    var enc/eax: grapheme <- to-grapheme cp
    var actual/eax: int <- copy enc
    check-ints-equal actual, expected, "F - test-to-grapheme-single-byte"
    # next code point
    expected <- increment
    loop
  }
}

                                                              # byte       | byte      | byte      | byte
# smallest 2-byte utf-8
# 0x80 is the first code point past the single-byte range
fn test-to-grapheme-two-bytes-min {
  var cp/eax: code-point <- copy 0x80                         #                                 10     00-0000
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0x80c2, "F - to-grapheme/2a"      #                         110 0-0010  10 00-0000
}

# largest 2-byte utf-8
# 0x7ff is the last code point that fits in a lead byte plus one trailer
fn test-to-grapheme-two-bytes-max {
  var cp/eax: code-point <- copy 0x7ff                        #                             1-1111     11-1111
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0xbfdf, "F - to-grapheme/2b"      #                         110 1-1111  10 11-1111
}

# smallest 3-byte utf-8
# 0x800 is the first code point that needs two trailers
fn test-to-grapheme-three-bytes-min {
  var cp/eax: code-point <- copy 0x800                        #                            10-0000     00-0000
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0x80a0e0, "F - to-grapheme/3a"    #              1110 0000  10 10-0000  10 00-0000
}

# largest 3-byte utf-8
# 0xffff is the last code point that fits in a lead byte plus two trailers
fn test-to-grapheme-three-bytes-max {
  var cp/eax: code-point <- copy 0xffff                       #                   1111     11-1111     11-1111
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0xbfbfef, "F - to-grapheme/3b"    #              1110 1111  10 11-1111  10 11-1111
}

# smallest 4-byte utf-8
# 0x10000 is the first code point that needs three trailers
fn test-to-grapheme-four-bytes-min {
  var cp/eax: code-point <- copy 0x10000                      #                 1-0000     00-0000     00-0000
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0x808090f0, "F - to-grapheme/4a"  # 1111-0 000  10 01-0000  10 00-0000  10 00-0000
}

# largest 4-byte utf-8
# 0x1fffff is the last code point to-grapheme treats as a 4-byte sequence
fn test-to-grapheme-four-bytes-max {
  var cp/eax: code-point <- copy 0x1fffff                     #        111     11-1111     11-1111     11-1111
  var enc/eax: grapheme <- to-grapheme cp
  var enc-int/eax: int <- copy enc
  check-ints-equal enc-int, 0xbfbfbff7, "F - to-grapheme/4b"  # 1111-0 111  10 11-1111  10 11-1111  10 11-1111
}

# read the next grapheme from a stream of bytes
# Returns 0xffffffff if the stream is already empty, and 0 if the first byte
# is in the range 0xf8..0xfd (a sequence longer than 4 bytes).
# The first byte read lands in the lowest byte of the result; each trailer
# byte read after it goes one byte higher.
fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
  # nothing left to read: return EOF
  {
    var empty?/eax: boolean <- stream-empty? in
    compare empty?, 0/false
    break-if-=
    return 0xffffffff
  }
  var lead/eax: byte <- read-byte in
  var trailers-left/ecx: int <- copy 0
  # bytes below 0xc0 stand alone: return them unchanged
  compare lead, 0xc0
  {
    break-if->=
    var single/eax: grapheme <- copy lead
    return single
  }
  # so do 0xfe and above
  compare lead, 0xfe
  {
    break-if-<
    var single/eax: grapheme <- copy lead
    return single
  }
  # otherwise the lead byte decides how many trailer bytes follow
  {
    # 0xc0..0xdf: 2 bytes
    trailers-left <- copy 1
    compare lead, 0xe0
    break-if-<
    # 0xe0..0xef: 3 bytes
    trailers-left <- copy 2
    compare lead, 0xf0
    break-if-<
    # 0xf0..0xf7: 4 bytes
    trailers-left <- copy 3
    compare lead, 0xf8
    break-if-<
    # 0xf8..0xfd: unsupported
    # TODO: print error message
    return 0
  }
  # stack the trailer bytes above the lead byte, one byte higher each time
  # NOTE(review): unlike the first read, these reads are not guarded by
  # stream-empty? -- confirm what read-byte does on a truncated sequence
  var result/edi: grapheme <- copy lead
  var position/edx: int <- copy 1
  {
    compare trailers-left, 0
    break-if-<=
    var trailer/eax: byte <- read-byte in
    var shifted/eax: int <- copy trailer
    shifted <- shift-left-bytes shifted, position
    result <- or shifted
    # advance to the next byte position
    position <- increment
    trailers-left <- decrement
    loop
  }
  return result
}

# needed because available primitives only shift by a literal/constant number of bits
# Returns n shifted left by k bytes. At most 4 shifts are performed, and
# none at all if k is zero or negative.
fn shift-left-bytes n: int, k: int -> _/eax: int {
  # only 4 bytes in 32 bits, so clamp the number of shifts to 4
  var remaining/ecx: int <- copy k
  {
    compare remaining, 4
    break-if-<=
    remaining <- copy 4
  }
  # shift one byte at a time, counting down
  var acc/eax: int <- copy n
  {
    compare remaining, 0
    break-if-<=
    acc <- shift-left 8
    remaining <- decrement
    loop
  }
  return acc
}

# write a grapheme to a stream of bytes
# this is like write-to-stream, except we skip leading 0 bytes
# Bytes go out lowest first. The lowest byte is always written; after that we
# stop as soon as everything still left above it is 0.
fn write-grapheme out: (addr stream byte), g: grapheme {
  var rest/eax: int <- copy g
  {
    append-byte out, rest
    # drop the byte just written and see if anything is left
    # (assumes shift-right fills with zeros, so at most 4 bytes are written
    # -- TODO confirm)
    rest <- shift-right 8
    compare rest, 0
    loop-if-!=
  }
}
6732 2020-08-29 05:38:08 +00:00			`# Helpers for Unicode.`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`#`
			`# Mu has no characters, only code points and graphemes.`
			`# Code points are the indivisible atoms of text streams.`
6732 2020-08-29 05:38:08 +00:00			`# https://en.wikipedia.org/wiki/Code_point`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`# Graphemes are the smallest self-contained unit of text.`
			`# Graphemes may consist of multiple code points.`
			`#`
			`# Mu graphemes are always represented in utf-8, and they are required to fit`
fix bad terminology: grapheme -> code point Unix text-mode terminals transparently support utf-8 these days, and so I treat utf-8 sequences (which I call graphemes in Mu) as fundamental. I then blindly carried over this state of affairs to bare-metal Mu, where it makes no sense. If you don't have a terminal handling font-rendering for you, fonts are most often indexed by code points and not utf-8 sequences. 2021-08-30 05:16:34 +00:00			`# in 4 bytes. (This can be confusing if you focus just on ASCII, where Mu's`
			`# graphemes and code-points are identical.)`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`#`
			`# Mu doesn't currently support combining code points, or graphemes made of`
6710 - utf-8 encoding Example program: fn main -> r/ebx: int { var x/eax: code-point <- copy 0x2192 # unicode character 'rightwards arrow' print-code-point 0, x print-string 0, "\n" r <- copy 0 } Run: $ ./translate_mu x.mu && ./a.elf → $ 2020-08-03 03:51:52 +00:00			`# multiple code points. One day we will.`
fix bad terminology: grapheme -> code point Unix text-mode terminals transparently support utf-8 these days, and so I treat utf-8 sequences (which I call graphemes in Mu) as fundamental. I then blindly carried over this state of affairs to bare-metal Mu, where it makes no sense. If you don't have a terminal handling font-rendering for you, fonts are most often indexed by code points and not utf-8 sequences. 2021-08-30 05:16:34 +00:00			`# https://en.wikipedia.org/wiki/Combining_character`

			`fn to-code-point in: grapheme -> _/eax: code-point {`
			`var g/eax: grapheme <- copy in`
			`var result/eax: code-point <- copy g # TODO: support non-ASCII`
			`return result`
			`}`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00
			`# transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox`
			`# https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm`
7158 2020-11-02 08:07:07 +00:00			`fn to-grapheme in: code-point -> _/eax: grapheme {`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`var c/eax: int <- copy in`
			`var num-trailers/ecx: int <- copy 0`
			`var first/edx: int <- copy 0`
			`$to-grapheme:compute-length: {`
			`# single byte: just return it`
			`compare c, 0x7f`
			`{`
			`break-if->`
7158 2020-11-02 08:07:07 +00:00			`var g/eax: grapheme <- copy c`
			`return g`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`}`
			`# 2 bytes`
			`compare c, 0x7ff`
			`{`
			`break-if->`
			`num-trailers <- copy 1`
			`first <- copy 0xc0`
			`break $to-grapheme:compute-length`
			`}`
			`# 3 bytes`
			`compare c, 0xffff`
			`{`
			`break-if->`
			`num-trailers <- copy 2`
			`first <- copy 0xe0`
			`break $to-grapheme:compute-length`
			`}`
			`# 4 bytes`
			`compare c, 0x1fffff`
			`{`
			`break-if->`
			`num-trailers <- copy 3`
			`first <- copy 0xf0`
			`break $to-grapheme:compute-length`
			`}`
			`# more than 4 bytes: unsupported`
7842 - new directory organization Baremetal is now the default build target and therefore has its sources at the top-level. Baremetal programs build using the phase-2 Mu toolchain that requires a Linux kernel. This phase-2 codebase which used to be at the top-level is now under the linux/ directory. Finally, the phase-2 toolchain, while self-hosting, has a way to bootstrap from a C implementation, which is now stored in linux/bootstrap. The bootstrap C implementation uses some literate programming tools that are now in linux/bootstrap/tools. So the whole thing has gotten inverted. Each directory should build one artifact and include the main sources (along with standard library). Tools used for building it are relegated to sub-directories, even though those tools are often useful in their own right, and have had lots of interesting programs written using them. A couple of things have gotten dropped in this process: - I had old ways to run on just a Linux kernel, or with a Soso kernel. No more. - I had some old tooling for running a single test at the cursor. I haven't used that lately. Maybe I'll bring it back one day. The reorg isn't done yet. Still to do: - redo documentation everywhere. All the README files, all other markdown, particularly vocabulary.md. - clean up how-to-run comments at the start of programs everywhere - rethink what to do with the html/ directory. Do we even want to keep supporting it? In spite of these shortcomings, all the scripts at the top-level, linux/ and linux/bootstrap are working. The names of the scripts also feel reasonable. This is a good milestone to take stock at. 2021-03-04 06:09:50 +00:00			`# TODO: print error message to stderr`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`compare c, 0x1fffff`
			`{`
			`break-if->`
7842 - new directory organization Baremetal is now the default build target and therefore has its sources at the top-level. Baremetal programs build using the phase-2 Mu toolchain that requires a Linux kernel. This phase-2 codebase which used to be at the top-level is now under the linux/ directory. Finally, the phase-2 toolchain, while self-hosting, has a way to bootstrap from a C implementation, which is now stored in linux/bootstrap. The bootstrap C implementation uses some literate programming tools that are now in linux/bootstrap/tools. So the whole thing has gotten inverted. Each directory should build one artifact and include the main sources (along with standard library). Tools used for building it are relegated to sub-directories, even though those tools are often useful in their own right, and have had lots of interesting programs written using them. A couple of things have gotten dropped in this process: - I had old ways to run on just a Linux kernel, or with a Soso kernel. No more. - I had some old tooling for running a single test at the cursor. I haven't used that lately. Maybe I'll bring it back one day. The reorg isn't done yet. Still to do: - redo documentation everywhere. All the README files, all other markdown, particularly vocabulary.md. - clean up how-to-run comments at the start of programs everywhere - rethink what to do with the html/ directory. Do we even want to keep supporting it? In spite of these shortcomings, all the scripts at the top-level, linux/ and linux/bootstrap are working. The names of the scripts also feel reasonable. This is a good milestone to take stock at. 2021-03-04 06:09:50 +00:00			`return 0`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`}`
			`}`
			`# emit trailer bytes, 6 bits from 'in', first two bits '10'`
7163 - first type checks for 'return' statements 2020-11-03 21:29:09 +00:00			`var result/edi: grapheme <- copy 0`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`{`
			`compare num-trailers, 0`
			`break-if-<=`
			`var tmp/esi: int <- copy c`
			`tmp <- and 0x3f`
			`tmp <- or 0x80`
6710 - utf-8 encoding Example program: fn main -> r/ebx: int { var x/eax: code-point <- copy 0x2192 # unicode character 'rightwards arrow' print-code-point 0, x print-string 0, "\n" r <- copy 0 } Run: $ ./translate_mu x.mu && ./a.elf → $ 2020-08-03 03:51:52 +00:00			`result <- shift-left 8`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`result <- or tmp`
			`# update loop state`
			`c <- shift-right 6`
			`num-trailers <- decrement`
			`loop`
			`}`
			`# emit engine`
6710 - utf-8 encoding Example program: fn main -> r/ebx: int { var x/eax: code-point <- copy 0x2192 # unicode character 'rightwards arrow' print-code-point 0, x print-string 0, "\n" r <- copy 0 } Run: $ ./translate_mu x.mu && ./a.elf → $ 2020-08-03 03:51:52 +00:00			`result <- shift-left 8`
			`result <- or c`
			`result <- or first`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`#`
7158 2020-11-02 08:07:07 +00:00			`return result`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`}`

. 2021-08-30 07:06:51 +00:00			`# single-byte code point have identical graphemes`
			`fn test-to-grapheme-single-byte {`
			`var in-int/ecx: int <- copy 0`
			`{`
			`compare in-int, 0x7f`
			`break-if->`
			`var in/eax: code-point <- copy in-int`
			`var out/eax: grapheme <- to-grapheme in`
			`var out-int/eax: int <- copy out`
			`check-ints-equal out-int, in-int, "F - test-to-grapheme-single-byte"`
			`in-int <- increment`
			`loop`
			`}`
			`}`

			`# byte \| byte \| byte \| byte`
			`# smallest 2-byte utf-8`
			`fn test-to-grapheme-two-bytes-min {`
			`var in/eax: code-point <- copy 0x80 # 10 00-0000`
			`var out/eax: grapheme <- to-grapheme in`
			`var out-int/eax: int <- copy out`
			`check-ints-equal out-int, 0x80c2, "F - to-grapheme/2a" # 110 0-0010 10 00-0000`
			`}`

			`# largest 2-byte utf-8`
			`fn test-to-grapheme-two-bytes-max {`
			`var in/eax: code-point <- copy 0x7ff # 1-1111 11-1111`
			`var out/eax: grapheme <- to-grapheme in`
			`var out-int/eax: int <- copy out`
			`check-ints-equal out-int, 0xbfdf, "F - to-grapheme/2b" # 110 1-1111 10 11-1111`
			`}`

			`# smallest 3-byte utf-8`
			`fn test-to-grapheme-three-bytes-min {`
			`var in/eax: code-point <- copy 0x800 # 10-0000 00-0000`
			`var out/eax: grapheme <- to-grapheme in`
			`var out-int/eax: int <- copy out`
			`check-ints-equal out-int, 0x80a0e0, "F - to-grapheme/3a" # 1110 0000 10 10-0000 10 00-0000`
			`}`

			`# largest 3-byte utf-8`
			`fn test-to-grapheme-three-bytes-max {`
			`var in/eax: code-point <- copy 0xffff # 1111 11-1111 11-1111`
			`var out/eax: grapheme <- to-grapheme in`
			`var out-int/eax: int <- copy out`
			`check-ints-equal out-int, 0xbfbfef, "F - to-grapheme/3b" # 1110 1111 10 11-1111 10 11-1111`
			`}`

			`# smallest 4-byte utf-8`
			`fn test-to-grapheme-four-bytes-min {`
			`var in/eax: code-point <- copy 0x10000 # 1-0000 00-0000 00-0000`
			`var out/eax: grapheme <- to-grapheme in`
			`var out-int/eax: int <- copy out`
			`check-ints-equal out-int, 0x808090f0, "F - to-grapheme/4a" # 1111-0 000 10 01-0000 10 00-0000 10 00-0000`
			`}`

			`# largest 4-byte utf-8`
			`fn test-to-grapheme-four-bytes-max {`
			`var in/eax: code-point <- copy 0x1fffff # 111 11-1111 11-1111 11-1111`
			`var out/eax: grapheme <- to-grapheme in`
			`var out-int/eax: int <- copy out`
			`check-ints-equal out-int, 0xbfbfbff7, "F - to-grapheme/4b" # 1111-0 111 10 11-1111 10 11-1111 10 11-1111`
			`}`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`# read the next grapheme from a stream of bytes`
7158 2020-11-02 08:07:07 +00:00			`fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {`
7123 - tile: truncate string if necessary 2020-10-27 06:43:58 +00:00			`# if at eof, return EOF`
			`{`
			`var eof?/eax: boolean <- stream-empty? in`
7690 Convert comments about magic constants into metadata. 2021-02-07 08:17:17 +00:00			`compare eof?, 0/false`
7123 - tile: truncate string if necessary 2020-10-27 06:43:58 +00:00			`break-if-=`
7158 2020-11-02 08:07:07 +00:00			`return 0xffffffff`
7123 - tile: truncate string if necessary 2020-10-27 06:43:58 +00:00			`}`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`var c/eax: byte <- read-byte in`
			`var num-trailers/ecx: int <- copy 0`
			`$read-grapheme:compute-length: {`
			`# single byte: just return it`
			`compare c, 0xc0`
			`{`
			`break-if->=`
7158 2020-11-02 08:07:07 +00:00			`var g/eax: grapheme <- copy c`
			`return g`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`
			`compare c, 0xfe`
			`{`
			`break-if-<`
7158 2020-11-02 08:07:07 +00:00			`var g/eax: grapheme <- copy c`
			`return g`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`
			`# 2 bytes`
			`compare c, 0xe0`
			`{`
			`break-if->=`
			`num-trailers <- copy 1`
			`break $read-grapheme:compute-length`
			`}`
			`# 3 bytes`
			`compare c, 0xf0`
			`{`
			`break-if->=`
			`num-trailers <- copy 2`
			`break $read-grapheme:compute-length`
			`}`
			`# 4 bytes`
			`compare c, 0xf8`
			`{`
			`break-if->=`
			`num-trailers <- copy 3`
			`break $read-grapheme:compute-length`
			`}`
7842 - new directory organization Baremetal is now the default build target and therefore has its sources at the top-level. Baremetal programs build using the phase-2 Mu toolchain that requires a Linux kernel. This phase-2 codebase which used to be at the top-level is now under the linux/ directory. Finally, the phase-2 toolchain, while self-hosting, has a way to bootstrap from a C implementation, which is now stored in linux/bootstrap. The bootstrap C implementation uses some literate programming tools that are now in linux/bootstrap/tools. So the whole thing has gotten inverted. Each directory should build one artifact and include the main sources (along with standard library). Tools used for building it are relegated to sub-directories, even though those tools are often useful in their own right, and have had lots of interesting programs written using them. A couple of things have gotten dropped in this process: - I had old ways to run on just a Linux kernel, or with a Soso kernel. No more. - I had some old tooling for running a single test at the cursor. I haven't used that lately. Maybe I'll bring it back one day. The reorg isn't done yet. Still to do: - redo documentation everywhere. All the README files, all other markdown, particularly vocabulary.md. - clean up how-to-run comments at the start of programs everywhere - rethink what to do with the html/ directory. Do we even want to keep supporting it? In spite of these shortcomings, all the scripts at the top-level, linux/ and linux/bootstrap are working. The names of the scripts also feel reasonable. This is a good milestone to take stock at. 2021-03-04 06:09:50 +00:00			`# TODO: print error message`
			`return 0`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`
			`# prepend trailer bytes`
7163 - first type checks for 'return' statements 2020-11-03 21:29:09 +00:00			`var result/edi: grapheme <- copy c`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`var num-byte-shifts/edx: int <- copy 1`
			`{`
			`compare num-trailers, 0`
			`break-if-<=`
			`var tmp/eax: byte <- read-byte in`
			`var tmp2/eax: int <- copy tmp`
			`tmp2 <- shift-left-bytes tmp2, num-byte-shifts`
			`result <- or tmp2`
			`# update loop state`
			`num-byte-shifts <- increment`
			`num-trailers <- decrement`
			`loop`
			`}`
7158 2020-11-02 08:07:07 +00:00			`return result`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`

			`# needed because available primitives only shift by a literal/constant number of bits`
7158 2020-11-02 08:07:07 +00:00			`fn shift-left-bytes n: int, k: int -> _/eax: int {`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`var i/ecx: int <- copy 0`
7158 2020-11-02 08:07:07 +00:00			`var result/eax: int <- copy n`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`{`
			`compare i, k`
			`break-if->=`
			`compare i, 4 # only 4 bytes in 32 bits`
			`break-if->=`
			`result <- shift-left 8`
			`i <- increment`
			`loop`
			`}`
7158 2020-11-02 08:07:07 +00:00			`return result`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`

6807 - tile: render intermediate stack state 2020-09-20 04:44:48 +00:00			`# write a grapheme to a stream of bytes`
			`# this is like write-to-stream, except we skip leading 0 bytes`
			`fn write-grapheme out: (addr stream byte), g: grapheme {`
			`$write-grapheme:body: {`
			`var c/eax: int <- copy g`
			`append-byte out, c # first byte is always written`
			`c <- shift-right 8`
			`compare c, 0`
			`break-if-= $write-grapheme:body`
			`append-byte out, c`
			`c <- shift-right 8`
			`compare c, 0`
			`break-if-= $write-grapheme:body`
			`append-byte out, c`
			`c <- shift-right 8`
			`compare c, 0`
			`break-if-= $write-grapheme:body`
			`append-byte out, c`
			`}`
			`}`