mu/403unicode.mu

# Helpers for Unicode.
#
# Mu has no characters, only code points and graphemes.
# Code points are the indivisible atoms of text streams.
#   https://en.wikipedia.org/wiki/Code_point
# Graphemes are the smallest self-contained unit of text.
# Graphemes may consist of multiple code points.
#
# Mu graphemes are always represented in utf-8, and they are required to fit
# in 4 bytes.
#
# Mu doesn't currently support combining code points, or graphemes made of
# multiple code points. One day we will.
# We also don't currently support code points that translate into multiple
# or wide graphemes. (In particular, Tab will never be supported.)

# transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
# https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
#
# The day we want to support combining characters, this function will need to
# take multiple code points. Or something.
fn to-grapheme in: code-point -> _/eax: grapheme {
  # Encode code point 'in' as utf-8, packed into a single 32-bit grapheme.
  # Layout of the result: the utf-8 lead byte sits in the least significant
  # byte, and each subsequent utf-8 byte sits one byte higher.
  #   e.g. 0x3b1 (utf-8: ce b1) => 0x0000b1ce
  # Code points above 0x1fffff need more than 4 utf-8 bytes and yield 0.
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0  # number of continuation bytes to emit
  var first/edx: int <- copy 0  # marker bits to 'or' into the lead byte
  $to-grapheme:compute-length: {
    # single byte: just return it
    compare c, 0x7f
    {
      break-if->
      var g/eax: grapheme <- copy c
      return g
    }
    # 2 bytes
    compare c, 0x7ff
    {
      break-if->
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-grapheme:compute-length
    }
    # 3 bytes
    compare c, 0xffff
    {
      break-if->
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-grapheme:compute-length
    }
    # 4 bytes
    compare c, 0x1fffff
    {
      break-if->
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-grapheme:compute-length
    }
    # more than 4 bytes: unsupported
    # Every smaller code point has already left this block above, so no
    # further comparison is needed here. (An extra 'compare c, 0x1fffff'
    # followed by 'break-if->' used to skip this return every time.)
    # TODO: print error message to stderr
    return 0
  }
  # emit trailer bytes, 6 bits from 'in', first two bits '10'
  # The lowest 6 bits of 'c' are consumed first, so the final utf-8 byte ends
  # up in the most significant occupied byte of 'result'.
  var result/edi: grapheme <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    result <- shift-left 8
    result <- or tmp
    # update loop state
    c <- shift-right 6
    num-trailers <- decrement
    loop
  }
  # emit the lead byte (the 'engine' in front of the trailers): whatever bits
  # of 'c' remain, combined with the length marker in 'first'
  result <- shift-left 8
  result <- or c
  result <- or first
  #
  return result
}

# TODO: bring in tests once we have check-ints-equal

# read the next grapheme from a stream of bytes
fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
  # Pull one grapheme off 'in'.
  # Returns 0xffffffff when the stream is already empty, and 0 when the first
  # byte lies in 0xf8..0xfd.
  {
    var done?/eax: boolean <- stream-empty? in
    compare done?, 0/false
    break-if-=
    return 0xffffffff
  }
  var lead/eax: byte <- read-byte in
  var remaining/ecx: int <- copy 0
  $read-grapheme:classify-lead: {
    # anything below 0xc0 is handed back unchanged as a one-byte grapheme
    {
      compare lead, 0xc0
      break-if->=
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xfe and 0xff are likewise handed back unchanged
    {
      compare lead, 0xfe
      break-if-<
      var single/eax: grapheme <- copy lead
      return single
    }
    # 0xc0..0xdf: one trailer follows
    {
      compare lead, 0xe0
      break-if->=
      remaining <- copy 1
      break $read-grapheme:classify-lead
    }
    # 0xe0..0xef: two trailers follow
    {
      compare lead, 0xf0
      break-if->=
      remaining <- copy 2
      break $read-grapheme:classify-lead
    }
    # 0xf0..0xf7: three trailers follow
    {
      compare lead, 0xf8
      break-if->=
      remaining <- copy 3
      break $read-grapheme:classify-lead
    }
    # 0xf8..0xfd: give up
    # TODO: print error message
    return 0
  }
  # The lead byte occupies the lowest byte of the result; every trailer read
  # from the stream is moved one byte higher than the one before it.
  var acc/edi: grapheme <- copy lead
  var shift/edx: int <- copy 1
  {
    compare remaining, 0
    break-if-<=
    var next/eax: byte <- read-byte in
    var widened/eax: int <- copy next
    widened <- shift-left-bytes widened, shift
    acc <- or widened
    # advance to the next trailer
    shift <- increment
    remaining <- decrement
    loop
  }
  return acc
}

# needed because available primitives only shift by a literal/constant number of bits
fn shift-left-bytes n: int, k: int -> _/eax: int {
  # Return 'n' shifted left by 'k' whole bytes, performing at most 4 shifts.
  # A 'k' of 0 or less returns 'n' unchanged.
  var acc/eax: int <- copy n
  var done/ecx: int <- copy 0  # shifts performed so far
  {
    # a 32-bit value only has 4 bytes; never shift more often than that
    compare done, 4
    break-if->=
    # stop once 'k' shifts have happened
    compare done, k
    break-if->=
    acc <- shift-left 8
    done <- increment
    loop
  }
  return acc
}

# write a grapheme to a stream of bytes
# this is like write-to-stream, except we skip leading 0 bytes
fn write-grapheme out: (addr stream byte), g: grapheme {
  # Append the bytes of 'g' to 'out', starting from the least significant
  # one. Emission stops as soon as the bytes still to be written are all zero;
  # at most 4 bytes are appended.
  # NOTE(review): each append-byte call is passed the whole remaining value;
  # presumably it stores only the low byte -- confirm against append-byte.
  $write-grapheme:emit: {
    var rest/eax: int <- copy g
    # byte 1 goes out unconditionally
    append-byte out, rest
    # byte 2, unless only zeros are left
    rest <- shift-right 8
    compare rest, 0
    break-if-= $write-grapheme:emit
    append-byte out, rest
    # byte 3, unless only zeros are left
    rest <- shift-right 8
    compare rest, 0
    break-if-= $write-grapheme:emit
    append-byte out, rest
    # byte 4, unless only zeros are left
    rest <- shift-right 8
    compare rest, 0
    break-if-= $write-grapheme:emit
    append-byte out, rest
  }
}
6732 2020-08-29 05:38:08 +00:00			`# Helpers for Unicode.`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`#`
			`# Mu has no characters, only code points and graphemes.`
			`# Code points are the indivisible atoms of text streams.`
6732 2020-08-29 05:38:08 +00:00			`# https://en.wikipedia.org/wiki/Code_point`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`# Graphemes are the smallest self-contained unit of text.`
			`# Graphemes may consist of multiple code points.`
			`#`
			`# Mu graphemes are always represented in utf-8, and they are required to fit`
			`# in 4 bytes.`
			`#`
			`# Mu doesn't currently support combining code points, or graphemes made of`
6710 - utf-8 encoding Example program: fn main -> r/ebx: int { var x/eax: code-point <- copy 0x2192 # unicode character 'rightwards arrow' print-code-point 0, x print-string 0, "\n" r <- copy 0 } Run: $ ./translate_mu x.mu && ./a.elf → $ 2020-08-03 03:51:52 +00:00			`# multiple code points. One day we will.`
6737 2020-09-07 04:36:05 +00:00			`# We also don't currently support code points that translate into multiple`
			`# or wide graphemes. (In particular, Tab will never be supported.)`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00
			`# transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox`
			`# https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm`
6714 2020-08-03 04:25:28 +00:00			`#`
			`# The day we want to support combining characters, this function will need to`
			`# take multiple code points. Or something.`
7158 2020-11-02 08:07:07 +00:00			`fn to-grapheme in: code-point -> _/eax: grapheme {`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`var c/eax: int <- copy in`
			`var num-trailers/ecx: int <- copy 0`
			`var first/edx: int <- copy 0`
			`$to-grapheme:compute-length: {`
			`# single byte: just return it`
			`compare c, 0x7f`
			`{`
			`break-if->`
7158 2020-11-02 08:07:07 +00:00			`var g/eax: grapheme <- copy c`
			`return g`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`}`
			`# 2 bytes`
			`compare c, 0x7ff`
			`{`
			`break-if->`
			`num-trailers <- copy 1`
			`first <- copy 0xc0`
			`break $to-grapheme:compute-length`
			`}`
			`# 3 bytes`
			`compare c, 0xffff`
			`{`
			`break-if->`
			`num-trailers <- copy 2`
			`first <- copy 0xe0`
			`break $to-grapheme:compute-length`
			`}`
			`# 4 bytes`
			`compare c, 0x1fffff`
			`{`
			`break-if->`
			`num-trailers <- copy 3`
			`first <- copy 0xf0`
			`break $to-grapheme:compute-length`
			`}`
			`# more than 4 bytes: unsupported`
7842 - new directory organization Baremetal is now the default build target and therefore has its sources at the top-level. Baremetal programs build using the phase-2 Mu toolchain that requires a Linux kernel. This phase-2 codebase which used to be at the top-level is now under the linux/ directory. Finally, the phase-2 toolchain, while self-hosting, has a way to bootstrap from a C implementation, which is now stored in linux/bootstrap. The bootstrap C implementation uses some literate programming tools that are now in linux/bootstrap/tools. So the whole thing has gotten inverted. Each directory should build one artifact and include the main sources (along with standard library). Tools used for building it are relegated to sub-directories, even though those tools are often useful in their own right, and have had lots of interesting programs written using them. A couple of things have gotten dropped in this process: - I had old ways to run on just a Linux kernel, or with a Soso kernel. No more. - I had some old tooling for running a single test at the cursor. I haven't used that lately. Maybe I'll bring it back one day. The reorg isn't done yet. Still to do: - redo documentation everywhere. All the README files, all other markdown, particularly vocabulary.md. - clean up how-to-run comments at the start of programs everywhere - rethink what to do with the html/ directory. Do we even want to keep supporting it? In spite of these shortcomings, all the scripts at the top-level, linux/ and linux/bootstrap are working. The names of the scripts also feel reasonable. This is a good milestone to take stock at. 2021-03-04 06:09:50 +00:00			`# TODO: print error message to stderr`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`compare c, 0x1fffff`
			`{`
			`break-if->`
7842 - new directory organization Baremetal is now the default build target and therefore has its sources at the top-level. Baremetal programs build using the phase-2 Mu toolchain that requires a Linux kernel. This phase-2 codebase which used to be at the top-level is now under the linux/ directory. Finally, the phase-2 toolchain, while self-hosting, has a way to bootstrap from a C implementation, which is now stored in linux/bootstrap. The bootstrap C implementation uses some literate programming tools that are now in linux/bootstrap/tools. So the whole thing has gotten inverted. Each directory should build one artifact and include the main sources (along with standard library). Tools used for building it are relegated to sub-directories, even though those tools are often useful in their own right, and have had lots of interesting programs written using them. A couple of things have gotten dropped in this process: - I had old ways to run on just a Linux kernel, or with a Soso kernel. No more. - I had some old tooling for running a single test at the cursor. I haven't used that lately. Maybe I'll bring it back one day. The reorg isn't done yet. Still to do: - redo documentation everywhere. All the README files, all other markdown, particularly vocabulary.md. - clean up how-to-run comments at the start of programs everywhere - rethink what to do with the html/ directory. Do we even want to keep supporting it? In spite of these shortcomings, all the scripts at the top-level, linux/ and linux/bootstrap are working. The names of the scripts also feel reasonable. This is a good milestone to take stock at. 2021-03-04 06:09:50 +00:00			`return 0`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`}`
			`}`
			`# emit trailer bytes, 6 bits from 'in', first two bits '10'`
7163 - first type checks for 'return' statements 2020-11-03 21:29:09 +00:00			`var result/edi: grapheme <- copy 0`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`{`
			`compare num-trailers, 0`
			`break-if-<=`
			`var tmp/esi: int <- copy c`
			`tmp <- and 0x3f`
			`tmp <- or 0x80`
6710 - utf-8 encoding Example program: fn main -> r/ebx: int { var x/eax: code-point <- copy 0x2192 # unicode character 'rightwards arrow' print-code-point 0, x print-string 0, "\n" r <- copy 0 } Run: $ ./translate_mu x.mu && ./a.elf → $ 2020-08-03 03:51:52 +00:00			`result <- shift-left 8`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`result <- or tmp`
			`# update loop state`
			`c <- shift-right 6`
			`num-trailers <- decrement`
			`loop`
			`}`
			`# emit engine`
6710 - utf-8 encoding Example program: fn main -> r/ebx: int { var x/eax: code-point <- copy 0x2192 # unicode character 'rightwards arrow' print-code-point 0, x print-string 0, "\n" r <- copy 0 } Run: $ ./translate_mu x.mu && ./a.elf → $ 2020-08-03 03:51:52 +00:00			`result <- shift-left 8`
			`result <- or c`
			`result <- or first`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`#`
7158 2020-11-02 08:07:07 +00:00			`return result`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00			`}`

7842 - new directory organization Baremetal is now the default build target and therefore has its sources at the top-level. Baremetal programs build using the phase-2 Mu toolchain that requires a Linux kernel. This phase-2 codebase which used to be at the top-level is now under the linux/ directory. Finally, the phase-2 toolchain, while self-hosting, has a way to bootstrap from a C implementation, which is now stored in linux/bootstrap. The bootstrap C implementation uses some literate programming tools that are now in linux/bootstrap/tools. So the whole thing has gotten inverted. Each directory should build one artifact and include the main sources (along with standard library). Tools used for building it are relegated to sub-directories, even though those tools are often useful in their own right, and have had lots of interesting programs written using them. A couple of things have gotten dropped in this process: - I had old ways to run on just a Linux kernel, or with a Soso kernel. No more. - I had some old tooling for running a single test at the cursor. I haven't used that lately. Maybe I'll bring it back one day. The reorg isn't done yet. Still to do: - redo documentation everywhere. All the README files, all other markdown, particularly vocabulary.md. - clean up how-to-run comments at the start of programs everywhere - rethink what to do with the html/ directory. Do we even want to keep supporting it? In spite of these shortcomings, all the scripts at the top-level, linux/ and linux/bootstrap are working. The names of the scripts also feel reasonable. This is a good milestone to take stock at. 2021-03-04 06:09:50 +00:00			`# TODO: bring in tests once we have check-ints-equal`
snapshot: encoding code-points to utf-8 I have it partly working, but just realized I've been reversing the output bytes. 2020-08-03 03:14:50 +00:00
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`# read the next grapheme from a stream of bytes`
7158 2020-11-02 08:07:07 +00:00			`fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {`
7123 - tile: truncate string if necessary 2020-10-27 06:43:58 +00:00			`# if at eof, return EOF`
			`{`
			`var eof?/eax: boolean <- stream-empty? in`
7690 Convert comments about magic constants into metadata. 2021-02-07 08:17:17 +00:00			`compare eof?, 0/false`
7123 - tile: truncate string if necessary 2020-10-27 06:43:58 +00:00			`break-if-=`
7158 2020-11-02 08:07:07 +00:00			`return 0xffffffff`
7123 - tile: truncate string if necessary 2020-10-27 06:43:58 +00:00			`}`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`var c/eax: byte <- read-byte in`
			`var num-trailers/ecx: int <- copy 0`
			`$read-grapheme:compute-length: {`
			`# single byte: just return it`
			`compare c, 0xc0`
			`{`
			`break-if->=`
7158 2020-11-02 08:07:07 +00:00			`var g/eax: grapheme <- copy c`
			`return g`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`
			`compare c, 0xfe`
			`{`
			`break-if-<`
7158 2020-11-02 08:07:07 +00:00			`var g/eax: grapheme <- copy c`
			`return g`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`
			`# 2 bytes`
			`compare c, 0xe0`
			`{`
			`break-if->=`
			`num-trailers <- copy 1`
			`break $read-grapheme:compute-length`
			`}`
			`# 3 bytes`
			`compare c, 0xf0`
			`{`
			`break-if->=`
			`num-trailers <- copy 2`
			`break $read-grapheme:compute-length`
			`}`
			`# 4 bytes`
			`compare c, 0xf8`
			`{`
			`break-if->=`
			`num-trailers <- copy 3`
			`break $read-grapheme:compute-length`
			`}`
7842 - new directory organization Baremetal is now the default build target and therefore has its sources at the top-level. Baremetal programs build using the phase-2 Mu toolchain that requires a Linux kernel. This phase-2 codebase which used to be at the top-level is now under the linux/ directory. Finally, the phase-2 toolchain, while self-hosting, has a way to bootstrap from a C implementation, which is now stored in linux/bootstrap. The bootstrap C implementation uses some literate programming tools that are now in linux/bootstrap/tools. So the whole thing has gotten inverted. Each directory should build one artifact and include the main sources (along with standard library). Tools used for building it are relegated to sub-directories, even though those tools are often useful in their own right, and have had lots of interesting programs written using them. A couple of things have gotten dropped in this process: - I had old ways to run on just a Linux kernel, or with a Soso kernel. No more. - I had some old tooling for running a single test at the cursor. I haven't used that lately. Maybe I'll bring it back one day. The reorg isn't done yet. Still to do: - redo documentation everywhere. All the README files, all other markdown, particularly vocabulary.md. - clean up how-to-run comments at the start of programs everywhere - rethink what to do with the html/ directory. Do we even want to keep supporting it? In spite of these shortcomings, all the scripts at the top-level, linux/ and linux/bootstrap are working. The names of the scripts also feel reasonable. This is a good milestone to take stock at. 2021-03-04 06:09:50 +00:00			`# TODO: print error message`
			`return 0`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`
			`# prepend trailer bytes`
7163 - first type checks for 'return' statements 2020-11-03 21:29:09 +00:00			`var result/edi: grapheme <- copy c`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`var num-byte-shifts/edx: int <- copy 1`
			`{`
			`compare num-trailers, 0`
			`break-if-<=`
			`var tmp/eax: byte <- read-byte in`
			`var tmp2/eax: int <- copy tmp`
			`tmp2 <- shift-left-bytes tmp2, num-byte-shifts`
			`result <- or tmp2`
			`# update loop state`
			`num-byte-shifts <- increment`
			`num-trailers <- decrement`
			`loop`
			`}`
7158 2020-11-02 08:07:07 +00:00			`return result`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`

			`# needed because available primitives only shift by a literal/constant number of bits`
7158 2020-11-02 08:07:07 +00:00			`fn shift-left-bytes n: int, k: int -> _/eax: int {`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`var i/ecx: int <- copy 0`
7158 2020-11-02 08:07:07 +00:00			`var result/eax: int <- copy n`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`{`
			`compare i, k`
			`break-if->=`
			`compare i, 4 # only 4 bytes in 32 bits`
			`break-if->=`
			`result <- shift-left 8`
			`i <- increment`
			`loop`
			`}`
7158 2020-11-02 08:07:07 +00:00			`return result`
6733 - read utf-8 'grapheme' from byte stream No support for combining characters. Graphemes are currently just utf-8 encodings of a single Unicode code-point. No support for code-points that require more than 32 bits in utf-8. 2020-08-29 06:24:04 +00:00			`}`

6807 - tile: render intermediate stack state 2020-09-20 04:44:48 +00:00			`# write a grapheme to a stream of bytes`
			`# this is like write-to-stream, except we skip leading 0 bytes`
			`fn write-grapheme out: (addr stream byte), g: grapheme {`
			`$write-grapheme:body: {`
			`var c/eax: int <- copy g`
			`append-byte out, c # first byte is always written`
			`c <- shift-right 8`
			`compare c, 0`
			`break-if-= $write-grapheme:body`
			`append-byte out, c`
			`c <- shift-right 8`
			`compare c, 0`
			`break-if-= $write-grapheme:body`
			`append-byte out, c`
			`c <- shift-right 8`
			`compare c, 0`
			`break-if-= $write-grapheme:body`
			`append-byte out, c`
			`}`
			`}`