# Helpers for Unicode.
#
# Mu has no characters, only code points and graphemes.
# Code points are the indivisible atoms of text streams.
#   https://en.wikipedia.org/wiki/Code_point
# Graphemes are the smallest self-contained unit of text.
# Graphemes may consist of multiple code points.
#
# Mu graphemes are always represented in utf-8, and they are required to fit
# in 4 bytes. (This can be confusing if you focus just on ASCII, where Mu's
# graphemes and code-points are identical.)
#
# Mu doesn't currently support combining code points, or graphemes made of
# multiple code points. One day we will.
#   https://en.wikipedia.org/wiki/Combining_character

# Convert a grapheme to a code point.
# Currently just reinterprets the bits, which is only correct for ASCII
# (where the utf-8 encoding and the code point coincide).
fn to-code-point in: grapheme -> _/eax: code-point {
  var g/eax: grapheme <- copy in
  var result/eax: code-point <- copy g  # TODO: support non-ASCII
  return result
}

# Encode a code point as a utf-8 grapheme packed into 32 bits.
# The first (leading) utf-8 byte ends up in the lowest byte of the result, and
# each subsequent trailer byte in the next higher byte.
# Returns 0 for code points that would need more than 4 bytes. (Note that this
# is indistinguishable from the encoding of code point 0.)
#
# transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
# https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
fn to-grapheme in: code-point -> _/eax: grapheme {
  var c/eax: int <- copy in
  var num-trailers/ecx: int <- copy 0
  var first/edx: int <- copy 0
  $to-grapheme:compute-length: {
    # single byte: just return it
    compare c, 0x7f
    {
      break-if->
      var g/eax: grapheme <- copy c
      return g
    }
    # 2 bytes
    compare c, 0x7ff
    {
      break-if->
      num-trailers <- copy 1
      first <- copy 0xc0
      break $to-grapheme:compute-length
    }
    # 3 bytes
    compare c, 0xffff
    {
      break-if->
      num-trailers <- copy 2
      first <- copy 0xe0
      break $to-grapheme:compute-length
    }
    # 4 bytes
    compare c, 0x1fffff
    {
      break-if->
      num-trailers <- copy 3
      first <- copy 0xf0
      break $to-grapheme:compute-length
    }
    # more than 4 bytes: unsupported
    # Every supported length has already left this block, so anything that
    # reaches here is too large to encode. Bail out unconditionally rather
    # than falling through and emitting the raw code point as a 'grapheme'.
    # TODO: print error message to stderr
    return 0
  }
  # emit trailer bytes, 6 bits from 'in', first two bits '10'
  # We peel off the lowest 6 bits first, so the last trailer of the utf-8
  # sequence is emitted first and gets shifted into the highest byte.
  var result/edi: grapheme <- copy 0
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/esi: int <- copy c
    tmp <- and 0x3f
    tmp <- or 0x80
    result <- shift-left 8
    result <- or tmp
    # update loop state
    c <- shift-right 6
    num-trailers <- decrement
    loop
  }
  # emit engine: the leading byte, made of the length marker in 'first' and
  # whatever bits of 'c' remain after the trailers
  result <- shift-left 8
  result <- or c
  result <- or first
  return result
}

# TODO: bring in tests once we have check-ints-equal

# read the next grapheme from a stream of bytes
# Returns 0xffffffff if the stream is empty, and 0 if the first byte is not a
# valid start of a 1-4 byte sequence (0xf8 through 0xfd).
# Bytes below 0xc0 and bytes 0xfe/0xff are returned unchanged as single-byte
# graphemes.
fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
  # if at eof, return EOF
  {
    var eof?/eax: boolean <- stream-empty? in
    compare eof?, 0/false
    break-if-=
    return 0xffffffff
  }
  var c/eax: byte <- read-byte in
  var num-trailers/ecx: int <- copy 0
  $read-grapheme:compute-length: {
    # single byte: just return it
    compare c, 0xc0
    {
      break-if->=
      var g/eax: grapheme <- copy c
      return g
    }
    compare c, 0xfe
    {
      break-if-<
      var g/eax: grapheme <- copy c
      return g
    }
    # 2 bytes
    compare c, 0xe0
    {
      break-if->=
      num-trailers <- copy 1
      break $read-grapheme:compute-length
    }
    # 3 bytes
    compare c, 0xf0
    {
      break-if->=
      num-trailers <- copy 2
      break $read-grapheme:compute-length
    }
    # 4 bytes
    compare c, 0xf8
    {
      break-if->=
      num-trailers <- copy 3
      break $read-grapheme:compute-length
    }
    # TODO: print error message
    return 0
  }
  # prepend trailer bytes
  # The first byte stays in the lowest byte of the result; the k-th trailer
  # read from the stream is placed k bytes above it.
  # NOTE(review): trailers are read without checking for eof; behavior on a
  # truncated sequence depends on read-byte -- confirm.
  var result/edi: grapheme <- copy c
  var num-byte-shifts/edx: int <- copy 1
  {
    compare num-trailers, 0
    break-if-<=
    var tmp/eax: byte <- read-byte in
    var tmp2/eax: int <- copy tmp
    tmp2 <- shift-left-bytes tmp2, num-byte-shifts
    result <- or tmp2
    # update loop state
    num-byte-shifts <- increment
    num-trailers <- decrement
    loop
  }
  return result
}

# Shift 'n' left by 'k' bytes, performing at most 4 single-byte shifts.
# needed because available primitives only shift by a literal/constant number of bits
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var i/ecx: int <- copy 0
  var result/eax: int <- copy n
  {
    compare i, k
    break-if->=
    compare i, 4  # only 4 bytes in 32 bits
    break-if->=
    result <- shift-left 8
    i <- increment
    loop
  }
  return result
}

# write a grapheme to a stream of bytes
# this is like write-to-stream, except we skip leading 0 bytes
# Bytes are appended starting from the lowest byte of 'g'; we stop as soon as
# everything remaining above the bytes already written is zero.
fn write-grapheme out: (addr stream byte), g: grapheme {
$write-grapheme:body: {
  var c/eax: int <- copy g
  append-byte out, c  # first byte is always written
  c <- shift-right 8
  compare c, 0
  break-if-= $write-grapheme:body
  append-byte out, c
  c <- shift-right 8
  compare c, 0
  break-if-= $write-grapheme:body
  append-byte out, c
  c <- shift-right 8
  compare c, 0
  break-if-= $write-grapheme:body
  append-byte out, c
}
}