first rendering of non-latin script

Open question: why does column 0 get cropped? The spacing also seems excessive. Are we taking up 3 grid points?
2021-08-30 00:32:15 -07:00 · 2021-08-30 00:32:15 -07:00 · c970190021
parent 877fbf640a
commit c970190021
2 changed files with 127 additions and 2 deletions
--- a/403unicode.mu
+++ b/403unicode.mu
@ -14,9 +14,88 @@
 # multiple code points. One day we will.
 #   https://en.wikipedia.org/wiki/Combining_character

+# Round-trip test: encode code points with to-grapheme, decode them again with
+# to-code-point, and report every value that doesn't come back unchanged.
+# Each failure is drawn to 0/screen as <code point>/<grapheme>/<decoded>, all
+# in hex and followed by a space. A "F - ..." header is drawn once, before the
+# first failure.
+fn test-unicode-serialization-and-deserialization {
+  var i/ebx: int <- copy 0
+  var init?/esi: boolean <- copy 1/true  # true until the first failure has been reported
+  {
+    # NOTE(review): code points below 0x10000 need at most 3 bytes of utf-8, so
+    # this loop never exercises 4-byte graphemes.
+    compare i, 0x10000  # 32 bits of utf-8 are sufficient for https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
+                        # but not emoji
+    break-if->=
+    var c/eax: code-point <- copy i
+    var _g/eax: grapheme <- to-grapheme c
+    var g/ecx: grapheme <- copy _g  # move the grapheme out of eax, which c2 takes over next
+    var c2/eax: code-point <- to-code-point g
+    compare i, c2
+    {
+      break-if-=  # round trip succeeded; nothing to report
+      # print the header only ahead of the first failure
+      {
+        compare init?, 0/false
+        break-if-=
+        draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "F - test-unicode-serialization-and-deserialization: ", 3/fg 0/bg
+      }
+      init? <- copy 0/false
+      # original code point
+      draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, i, 3/fg 0/bg
+      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
+      # its grapheme, copied to an int for printing
+      # NOTE(review): x reuses eax while c2 is still needed below; presumably the
+      # nested block restores eax on exit -- confirm against Mu's register rules.
+      {
+        var x/eax: int <- copy g
+        draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x, 3/fg 0/bg
+      }
+      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "/", 3/fg 0/bg
+      # the code point we got back
+      {
+        var x2/eax: int <- copy c2
+        draw-int32-hex-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, x2, 3/fg 0/bg
+      }
+      draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 3/fg 0/bg
+    }
+    i <- add 0xf  # to speed things up; ensure increment is not a power of 2
+    loop
+  }
+}
+
+# Decode a grapheme back into its code point.
+# A grapheme here holds the utf-8 encoding of one code point packed into 32
+# bits; the bytes are consumed starting from the lowest-order byte.
+# Transliterated from tb_utf8_char_to_unicode in https://github.com/nsf/termbox
 fn to-code-point in: grapheme -> _/eax: code-point {
-  var g/eax: grapheme <- copy in
-  var result/eax: code-point <- copy g  # TODO: support non-ASCII
+  var g/ebx: int <- copy in
+  # if single byte, just return it
+  # Compare unsigned: the last byte of a 4-byte sequence is 10xxxxxx, which
+  # sets the sign bit of g, so a signed test would treat it as a single byte.
+  {
+    compare g, 0xff
+    break-if-addr>
+    var result/eax: code-point <- copy g
+    return result
+  }
+  #
+  var len/edx: int <- grapheme-length in
+  # extract bits from first byte
+  # (the number of payload bits in the first byte depends on the length)
+  var b/eax: byte <- copy-byte g
+  var result/edi: code-point <- copy b
+  {
+    compare len, 2
+    break-if-!=
+    result <- and 0x1f  # 110xxxxx
+  }
+  {
+    compare len, 3
+    break-if-!=
+    result <- and 0x0f  # 1110xxxx
+  }
+  {
+    compare len, 4
+    break-if-!=
+    result <- and 0x07  # 11110xxx
+  }
+  # extract bits from remaining bytes
+  # (each contributes its low 6 bits: 10xxxxxx)
+  g <- shift-right 8
+  var i/ecx: int <- copy 1
+  {
+    compare i, len
+    break-if->=
+    var b/eax: byte <- copy-byte g
+    b <- and 0x3f
+    result <- shift-left 6
+    result <- or b
+    g <- shift-right 8
+    i <- increment
+    loop
+  }
  return result
 }

@ -220,6 +299,25 @@ fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
  return result
 }

+# Number of utf-8 bytes (1 to 4) packed into grapheme 'g', judged by the
+# highest non-zero byte of its 32 bits.
+# All comparisons are unsigned: a 4-byte grapheme has its top bit set, and a
+# signed comparison would see it as negative and report a length of 1.
+fn grapheme-length g: grapheme -> _/edx: int {
+  {
+    compare g, 0xff
+    break-if-addr>
+    return 1
+  }
+  {
+    compare g, 0xffff
+    break-if-addr>
+    return 2
+  }
+  {
+    compare g, 0xffffff
+    break-if-addr>
+    return 3
+  }
+  return 4
+}
+
 # needed because available primitives only shift by a literal/constant number of bits
 fn shift-left-bytes n: int, k: int -> _/eax: int {
  var i/ecx: int <- copy 0
--- a/apps/ex14.mu
+++ b/apps/ex14.mu
@ -0,0 +1,27 @@
+# Unicode demo
+#
+# Mu can't read Unicode from keyboard yet, so we'll read from disk and print
+# to screen.
+#
+# Steps for trying it out:
+#   1. Translate this example into a disk image code.img.
+#       ./translate apps/ex14.mu
+#   2. Build a second disk image data.img containing some Unicode text.
+#       dd if=/dev/zero of=data.img count=20160
+#       echo 'நட' |dd of=data.img conv=notrunc
+#   3. Run:
+#       qemu-system-i386 -hda code.img -hdb data.img
+#
+# Expected output: 'நட' in green near the top-left corner of screen
+#
+# Limitations:
+#   - Utf-8 is the one true encoding.
+#   - No keyboard support yet.
+#   - Just single-code-point graphemes so far. No combining characters, etc.
+
+# Read the first sector of the data disk and draw its contents as text on the
+# top row of the screen.
+fn main screen: (addr screen), keyboard: (addr keyboard), data-disk: (addr disk) {
+  # buffer for the bytes read off disk
+  var sector-storage: (stream byte 0x200)
+  var sector/esi: (addr stream byte) <- address sector-storage
+  # 1 sector, starting at lba 0
+  load-sectors data-disk, 0/lba, 1/num-sectors, sector
+  # draw in color 0xa on 0, starting at x=1 on row 0; the result is not needed
+  var ignored/eax: int <- draw-stream-rightward screen, sector, 1/x 0x80/xmax 0/y, 0xa/fg, 0/bg
+}