https://github.com/akkartik/mu/blob/main/403unicode.mu
  1 # Helpers for Unicode.
  2 #
  3 # Mu has no characters, only code points and graphemes.
  4 # Code points are the indivisible atoms of text streams.
  5 #   https://en.wikipedia.org/wiki/Code_point
  6 # Graphemes are the smallest self-contained unit of text.
  7 # Graphemes may consist of multiple code points.
  8 #
  9 # Mu graphemes are always represented in utf-8, and they are required to fit
 10 # in 4 bytes.
 11 #
 12 # Mu doesn't currently support combining code points, or graphemes made of
 13 # multiple code points. One day we will.
 14 # We also don't currently support code points that translate into multiple
 15 # or wide graphemes. (In particular, Tab will never be supported.)
 16 
 17 # transliterated from tb_utf8_unicode_to_char in https://github.com/nsf/termbox
 18 # https://wiki.tcl-lang.org/page/UTF%2D8+bit+by+bit explains the algorithm
 19 #
 20 # The day we want to support combining characters, this function will need to
 21 # take multiple code points. Or something.
 22 fn to-grapheme in: code-point -> _/eax: grapheme {
 23   var c/eax: int <- copy in
 24   var num-trailers/ecx: int <- copy 0
 25   var first/edx: int <- copy 0
 26   $to-grapheme:compute-length: {
 27     # single byte: just return it
 28     compare c, 0x7f
 29     {
 30       break-if->
 31       var g/eax: grapheme <- copy c
 32       return g
 33     }
 34     # 2 bytes
 35     compare c, 0x7ff
 36     {
 37       break-if->
 38       num-trailers <- copy 1
 39       first <- copy 0xc0
 40       break $to-grapheme:compute-length
 41     }
 42     # 3 bytes
 43     compare c, 0xffff
 44     {
 45       break-if->
 46       num-trailers <- copy 2
 47       first <- copy 0xe0
 48       break $to-grapheme:compute-length
 49     }
 50     # 4 bytes
 51     compare c, 0x1fffff
 52     {
 53       break-if->
 54       num-trailers <- copy 3
 55       first <- copy 0xf0
 56       break $to-grapheme:compute-length
 57     }
 58     # more than 4 bytes: unsupported
 59     # TODO: print error message to stderr
 60     compare c, 0x1fffff
 61     {
 62       break-if->
 63       return 0
 64     }
 65   }
 66   # emit trailer bytes, 6 bits from 'in', first two bits '10'
 67   var result/edi: grapheme <- copy 0
 68   {
 69     compare num-trailers, 0
 70     break-if-<=
 71     var tmp/esi: int <- copy c
 72     tmp <- and 0x3f
 73     tmp <- or 0x80
 74     result <- shift-left 8
 75     result <- or tmp
 76     # update loop state
 77     c <- shift-right 6
 78     num-trailers <- decrement
 79     loop
 80   }
 81   # emit engine
 82   result <- shift-left 8
 83   result <- or c
 84   result <- or first
 85   #
 86   return result
 87 }
 88 
 89 # TODO: bring in tests once we have check-ints-equal
 90 
 91 # read the next grapheme from a stream of bytes
 92 fn read-grapheme in: (addr stream byte) -> _/eax: grapheme {
 93   # if at eof, return EOF
 94   {
 95     var eof?/eax: boolean <- stream-empty? in
 96     compare eof?, 0/false
 97     break-if-=
 98     return 0xffffffff
 99   }
100   var c/eax: byte <- read-byte in
101   var num-trailers/ecx: int <- copy 0
102   $read-grapheme:compute-length: {
103     # single byte: just return it
104     compare c, 0xc0
105     {
106       break-if->=
107       var g/eax: grapheme <- copy c
108       return g
109     }
110     compare c, 0xfe
111     {
112       break-if-<
113       var g/eax: grapheme <- copy c
114       return g
115     }
116     # 2 bytes
117     compare c, 0xe0
118     {
119       break-if->=
120       num-trailers <- copy 1
121       break $read-grapheme:compute-length
122     }
123     # 3 bytes
124     compare c, 0xf0
125     {
126       break-if->=
127       num-trailers <- copy 2
128       break $read-grapheme:compute-length
129     }
130     # 4 bytes
131     compare c, 0xf8
132     {
133       break-if->=
134       num-trailers <- copy 3
135       break $read-grapheme:compute-length
136     }
137     # TODO: print error message
138     return 0
139   }
140   # prepend trailer bytes
141   var result/edi: grapheme <- copy c
142   var num-byte-shifts/edx: int <- copy 1
143   {
144     compare num-trailers, 0
145     break-if-<=
146     var tmp/eax: byte <- read-byte in
147     var tmp2/eax: int <- copy tmp
148     tmp2 <- shift-left-bytes tmp2, num-byte-shifts
149     result <- or tmp2
150     # update loop state
151     num-byte-shifts <- increment
152     num-trailers <- decrement
153     loop
154   }
155   return result
156 }
157 
158 # needed because available primitives only shift by a literal/constant number of bits
# Returns 'n' shifted left by 8 bits, 'k' times over. At most 4 shifts are ever
# performed; a 'k' of 0 or less performs none.
fn shift-left-bytes n: int, k: int -> _/eax: int {
  var shifts-done/ecx: int <- copy 0
  var acc/eax: int <- copy n
  {
    # a 32-bit value holds only 4 bytes, so never shift more than 4 times
    compare shifts-done, 4
    break-if->=
    # stop once the requested number of byte shifts has been applied
    compare shifts-done, k
    break-if->=
    acc <- shift-left 8
    shifts-done <- increment
    loop
  }
  return acc
}
173 
174 # write a grapheme to a stream of bytes
175 # this is like write-to-stream, except we skip leading 0 bytes
fn write-grapheme out: (addr stream byte), g: grapheme {
  # the lowest-order byte is always emitted, even when it is 0
  var rest/eax: int <- copy g
  append-byte out, rest
  # then emit up to 3 more bytes, lowest-order first, stopping as soon as
  # nothing but zeros remains above the bytes already written
  var bytes-left/ecx: int <- copy 3
  {
    compare bytes-left, 0
    break-if-<=
    rest <- shift-right 8
    compare rest, 0
    break-if-=
    append-byte out, rest
    bytes-left <- decrement
    loop
  }
}