mu/shell/tokenize.mu

1106 lines
37 KiB
Forth
Raw Normal View History

2021-06-19 04:41:17 +00:00
# The language is indent-sensitive.
# Each line consists of an initial indent token followed by other tokens.
2021-06-17 16:03:24 +00:00
# A single lexical unit produced by the tokenizer.
# 'type' selects which of the two payload fields below is meaningful.
type token {
type: int
2021-06-18 14:53:04 +00:00
# type 0: default
# type 1: stream
2021-06-17 16:03:24 +00:00
# text payload; filled in by next-symbol-token, next-bracket-token and next-stream-token
text-data: (handle stream byte)
# type 2: skip (end of line or end of file)
2021-06-19 04:41:17 +00:00
# type 3: indent
# numeric payload; next-indent-token stores the count of leading spaces here
number-data: int
2021-06-17 16:03:24 +00:00
}
2021-02-27 15:21:29 +00:00
2021-06-17 16:03:24 +00:00
# Scan all of 'in' from the beginning and append the resulting tokens to 'out'.
# Skip tokens are dropped. Returns early (without emitting anything further)
# as soon as 'trace' reports errors.
fn tokenize in: (addr gap-buffer), out: (addr stream token), trace: (addr trace) {
  trace-text trace, "tokenize", "tokenize"
  trace-lower trace
  rewind-gap-buffer in
  # threaded through successive calls to next-token
  var line-start?/edi: boolean <- copy 1/true
  {
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    # scan one token into fresh storage
    var curr-storage: token
    var curr/edx: (addr token) <- address curr-storage
    line-start? <- next-token in, curr, line-start?, trace
    # bail on the first error
    {
      var failed?/eax: boolean <- has-errors? trace
      compare failed?, 0/false
      break-if-=
      return
    }
    # emit everything except skip tokens
    {
      var ignorable?/eax: boolean <- skip-token? curr
      compare ignorable?, 0/false
      break-if-!=
      write-to-stream out, curr  # shallow-copy text-data
    }
    loop
  }
  trace-higher trace
}
# A line starting with a number yields an indent token (0 spaces) followed by
# a number token holding the digits.
fn test-tokenize-number {
  # input
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, "123 a"
  # output
  var tokens-storage: (stream token 0x10)
  var tokens/edi: (addr stream token) <- address tokens-storage
  # trace
  var tr-storage: trace
  var tr/edx: (addr trace) <- address tr-storage
  initialize-trace tr, 1/only-errors, 0x10/capacity, 0/visible
  tokenize buf, tokens, tr
  # first token: indent
  var tok-storage: token
  var tok/ebx: (addr token) <- address tok-storage
  read-from-stream tokens, tok
  var tok-type/eax: (addr int) <- get tok, type
  check-ints-equal *tok-type, 3/indent, "F - test-tokenize-number/before-indent-type"
  var indent-width/eax: (addr int) <- get tok, number-data
  check-ints-equal *indent-width, 0/spaces, "F - test-tokenize-number/before-indent"
  # second token: the number
  read-from-stream tokens, tok
  var is-number?/eax: boolean <- number-token? tok
  check is-number?, "F - test-tokenize-number"
  var text-ah/eax: (addr handle stream byte) <- get tok, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  check-stream-equal text, "123", "F - test-tokenize-number: value"
}
# A leading minus sign stays attached to the number token that follows the
# initial indent token.
fn test-tokenize-negative-number {
  # input
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, "-123 a"
  # output
  var tokens-storage: (stream token 0x10)
  var tokens/edi: (addr stream token) <- address tokens-storage
  # trace
  var tr-storage: trace
  var tr/edx: (addr trace) <- address tr-storage
  initialize-trace tr, 1/only-errors, 0x10/capacity, 0/visible
  tokenize buf, tokens, tr
  # first token: indent
  var tok-storage: token
  var tok/ebx: (addr token) <- address tok-storage
  read-from-stream tokens, tok
  var tok-type/eax: (addr int) <- get tok, type
  check-ints-equal *tok-type, 3/indent, "F - test-tokenize-negative-number/before-indent-type"
  var indent-width/eax: (addr int) <- get tok, number-data
  check-ints-equal *indent-width, 0/spaces, "F - test-tokenize-negative-number/before-indent"
  # second token: the negative number
  read-from-stream tokens, tok
  var is-number?/eax: boolean <- number-token? tok
  check is-number?, "F - test-tokenize-negative-number"
  var text-ah/eax: (addr handle stream byte) <- get tok, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  check-stream-equal text, "-123", "F - test-tokenize-negative-number: value"
}
# A single quote before a parenthesized list becomes its own token.
fn test-tokenize-quote {
  # input
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, "'(a)"
  # output
  var tokens-storage: (stream token 0x10)
  var tokens/edi: (addr stream token) <- address tokens-storage
  # trace
  var tr-storage: trace
  var tr/edx: (addr trace) <- address tr-storage
  initialize-trace tr, 1/only-errors, 0x10/capacity, 0/visible
  tokenize buf, tokens, tr
  # first token: indent
  var tok-storage: token
  var tok/ebx: (addr token) <- address tok-storage
  read-from-stream tokens, tok
  var tok-type/eax: (addr int) <- get tok, type
  check-ints-equal *tok-type, 3/indent, "F - test-tokenize-quote/before-indent-type"
  var indent-width/eax: (addr int) <- get tok, number-data
  check-ints-equal *indent-width, 0/spaces, "F - test-tokenize-quote/before-indent"
  # then: ' ( a )
  read-from-stream tokens, tok
  var is-quote?/eax: boolean <- quote-token? tok
  check is-quote?, "F - test-tokenize-quote: quote"
  read-from-stream tokens, tok
  var is-open?/eax: boolean <- open-paren-token? tok
  check is-open?, "F - test-tokenize-quote: open paren"
  read-from-stream tokens, tok  # skip a
  read-from-stream tokens, tok
  var is-close?/eax: boolean <- close-paren-token? tok
  check is-close?, "F - test-tokenize-quote: close paren"
}
# A backquote before a parenthesized list becomes its own token.
fn test-tokenize-backquote {
  # input
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, "`(a)"
  # output
  var tokens-storage: (stream token 0x10)
  var tokens/edi: (addr stream token) <- address tokens-storage
  # trace
  var tr-storage: trace
  var tr/edx: (addr trace) <- address tr-storage
  initialize-trace tr, 1/only-errors, 0x10/capacity, 0/visible
  tokenize buf, tokens, tr
  # first token: indent
  var tok-storage: token
  var tok/ebx: (addr token) <- address tok-storage
  read-from-stream tokens, tok
  var tok-type/eax: (addr int) <- get tok, type
  check-ints-equal *tok-type, 3/indent, "F - test-tokenize-backquote/before-indent-type"
  var indent-width/eax: (addr int) <- get tok, number-data
  check-ints-equal *indent-width, 0/spaces, "F - test-tokenize-backquote/before-indent"
  # then: ` ( a )
  read-from-stream tokens, tok
  var is-backquote?/eax: boolean <- backquote-token? tok
  check is-backquote?, "F - test-tokenize-backquote: backquote"
  read-from-stream tokens, tok
  var is-open?/eax: boolean <- open-paren-token? tok
  check is-open?, "F - test-tokenize-backquote: open paren"
  read-from-stream tokens, tok  # skip a
  read-from-stream tokens, tok
  var is-close?/eax: boolean <- close-paren-token? tok
  check is-close?, "F - test-tokenize-backquote: close paren"
}
# A comma before a parenthesized list becomes its own (unquote) token.
fn test-tokenize-unquote {
  # input
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, ",(a)"
  # output
  var tokens-storage: (stream token 0x10)
  var tokens/edi: (addr stream token) <- address tokens-storage
  # trace
  var tr-storage: trace
  var tr/edx: (addr trace) <- address tr-storage
  initialize-trace tr, 1/only-errors, 0x10/capacity, 0/visible
  tokenize buf, tokens, tr
  # first token: indent
  var tok-storage: token
  var tok/ebx: (addr token) <- address tok-storage
  read-from-stream tokens, tok
  var tok-type/eax: (addr int) <- get tok, type
  check-ints-equal *tok-type, 3/indent, "F - test-tokenize-unquote/before-indent-type"
  var indent-width/eax: (addr int) <- get tok, number-data
  check-ints-equal *indent-width, 0/spaces, "F - test-tokenize-unquote/before-indent"
  # then: , ( a )
  read-from-stream tokens, tok
  var is-unquote?/eax: boolean <- unquote-token? tok
  check is-unquote?, "F - test-tokenize-unquote: unquote"
  read-from-stream tokens, tok
  var is-open?/eax: boolean <- open-paren-token? tok
  check is-open?, "F - test-tokenize-unquote: open paren"
  read-from-stream tokens, tok  # skip a
  read-from-stream tokens, tok
  var is-close?/eax: boolean <- close-paren-token? tok
  check is-close?, "F - test-tokenize-unquote: close paren"
}
# A comma immediately followed by an at-sign becomes a single unquote-splice
# token.
fn test-tokenize-unquote-splice {
  # input
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, ",@a"
  # output
  var tokens-storage: (stream token 0x10)
  var tokens/edi: (addr stream token) <- address tokens-storage
  # trace
  var tr-storage: trace
  var tr/edx: (addr trace) <- address tr-storage
  initialize-trace tr, 1/only-errors, 0x10/capacity, 0/visible
  tokenize buf, tokens, tr
  # first token: indent
  var tok-storage: token
  var tok/ebx: (addr token) <- address tok-storage
  read-from-stream tokens, tok
  var tok-type/eax: (addr int) <- get tok, type
  check-ints-equal *tok-type, 3/indent, "F - test-tokenize-unquote-splice/before-indent-type"
  var indent-width/eax: (addr int) <- get tok, number-data
  check-ints-equal *indent-width, 0/spaces, "F - test-tokenize-unquote-splice/before-indent"
  # second token: ,@
  read-from-stream tokens, tok
  var is-splice?/eax: boolean <- unquote-splice-token? tok
  check is-splice?, "F - test-tokenize-unquote-splice: unquote-splice"
}
2021-04-15 15:48:53 +00:00
# The dot in a dotted pair is tokenized separately from its neighbors.
fn test-tokenize-dotted-list {
  # input
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, "(a . b)"
  # output
  var tokens-storage: (stream token 0x10)
  var tokens/edi: (addr stream token) <- address tokens-storage
  # trace
  var tr-storage: trace
  var tr/edx: (addr trace) <- address tr-storage
  initialize-trace tr, 1/only-errors, 0x10/capacity, 0/visible
  tokenize buf, tokens, tr
  # first token: indent
  var tok-storage: token
  var tok/ebx: (addr token) <- address tok-storage
  read-from-stream tokens, tok
  var tok-type/eax: (addr int) <- get tok, type
  check-ints-equal *tok-type, 3/indent, "F - test-tokenize-dotted-list/before-indent-type"
  var indent-width/eax: (addr int) <- get tok, number-data
  check-ints-equal *indent-width, 0/spaces, "F - test-tokenize-dotted-list/before-indent"
  # then: ( a . b )
  read-from-stream tokens, tok
  var is-open?/eax: boolean <- open-paren-token? tok
  check is-open?, "F - test-tokenize-dotted-list: open paren"
  read-from-stream tokens, tok  # skip a
  read-from-stream tokens, tok
  var is-dot?/eax: boolean <- dot-token? tok
  check is-dot?, "F - test-tokenize-dotted-list: dot"
  read-from-stream tokens, tok  # skip b
  read-from-stream tokens, tok
  var is-close?/eax: boolean <- close-paren-token? tok
  check is-close?, "F - test-tokenize-dotted-list: close paren"
}
# Everything between square brackets becomes a single stream token whose text
# excludes the brackets; nothing else is emitted after it.
fn test-tokenize-stream-literal {
  # input
  var buf-storage: gap-buffer
  var buf/esi: (addr gap-buffer) <- address buf-storage
  initialize-gap-buffer-with buf, "[abc def]"
  # output
  var tokens-storage: (stream token 0x10)
  var tokens/edi: (addr stream token) <- address tokens-storage
  # trace
  var tr-storage: trace
  var tr/edx: (addr trace) <- address tr-storage
  initialize-trace tr, 1/only-errors, 0x10/capacity, 0/visible
  tokenize buf, tokens, tr
  # first token: indent
  var tok-storage: token
  var tok/ebx: (addr token) <- address tok-storage
  read-from-stream tokens, tok
  var tok-type/eax: (addr int) <- get tok, type
  check-ints-equal *tok-type, 3/indent, "F - test-tokenize-stream-literal/before-indent-type"
  var indent-width/eax: (addr int) <- get tok, number-data
  check-ints-equal *indent-width, 0/spaces, "F - test-tokenize-stream-literal/before-indent"
  # second token: the stream literal
  read-from-stream tokens, tok
  var is-stream?/eax: boolean <- stream-token? tok
  check is-stream?, "F - test-tokenize-stream-literal: type"
  var text-ah/eax: (addr handle stream byte) <- get tok, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  var same-text?/eax: boolean <- stream-data-equal? text, "abc def"
  check same-text?, "F - test-tokenize-stream-literal"
  # no further tokens
  var drained?/eax: boolean <- stream-empty? tokens
  check drained?, "F - test-tokenize-stream-literal: empty?"
}
# A stream literal nested inside parens yields: indent, open paren, stream,
# close paren, and nothing else.
#
# The paren checks use open-paren-token?/close-paren-token? (like the other
# tests in this file) rather than the looser bracket-token?, so that each
# check verifies what its failure message claims.
fn test-tokenize-stream-literal-in-tree {
  # input
  var in-storage: gap-buffer
  var in/esi: (addr gap-buffer) <- address in-storage
  initialize-gap-buffer-with in, "([abc def])"
  # output
  var stream-storage: (stream token 0x10)
  var stream/edi: (addr stream token) <- address stream-storage
  # trace
  var trace-storage: trace
  var trace/edx: (addr trace) <- address trace-storage
  initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
  tokenize in, stream, trace
  # first token: indent
  var curr-token-storage: token
  var curr-token/ebx: (addr token) <- address curr-token-storage
  read-from-stream stream, curr-token
  var curr-token-type/eax: (addr int) <- get curr-token, type
  check-ints-equal *curr-token-type, 3/indent, "F - test-tokenize-stream-literal-in-tree/before-indent-type"
  var curr-token-data/eax: (addr int) <- get curr-token, number-data
  check-ints-equal *curr-token-data, 0/spaces, "F - test-tokenize-stream-literal-in-tree/before-indent"
  # open paren
  read-from-stream stream, curr-token
  var open-paren?/eax: boolean <- open-paren-token? curr-token
  check open-paren?, "F - test-tokenize-stream-literal-in-tree: open paren"
  # stream literal
  read-from-stream stream, curr-token
  var stream?/eax: boolean <- stream-token? curr-token
  check stream?, "F - test-tokenize-stream-literal-in-tree: type"
  var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
  var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
  var data-equal?/eax: boolean <- stream-data-equal? curr-token-data, "abc def"
  check data-equal?, "F - test-tokenize-stream-literal-in-tree"
  # close paren
  read-from-stream stream, curr-token
  var close-paren?/eax: boolean <- close-paren-token? curr-token
  check close-paren?, "F - test-tokenize-stream-literal-in-tree: close paren"
  # no further tokens
  var empty?/eax: boolean <- stream-empty? stream
  check empty?, "F - test-tokenize-stream-literal-in-tree: empty?"
}
2021-06-19 04:41:17 +00:00
# Each line starts with an indent token recording its number of leading
# spaces: 0 for the first line here, 2 for the second.
#
# The second line of the input must begin with exactly two spaces to match
# the 2/spaces expectation below.
fn test-tokenize-indent {
  # input
  var in-storage: gap-buffer
  var in/esi: (addr gap-buffer) <- address in-storage
  initialize-gap-buffer-with in, "abc\n  def"
  # output
  var stream-storage: (stream token 0x10)
  var stream/edi: (addr stream token) <- address stream-storage
  # trace
  var trace-storage: trace
  var trace/edx: (addr trace) <- address trace-storage
  initialize-trace trace, 1/only-errors, 0x10/capacity, 0/visible
  tokenize in, stream, trace
  # line 1: indent of 0 spaces
  var curr-token-storage: token
  var curr-token/ebx: (addr token) <- address curr-token-storage
  read-from-stream stream, curr-token
  var curr-token-type/eax: (addr int) <- get curr-token, type
  check-ints-equal *curr-token-type, 3/indent, "F - test-tokenize-indent/before-indent-type"
  var curr-token-data/eax: (addr int) <- get curr-token, number-data
  check-ints-equal *curr-token-data, 0/spaces, "F - test-tokenize-indent/before-indent"
  # line 1: abc
  read-from-stream stream, curr-token
  var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
  var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
  check-stream-equal curr-token-data, "abc", "F - test-tokenize-indent/before"
  # line 2: indent of 2 spaces
  read-from-stream stream, curr-token
  var curr-token-type/eax: (addr int) <- get curr-token, type
  check-ints-equal *curr-token-type, 3/indent, "F - test-tokenize-indent/type"
  var curr-token-data/eax: (addr int) <- get curr-token, number-data
  check-ints-equal *curr-token-data, 2/spaces, "F - test-tokenize-indent"
  # line 2: def
  read-from-stream stream, curr-token
  var curr-token-data-ah/eax: (addr handle stream byte) <- get curr-token, text-data
  var curr-token-data/eax: (addr stream byte) <- lookup *curr-token-data-ah
  check-stream-equal curr-token-data, "def", "F - test-tokenize-indent/after"
}
2021-06-19 03:00:29 +00:00
# caller is responsible for threading start-of-line? between calls to next-token
# 'in' may contain whitespace if start-of-line?
# Scan a single token from 'in' into 'out'.
# Returns the start-of-line? value to pass to the next call:
#   1 after consuming a newline or reaching the end of 'in'
#   0 after returning an indent token
#   the incoming start-of-line? otherwise
# On an unbalanced ']' an error is recorded in 'trace' and 'out' is not a
# meaningful token.
fn next-token in: (addr gap-buffer), out: (addr token), start-of-line?: boolean, trace: (addr trace) -> _/edi: boolean {
  trace-text trace, "tokenize", "next-token"
  trace-lower trace
  # at the start of a line, provisionally write an indent token to 'out'
  {
    compare start-of-line?, 0/false
    break-if-=
    next-indent-token in, out, trace  # might not be returned
  }
  skip-spaces-from-gap-buffer in
  # comments run until the end of the line
  var maybe-comment/eax: grapheme <- peek-from-gap-buffer in
  {
    compare maybe-comment, 0x23/comment
    break-if-!=
    skip-rest-of-line in
  }
  # newline: consume it and emit a skip token
  var maybe-newline/eax: grapheme <- peek-from-gap-buffer in
  {
    compare maybe-newline, 0xa/newline
    break-if-!=
    trace-text trace, "tokenize", "newline"
    maybe-newline <- read-from-gap-buffer in
    initialize-skip-token out  # might drop indent if that's all there was in this line
    trace-higher trace
    return 1/at-start-of-line
  }
  # no newline at the start of a line: the saved indent is the result
  {
    compare start-of-line?, 0/false
    break-if-=
    trace-higher trace
    return 0/not-at-start-of-line
  }
  # nothing left to scan: emit a skip token
  {
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-=
    trace-text trace, "tokenize", "end"
    initialize-skip-token out
    trace-higher trace
    return 1/at-start-of-line
  }
  # keep the upcoming grapheme in ecx; eax gets reused below
  var _upcoming/eax: grapheme <- peek-from-gap-buffer in
  var upcoming/ecx: grapheme <- copy _upcoming
  {
    var tracing?/eax: boolean <- should-trace? trace
    compare tracing?, 0/false
    break-if-=
    var msg-storage: (stream byte 0x40)
    var msg/esi: (addr stream byte) <- address msg-storage
    write msg, "next: "
    var code/eax: int <- copy upcoming
    write-int32-hex msg, code
    trace trace, "tokenize", msg
  }
  # dispatch on the upcoming grapheme; the first matching case wins
  $next-token:case: {
    # '[' starts a stream literal
    {
      compare upcoming, 0x5b/open-square-bracket
      break-if-!=
      var open-bracket/eax: grapheme <- read-from-gap-buffer in  # skip open bracket
      next-stream-token in, out, trace
      break $next-token:case
    }
    # symbol graphemes start a symbol
    {
      var starts-symbol?/eax: boolean <- symbol-grapheme? upcoming
      compare starts-symbol?, 0/false
      break-if-=
      next-symbol-token in, out, trace
      break $next-token:case
    }
    # a ']' outside any stream literal is an error
    {
      compare upcoming, 0x5d/close-square-bracket
      break-if-!=
      error trace, "unbalanced ']'"
      return start-of-line?
    }
    # every other bracket is a token by itself
    {
      var starts-bracket?/eax: boolean <- bracket-grapheme? upcoming
      compare starts-bracket?, 0/false
      break-if-=
      var bracket/eax: grapheme <- read-from-gap-buffer in
      next-bracket-token bracket, out, trace
      break $next-token:case
    }
    # quote
    {
      compare upcoming, 0x27/single-quote
      break-if-!=
      var quote/eax: grapheme <- read-from-gap-buffer in  # consume
      initialize-token out, "'"
      break $next-token:case
    }
    # backquote
    {
      compare upcoming, 0x60/backquote
      break-if-!=
      var backquote/eax: grapheme <- read-from-gap-buffer in  # consume
      initialize-token out, "`"
      break $next-token:case
    }
    # unquote, possibly followed by '@' for unquote-splice
    {
      compare upcoming, 0x2c/comma
      break-if-!=
      var after-comma/eax: grapheme <- read-from-gap-buffer in  # consume
      {
        after-comma <- peek-from-gap-buffer in
        compare after-comma, 0x40/at-sign
        break-if-!=
        after-comma <- read-from-gap-buffer in
        initialize-token out, ",@"
        break $next-token:case
      }
      initialize-token out, ","
      break $next-token:case
    }
    # no case matched: print the offending grapheme and give up
    set-cursor-position 0/screen, 0x40 0x20
    {
      var offender/eax: int <- copy upcoming
      draw-int32-decimal-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, offender, 7/fg 0/bg
    }
    abort "unknown token type"
  }
  trace-higher trace
  # record the text of the token just scanned
  {
    var tracing?/eax: boolean <- should-trace? trace
    compare tracing?, 0/false
    break-if-=
    var summary-storage: (stream byte 0x400)  # maximum possible token size (next-stream-token)
    var summary/eax: (addr stream byte) <- address summary-storage
    write summary, "=> "
    write-token-text-data summary, out
    trace trace, "tokenize", summary
  }
  return start-of-line?
}
# Copy the run of symbol graphemes at the front of 'in' into a freshly
# populated text-data stream in '_out'. Stops at the first non-symbol
# grapheme (left unconsumed) or at the end of 'in'.
fn next-symbol-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
  trace-text trace, "tokenize", "looking for a symbol"
  trace-lower trace
  var dest/eax: (addr token) <- copy _out
  var text-ah/eax: (addr handle stream byte) <- get dest, text-data
  populate-stream text-ah, 0x40/max-symbol-size
  var _text/eax: (addr stream byte) <- lookup *text-ah
  var text/edi: (addr stream byte) <- copy _text
  $next-symbol-token:loop: {
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    var candidate/eax: grapheme <- peek-from-gap-buffer in
    {
      # the inner block restores eax (candidate) before the branch below
      {
        var tracing?/eax: boolean <- should-trace? trace
        compare tracing?, 0/false
      }
      break-if-=
      var msg-storage: (stream byte 0x40)
      var msg/esi: (addr stream byte) <- address msg-storage
      write msg, "next: "
      var code/eax: int <- copy candidate
      write-int32-hex msg, code
      trace trace, "tokenize", msg
    }
    # a non-symbol grapheme ends the token
    {
      var part-of-symbol?/eax: boolean <- symbol-grapheme? candidate
      compare part-of-symbol?, 0/false
      break-if-!=
      trace-text trace, "tokenize", "stop"
      break $next-symbol-token:loop
    }
    var consumed/eax: grapheme <- read-from-gap-buffer in
    write-grapheme text, consumed
    loop
  }
  trace-higher trace
  # record the symbol just scanned
  {
    var tracing?/eax: boolean <- should-trace? trace
    compare tracing?, 0/false
    break-if-=
    var summary-storage: (stream byte 0x40)
    var summary/esi: (addr stream byte) <- address summary-storage
    write summary, "=> "
    rewind-stream text
    write-stream summary, text
    trace trace, "tokenize", summary
  }
}
# Copy a decimal number (an optional leading '-' followed by digits) from the
# front of 'in' into a freshly populated text-data stream in '_out'.
# Stops at the first non-symbol grapheme (left unconsumed) or at the end of
# 'in'. A symbol grapheme that isn't a decimal digit records the error
# "invalid number" in 'trace' and returns immediately.
fn next-number-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
  trace-text trace, "tokenize", "looking for a number"
  trace-lower trace
  var out/eax: (addr token) <- copy _out
  var out-data-ah/eax: (addr handle stream byte) <- get out, text-data
  populate-stream out-data-ah, 0x40
  var _out-data/eax: (addr stream byte) <- lookup *out-data-ah
  var out-data/edi: (addr stream byte) <- copy _out-data
  # consume a leading minus sign, but only if there is one; anything else
  # must go through the validation in the loop below
  $next-number-token:check-minus: {
    var g/eax: grapheme <- peek-from-gap-buffer in
    compare g, 0x2d/minus
    break-if-!=
    g <- read-from-gap-buffer in  # consume
    write-grapheme out-data, g
  }
  $next-number-token:loop: {
    var done?/eax: boolean <- gap-buffer-scan-done? in
    compare done?, 0/false
    break-if-!=
    var g/eax: grapheme <- peek-from-gap-buffer in
    {
      # the inner block restores eax (g) before the branch below
      {
        var should-trace?/eax: boolean <- should-trace? trace
        compare should-trace?, 0/false
      }
      break-if-=
      var stream-storage: (stream byte 0x40)
      var stream/esi: (addr stream byte) <- address stream-storage
      write stream, "next: "
      var gval/eax: int <- copy g
      write-int32-hex stream, gval
      trace trace, "tokenize", stream
    }
    # if not symbol grapheme, return
    {
      var symbol-grapheme?/eax: boolean <- symbol-grapheme? g
      compare symbol-grapheme?, 0/false
      break-if-!=
      trace-text trace, "tokenize", "stop"
      break $next-number-token:loop
    }
    # if not digit grapheme, abort
    {
      var digit?/eax: boolean <- decimal-digit? g
      compare digit?, 0/false
      break-if-!=
      error trace, "invalid number"
      return
    }
    trace-text trace, "tokenize", "append"
    var g/eax: grapheme <- read-from-gap-buffer in
    write-grapheme out-data, g
    loop
  }
  trace-higher trace
}
# Scan a stream literal into '_out'. The caller has already consumed the
# opening '['. Every grapheme up to the matching ']' is copied into a freshly
# populated text-data stream; the ']' itself is consumed but not stored.
# Running out of input first records the error "unbalanced '['" in 'trace'.
fn next-stream-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
  trace-text trace, "tokenize", "stream"
  var dest/edi: (addr token) <- copy _out
  var dest-type/eax: (addr int) <- get dest, type
  copy-to *dest-type, 1/stream
  var text-ah/eax: (addr handle stream byte) <- get dest, text-data
  # stream tokens contain whole function definitions on boot, so we always
  # give them plenty of space
  populate-stream text-ah, 0x400/max-definition-size=1KB
  var _text/eax: (addr stream byte) <- lookup *text-ah
  var text/edi: (addr stream byte) <- copy _text
  {
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    {
      compare exhausted?, 0/false
      break-if-=
      error trace, "unbalanced '['"
      return
    }
    var curr/eax: grapheme <- read-from-gap-buffer in
    compare curr, 0x5d/close-square-bracket
    break-if-=
    write-grapheme text, curr
    loop
  }
  # record the contents of the literal
  {
    var tracing?/eax: boolean <- should-trace? trace
    compare tracing?, 0/false
    break-if-=
    var summary-storage: (stream byte 0x400)  # max-definition-size
    var summary/esi: (addr stream byte) <- address summary-storage
    write summary, "=> "
    rewind-stream text
    write-stream-immutable summary, text
    trace trace, "tokenize", summary
  }
}
# Store the single grapheme 'g' as the text of '_out', in a freshly populated
# text-data stream.
fn next-bracket-token g: grapheme, _out: (addr token), trace: (addr trace) {
  trace-text trace, "tokenize", "bracket"
  var dest/eax: (addr token) <- copy _out
  var text-ah/eax: (addr handle stream byte) <- get dest, text-data
  populate-stream text-ah, 0x40
  var _text/eax: (addr stream byte) <- lookup *text-ah
  var text/edi: (addr stream byte) <- copy _text
  write-grapheme text, g
  # record the bracket just stored
  {
    var tracing?/eax: boolean <- should-trace? trace
    compare tracing?, 0/false
    break-if-=
    var summary-storage: (stream byte 0x40)
    var summary/esi: (addr stream byte) <- address summary-storage
    write summary, "=> "
    rewind-stream text
    write-stream summary, text
    trace trace, "tokenize", summary
  }
}
# Consume graphemes from 'in' up to, but not including, the next newline.
# Stops early if 'in' runs out.
fn skip-rest-of-line in: (addr gap-buffer) {
  $skip-rest-of-line:loop: {
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    var upcoming/eax: grapheme <- peek-from-gap-buffer in
    compare upcoming, 0xa/newline
    break-if-=
    upcoming <- read-from-gap-buffer in  # consume
    loop
  }
}
2021-06-19 04:41:17 +00:00
# Turn '_out' into an indent token: set its type to 3/indent and its
# number-data to the number of consecutive spaces consumed from the front of
# 'in'.
fn next-indent-token in: (addr gap-buffer), _out: (addr token), trace: (addr trace) {
  trace-text trace, "tokenize", "indent"
  trace-lower trace
  var token/edi: (addr token) <- copy _out
  var token-type/eax: (addr int) <- get token, type
  copy-to *token-type, 3/indent
  var width/edi: (addr int) <- get token, number-data
  copy-to *width, 0
  {
    var exhausted?/eax: boolean <- gap-buffer-scan-done? in
    compare exhausted?, 0/false
    break-if-!=
    var upcoming/eax: grapheme <- peek-from-gap-buffer in
    {
      # the inner block restores eax (upcoming) before the branch below
      {
        var tracing?/eax: boolean <- should-trace? trace
        compare tracing?, 0/false
      }
      break-if-=
      var msg-storage: (stream byte 0x40)
      var msg/esi: (addr stream byte) <- address msg-storage
      write msg, "next: "
      var code/eax: int <- copy upcoming
      write-int32-hex msg, code
      trace trace, "tokenize", msg
    }
    # anything but a space ends the indent
    compare upcoming, 0x20/space
    break-if-!=
    upcoming <- read-from-gap-buffer in
    increment *width
    loop
  }
  trace-higher trace
  # record the width just measured
  {
    var tracing?/eax: boolean <- should-trace? trace
    compare tracing?, 0/false
    break-if-=
    var summary-storage: (stream byte 0x40)
    var summary/esi: (addr stream byte) <- address summary-storage
    write summary, "=> indent "
    write-int32-hex summary, *width
    trace trace, "tokenize", summary
  }
}
2021-06-23 04:43:44 +00:00
# Mu carves up the space of graphemes into 4 categories:
# whitespace
# quotes and unquotes (from a Lisp perspective; doesn't include double
# quotes or other Unicode quotes)
# operators
# symbols
# (Numbers have their own parsing rules that don't fit cleanly in this
# partition.)
#
# During tokenization operators and symbols are treated identically.
# A later phase digs into that nuance.
2021-03-08 03:46:21 +00:00
# Can 'g' be part of a symbol?
# True for every grapheme that isn't whitespace, a quote/unquote character,
# a bracket, '#' or '"'.
fn symbol-grapheme? g: grapheme -> _/eax: boolean {
  {
    var space?/eax: boolean <- whitespace-grapheme? g
    compare space?, 0/false
    break-if-=
    return 0/false
  }
  {
    var quoting?/eax: boolean <- quote-or-unquote-grapheme? g
    compare quoting?, 0/false
    break-if-=
    return 0/false
  }
  {
    var bracketing?/eax: boolean <- bracket-grapheme? g
    compare bracketing?, 0/false
    break-if-=
    return 0/false
  }
  {
    compare g, 0x23/hash  # comments get filtered out
    break-if-!=
    return 0/false
  }
  {
    compare g, 0x22/double-quote  # double quotes reserved for now
    break-if-!=
    return 0/false
  }
  return 1/true
}
# Is 'g' a tab, newline or space?
fn whitespace-grapheme? g: grapheme -> _/eax: boolean {
  {
    compare g, 9/tab
    break-if-!=
    return 1/true
  }
  {
    compare g, 0xa/newline
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x20/space
    break-if-!=
    return 1/true
  }
  return 0/false
}
# Is 'g' one of the quoting characters: single quote, backquote, comma or
# at-sign?
fn quote-or-unquote-grapheme? g: grapheme -> _/eax: boolean {
  {
    compare g, 0x27/single-quote
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x60/backquote
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x2c/comma
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x40/at-sign
    break-if-!=
    return 1/true
  }
  return 0/false
}
2021-03-08 03:46:21 +00:00
# Is 'g' an opening or closing paren, square bracket or curly bracket?
fn bracket-grapheme? g: grapheme -> _/eax: boolean {
  {
    compare g, 0x28/open-paren
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x29/close-paren
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x5b/open-square-bracket
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x5d/close-square-bracket
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x7b/open-curly-bracket
    break-if-!=
    return 1/true
  }
  {
    compare g, 0x7d/close-curly-bracket
    break-if-!=
    return 1/true
  }
  return 0/false
}
2021-02-27 15:21:29 +00:00
2021-06-19 03:08:19 +00:00
# Does the text of '_self' consist of an optional leading '-' followed only
# by decimal digits?
# Rewinds the token's text-data stream and reads through it.
fn number-token? _self: (addr token) -> _/eax: boolean {
  var token/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var _text/eax: (addr stream byte) <- lookup *text-ah
  var text/ecx: (addr stream byte) <- copy _text
  rewind-stream text
  var curr/eax: grapheme <- read-grapheme text
  # step past a leading minus sign
  {
    compare curr, 0x2d/minus
    break-if-!=
    curr <- read-grapheme text
  }
  $number-token?:loop: {
    # any non-digit disqualifies the token
    {
      var digit?/eax: boolean <- decimal-digit? curr
      compare digit?, 0/false
      break-if-!=
      return 0/false
    }
    # stop once the text is used up
    {
      var drained?/eax: boolean <- stream-empty? text
      compare drained?, 0/false
    }
    break-if-!=
    curr <- read-grapheme text
    loop
  }
  return 1/true
}
2021-03-02 07:25:22 +00:00
2021-06-19 03:08:19 +00:00
# Return true if a non-stream token starts with a bracket grapheme.
# Only the first grapheme of text-data is inspected.
fn bracket-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  {
    # streams are never paren tokens, whatever their contents
    var type-addr/eax: (addr int) <- get self, type
    compare *type-addr, 1/stream
    break-if-!=
    return 0/false
  }
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  var bracket?/eax: boolean <- bracket-grapheme? first
  return bracket?
}
2021-06-19 03:08:19 +00:00
# Return the result of comparing the token's (rewound) text-data against
# a lone single quote.
fn quote-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var match?/eax: boolean <- stream-data-equal? text, "'"
  return match?
}
2021-06-19 03:08:19 +00:00
# Return the result of comparing the token's (rewound) text-data against
# a lone backquote.
fn backquote-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var match?/eax: boolean <- stream-data-equal? text, "`"
  return match?
}
2021-06-19 03:08:19 +00:00
# Return the result of comparing the token's (rewound) text-data against
# a lone comma.
fn unquote-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var match?/eax: boolean <- stream-data-equal? text, ","
  return match?
}
2021-06-19 03:08:19 +00:00
# Return the result of comparing the token's (rewound) text-data against
# the two-character sequence ",@".
fn unquote-splice-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  var match?/eax: boolean <- stream-data-equal? text, ",@"
  return match?
}
2021-06-19 03:08:19 +00:00
# Return true if the token is exactly "(".
# Stream tokens are rejected up front, mirroring bracket-token?: the contents
# of a stream are data, so a stream holding "(" must not be mistaken for an
# open paren.
# Rewinds the token's text-data and reads its first grapheme.
fn open-paren-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  {
    var in-type/eax: (addr int) <- get self, type
    compare *in-type, 1/stream
    break-if-!=
    # streams are never paren tokens
    return 0/false
  }
  var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
  var _in-data/eax: (addr stream byte) <- lookup *in-data-ah
  # keep the stream in ecx; eax is about to hold the grapheme
  var in-data/ecx: (addr stream byte) <- copy _in-data
  rewind-stream in-data
  var g/eax: grapheme <- read-grapheme in-data
  compare g, 0x28/open-paren
  {
    break-if-!=
    # the paren must be the whole token, so nothing may follow it
    var result/eax: boolean <- stream-empty? in-data
    return result
  }
  return 0/false
}
2021-06-19 03:08:19 +00:00
# Return true if the token is exactly ")".
# Stream tokens are rejected up front, mirroring bracket-token?: the contents
# of a stream are data, so a stream holding ")" must not be mistaken for a
# close paren.
# Rewinds the token's text-data and reads its first grapheme.
fn close-paren-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  {
    var in-type/eax: (addr int) <- get self, type
    compare *in-type, 1/stream
    break-if-!=
    # streams are never paren tokens
    return 0/false
  }
  var in-data-ah/eax: (addr handle stream byte) <- get self, text-data
  var _in-data/eax: (addr stream byte) <- lookup *in-data-ah
  # keep the stream in ecx; eax is about to hold the grapheme
  var in-data/ecx: (addr stream byte) <- copy _in-data
  rewind-stream in-data
  var g/eax: grapheme <- read-grapheme in-data
  compare g, 0x29/close-paren
  {
    break-if-!=
    # the paren must be the whole token, so nothing may follow it
    var result/eax: boolean <- stream-empty? in-data
    return result
  }
  return 0/false
}
2021-04-15 15:48:53 +00:00
2021-06-19 03:08:19 +00:00
# Return true if the token's text-data starts with '.' and stream-empty?
# reports nothing left after it.
# Rewinds the token's text-data and reads its first grapheme.
fn dot-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get self, text-data
  var _text/eax: (addr stream byte) <- lookup *text-ah
  # keep the stream in ecx; eax is about to hold the grapheme
  var text/ecx: (addr stream byte) <- copy _text
  rewind-stream text
  var first/eax: grapheme <- read-grapheme text
  {
    compare first, 0x2e/dot
    break-if-!=
    # a lone dot: the answer is whether anything follows it
    var nothing-left?/eax: boolean <- stream-empty? text
    return nothing-left?
  }
  return 0/false
}
# A token initialized with "." should be recognized by dot-token?.
fn test-dot-token {
  var dot-storage: (handle token)
  var dot-ah/eax: (addr handle token) <- address dot-storage
  allocate-token dot-ah
  var dot/eax: (addr token) <- lookup *dot-ah
  initialize-token dot, "."
  var dot?/eax: boolean <- dot-token? dot
  check dot?, "F - test-dot-token"
}
2021-06-19 03:08:19 +00:00
# Return true if the token's type field is 1 (stream).
fn stream-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  var type-addr/eax: (addr int) <- get self, type
  {
    compare *type-addr, 1/stream
    break-if-!=
    return 1/true
  }
  return 0/false
}
2021-04-30 06:48:36 +00:00
2021-06-19 03:08:19 +00:00
# Return true if the token's type field is 2 (skip).
fn skip-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  var type-addr/eax: (addr int) <- get self, type
  {
    compare *type-addr, 2/skip
    break-if-!=
    return 1/true
  }
  return 0/false
}
2021-06-19 04:41:17 +00:00
# Return true if the token's type field is 3 (indent).
fn indent-token? _self: (addr token) -> _/eax: boolean {
  var self/eax: (addr token) <- copy _self
  var type-addr/eax: (addr int) <- get self, type
  {
    compare *type-addr, 3/indent
    break-if-!=
    return 1/true
  }
  return 0/false
}
2021-06-19 03:08:19 +00:00
# Allocate a fresh token behind the given handle, then populate its
# text-data stream with size 0x40.
fn allocate-token _self-ah: (addr handle token) {
  var token-ah/eax: (addr handle token) <- copy _self-ah
  allocate token-ah
  var token/eax: (addr token) <- lookup *token-ah
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  populate-stream text-ah, 0x40/max-symbol-size
}
# Populate the token's text-data stream (size 0x40, the same value
# allocate-token labels max-symbol-size) and write 'val' into it.
# The token's type and number-data fields are not touched.
fn initialize-token _self: (addr token), val: (addr array byte) {
  var token/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  populate-stream text-ah, 0x40
  var text/eax: (addr stream byte) <- lookup *text-ah
  write text, val
}
# Mark the token as a skip token by setting its type field to 2.
# No other field is modified.
fn initialize-skip-token _self: (addr token) {
  var token/eax: (addr token) <- copy _self
  var type-addr/eax: (addr int) <- get token, type
  copy-to *type-addr, 2/skip
}
# Rewind the token's text-data and pass it to write-stream with 'out' as
# the destination.
fn write-token-text-data out: (addr stream byte), _self: (addr token) {
  var token/eax: (addr token) <- copy _self
  var text-ah/eax: (addr handle stream byte) <- get token, text-data
  var text/eax: (addr stream byte) <- lookup *text-ah
  rewind-stream text
  write-stream out, text
}
# Compare two tokens:
#   - tokens of different types are never equal
#   - skip tokens (type 2) carry no other data, so equal types suffice
#   - indent tokens (type 3) are equal when their number-data match
#   - anything else is compared by text-data using streams-data-equal?
fn tokens-equal? _a: (addr token), _b: (addr token) -> _/eax: boolean {
  var a/edx: (addr token) <- copy _a
  var b/ebx: (addr token) <- copy _b
  var lhs-type-addr/eax: (addr int) <- get a, type
  var lhs-type/eax: int <- copy *lhs-type-addr
  var rhs-type-addr/ecx: (addr int) <- get b, type
  {
    compare lhs-type, *rhs-type-addr
    break-if-=
    return 0/false
  }
  # from here on both tokens share the same type
  {
    compare lhs-type, 2/skip
    break-if-!=
    return 1/true
  }
  {
    compare lhs-type, 3/indent
    break-if-!=
    # indent tokens are distinguished only by their number-data
    var lhs-num-addr/eax: (addr int) <- get a, number-data
    var lhs-num/eax: int <- copy *lhs-num-addr
    var rhs-num-addr/ecx: (addr int) <- get b, number-data
    {
      compare lhs-num, *rhs-num-addr
      break-if-!=
      return 1/true
    }
    return 0/false
  }
  # remaining types: compare text-data
  # fetch b's stream first and park it in ebx, since each lookup lands in eax
  var rhs-text-ah/eax: (addr handle stream byte) <- get b, text-data
  var _rhs-text/eax: (addr stream byte) <- lookup *rhs-text-ah
  var rhs-text/ebx: (addr stream byte) <- copy _rhs-text
  var lhs-text-ah/eax: (addr handle stream byte) <- get a, text-data
  var lhs-text/eax: (addr stream byte) <- lookup *lhs-text-ah
  var same-text?/eax: boolean <- streams-data-equal? lhs-text, rhs-text
  return same-text?
}
# Debugging aid: draw a token's fields on screen 0 starting at the cursor, as
#   <type> <text-data> <number-data>
# followed by the string "\n". Rewinds the token's text-data before drawing it.
fn dump-token-from-cursor _t: (addr token) {
  var t/esi: (addr token) <- copy _t
  # type
  var type-addr/eax: (addr int) <- get t, type
  draw-int32-decimal-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, *type-addr, 7/fg 0/bg
  draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 7/fg 0/bg
  # text-data
  var contents-ah/eax: (addr handle stream byte) <- get t, text-data
  var contents/eax: (addr stream byte) <- lookup *contents-ah
  rewind-stream contents
  draw-stream-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, contents, 7/fg 0/bg
  draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, " ", 7/fg 0/bg
  # number-data
  var number-addr/eax: (addr int) <- get t, number-data
  draw-int32-decimal-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, *number-addr, 7/fg 0/bg
  draw-text-wrapping-right-then-down-from-cursor-over-full-screen 0/screen, "\n", 7/fg 0/bg
}