mu/subx/038---literal_strings.cc

//: Allow instructions to mention literals directly.
//:
//: This layer will transparently move them to the global segment (assumed to
//: always be the second segment).

:(scenario transform_literal_string)
== code
b8/copy  "test"/imm32
== data  # need to manually create this for now
+transform: -- move literal strings to data segment
+transform: adding global variable '__subx_global_1' containing "test"
+transform: instruction after transform: 'b8 __subx_global_1'

//: We don't rely on any transforms running in previous layers, but this layer
//: knows about labels and global variables and will emit them for previous
//: layers to transform.
:(after "Begin Transforms")
// Begin Level-3 Transforms
// Register the pass that replaces string literals in instructions with
// auto-generated globals.
Transform.push_back(transform_literal_strings);
// End Level-3 Transforms

:(before "End Globals")
// Numeric suffix for the next auto-generated global ('__subx_global_<n>').
// transform_literal_strings increments it once per string literal it moves.
int Next_auto_global = 1;
:(code)
// Replace every string literal in the code segment with the name of a freshly
// generated global variable, and append that global's definition to the data
// segment.
//
// The code segment is taken to be p.segments.at(0) and the data segment
// p.segments.at(1); the data segment is created if the program has fewer than
// two segments. A program with no segments is left untouched.
void transform_literal_strings(program& p) {
  trace(99, "transform") << "-- move literal strings to data segment" << end();
  if (p.segments.empty()) return;
  segment& code = p.segments.at(0);
  segment data;  // new globals accumulate here before being appended to the program
  for (int i = 0;  i < SIZE(code.lines);  ++i) {
    line& inst = code.lines.at(i);
    for (int j = 0;  j < SIZE(inst.words);  ++j) {
      word& curr = inst.words.at(j);
      // a word without data can't be a string literal; checking first also
      // keeps at(0) below from throwing std::out_of_range
      if (curr.data.empty()) continue;
      if (curr.data.at(0) != '"') continue;
      ostringstream global_name;
      global_name << "__subx_global_" << Next_auto_global;
      ++Next_auto_global;
      add_global_to_data_segment(global_name.str(), curr, data);
      // the instruction now refers to the global by name instead of the literal
      curr.data = global_name.str();
    }
    trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
  }
  if (data.lines.empty()) return;
  // make sure a data segment exists, then append the new globals to it
  if (SIZE(p.segments) < 2)
    p.segments.resize(2);
  vector<line>& existing_data = p.segments.at(1).lines;
  existing_data.insert(existing_data.end(), data.lines.begin(), data.lines.end());
}

void add_global_to_data_segment(const string& name, const word& value, segment& data) {
  trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
  // emit label
  data.lines.push_back(label(name));
  // emit size for size-prefixed array
  data.lines.push_back(line());
  emit_hex_bytes(data.lines.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
  // emit data byte by byte
  data.lines.push_back(line());
  line& curr = data.lines.back();
  for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
    char c = value.data.at(i);
    curr.words.push_back(word());
    curr.words.back().data = hex_byte_to_string(c);
    curr.words.back().metadata.push_back(string(1, c));
  }
}

//: Within strings, whitespace is significant. So we need to redo our instruction
//: parsing.

:(scenarios parse_instruction_character_by_character)
:(scenario instruction_with_string_literal)
a "abc  def" z  # two spaces inside string
+parse2: word: a
+parse2: word: "abc  def"
+parse2: word: z
# no other words
$parse2: 3

:(before "End Line Parsing Special-cases(line_data -> l)")
// A line containing a double quote may hold a string literal, inside which
// whitespace is significant, so hand it to the character-by-character parser.
if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
  parse_instruction_character_by_character(line_data, l);
  continue;  // line handled here; moves on in the enclosing loop (not visible in this layer)
}

:(code)
// Split one line of source into words, treating a double-quoted string literal
// as a single word even when it contains whitespace or slashes.
//
//   line_data: a single line of input. A newline is tolerated only as the
//              very last character; otherwise an error is raised and nothing
//              is appended to 'out'.
//   out:       receives one new line holding the parsed words, unless no
//              words were found.
//
// Parsing stops at a token starting with '#' or ':'. A lone '.' is skipped.
// Every word added is traced under the "parse2" label.
void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
  if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
    raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
    return;
  }
  // parse literals
  istringstream in(line_data);
  in >> std::noskipws;
  line result;
  // add tokens (words or strings) one by one
  while (has_data(in)) {
    skip_whitespace(in);
    if (!has_data(in)) break;
    char c = in.get();
    if (c == '#') break;  // comment; drop rest of line
    if (c == ':') break;  // line metadata; skip for now
    if (c == '.') {
      // Peek before asking has_data(): as noted on the word loop below, it is
      // the peek that triggers eof(), and a line may arrive without a trailing
      // newline. Without this a final '.' would be emitted as a word.
      const int next = in.peek();
      if (!has_data(in)) break;  // comment token at end of line
      if (isspace(next))
        continue;  // '.' followed by space is comment token; skip
    }
    result.words.push_back(word());
    if (c == '"') {
      // slurp word data
      ostringstream d;
      d << c;
      while (has_data(in)) {
        in >> c;
        // An unterminated string runs into the end of input; when the read
        // fails 'c' still holds the previous character, so don't append it again.
        if (!in) break;
        d << c;
        if (c == '"') break;
      }
      result.words.back().data = d.str();
      // slurp metadata
      ostringstream m;
      while (!isspace(in.peek()) && has_data(in)) {
        in >> c;
        if (c == '/') {
          // a slash ends the current metadata item (if any) and starts the next
          if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
          m.str("");
        }
        else {
          m << c;
        }
      }
      if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
    }
    else {
      // slurp all characters until whitespace
      ostringstream w;
      w << c;
      while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
        in >> c;
        w << c;
      }
      parse_word(w.str(), result.words.back());
    }
    trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
  }
  if (!result.words.empty())
    out.push_back(result);
}

// Consume characters from 'in' until the next one is not whitespace or there
// is no more data.
void skip_whitespace(istream& in) {
  while (has_data(in) && isspace(in.peek()))
    in.get();
}

// If the next character in 'in' is '#', consume it and everything after it up
// to, but not including, the next newline.
void skip_comment(istream& in) {
  if (!has_data(in)) return;
  if (in.peek() != '#') return;
  in.get();  // the '#' itself
  while (has_data(in)) {
    if (in.peek() == '\n') break;
    in.get();
  }
}

// helper for tests
void parse_instruction_character_by_character(const string& line_data) {
  vector<line> out;
  parse_instruction_character_by_character(line_data, out);
}

:(scenario parse2_comment_token_in_middle)
a . z
+parse2: word: a
+parse2: word: z
-parse2: word: .
# no other words
$parse2: 2

:(scenario parse2_word_starting_with_dot)
a .b c
+parse2: word: a
+parse2: word: .b
+parse2: word: c

:(scenario parse2_comment_token_at_start)
. a b
+parse2: word: a
+parse2: word: b
-parse2: word: .

:(scenario parse2_comment_token_at_end)
a b .
+parse2: word: a
+parse2: word: b
-parse2: word: .

:(scenario parse2_word_starting_with_dot_at_start)
.a b c
+parse2: word: .a
+parse2: word: b
+parse2: word: c

:(scenario parse2_metadata)
.a b/c d
+parse2: word: .a
+parse2: word: b /c
+parse2: word: d

:(scenario parse2_string_with_metadata)
a "bc  def"/disp32 g
+parse2: word: a
+parse2: word: "bc  def" /disp32
+parse2: word: g

:(scenario parse2_string_with_metadata_at_end)
a "bc  def"/disp32
+parse2: word: a
+parse2: word: "bc  def" /disp32

:(code)
// Check that a string literal with metadata is parsed when it is the last
// thing on a line that has no trailing newline.
void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
  parse_instruction_character_by_character(
      "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
  );
  // NOTE(review): the two expected trace lines below are adjacent string
  // literals with no visible separator, so they concatenate into one string --
  // confirm this matches the delimiter CHECK_TRACE_CONTENTS expects.
  CHECK_TRACE_CONTENTS(
      "parse2: word: 68 /push"
      "parse2: word: \"test\" /f"
  );
}

//: Make sure slashes inside strings don't trigger adding stuff from inside the
//: string to metadata.
:(scenario parse2_string_containing_slashes)
a "bc/def"/disp32
+parse2: word: "bc/def" /disp32
-- support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to put in commandline args to compare with manually. We only
support length-prefixed strings, not null-terminated ones.

											
										
										
											2018-09-23 04:56:00 +00:00
+								//: Allow instructions to mention literals directly.
 								//:
 								//: This layer will transparently move them to the global segment (assumed to
 								//: always be the second segment).
 								:(scenario transform_literal_string)
 								== code
-

											
										
										
											2018-10-06 04:30:22 +00:00
+								b8/copy  "test"/imm32
-- redo simulated RAM

Now simulated 'Memory' isn't just a single flat array. Instead it knows
about segments and VMAs.

The code segment will always be first, and the data/heap segment will always
be second. The brk() syscall knows about the data segment.

One nice side-effect is that I no longer need to mess with Memory initialization
regardless of where I place my segments.

											
										
										
											2018-09-29 06:08:27 +00:00
+								== data  # need to manually create this for now
-- support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to put in commandline args to compare with manually. We only
support length-prefixed strings, not null-terminated ones.

											
										
										
											2018-09-23 04:56:00 +00:00
+								+transform: -- move literal strings to data segment
 								+transform: adding global variable '__subx_global_1' containing "test"
 								+transform: instruction after transform: 'b8 __subx_global_1'
 								//: We don't rely on any transforms running in previous layers, but this layer
 								//: knows about labels and global variables and will emit them for previous
 								//: layers to transform.
 								:(after "Begin Transforms")
 								// Begin Level-3 Transforms
 								Transform.push_back(transform_literal_strings);
 								// End Level-3 Transforms
 								:(before "End Globals")
 								int Next_auto_global = 1;
 								:(code)
 								void transform_literal_strings(program& p) {
 								  trace(99, "transform") << "-- move literal strings to data segment" << end();
 								  if (p.segments.empty()) return;
 								  segment& code = p.segments.at(0);
 								  segment data;
 								  for (int i = 0;  i < SIZE(code.lines);  ++i) {
 								    line& inst = code.lines.at(i);
 								    for (int j = 0;  j < SIZE(inst.words);  ++j) {
 								      word& curr = inst.words.at(j);
 								      if (curr.data.at(0) != '"') continue;
 								      ostringstream global_name;
 								      global_name << "__subx_global_" << Next_auto_global;
 								      ++Next_auto_global;
 								      add_global_to_data_segment(global_name.str(), curr, data);
 								      curr.data = global_name.str();
 								    }
 								    trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
 								  }
 								  if (data.lines.empty()) return;
 								  if (SIZE(p.segments) < 2) {
 								    p.segments.resize(2);
 								    p.segments.at(1).lines.swap(data.lines);
 								  }
 								  vector<line>& existing_data = p.segments.at(1).lines;
 								  existing_data.insert(existing_data.end(), data.lines.begin(), data.lines.end());
 								}
 								void add_global_to_data_segment(const string& name, const word& value, segment& data) {
 								  trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
 								  // emit label
 								  data.lines.push_back(label(name));
 								  // emit size for size-prefixed array
 								  data.lines.push_back(line());
 								  emit_hex_bytes(data.lines.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
 								  // emit data byte by byte
 								  data.lines.push_back(line());
 								  line& curr = data.lines.back();
 								  for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
 								    char c = value.data.at(i);
 								    curr.words.push_back(word());
 								    curr.words.back().data = hex_byte_to_string(c);
 								    curr.words.back().metadata.push_back(string(1, c));
 								  }
 								}
 								//: Within strings, whitespace is significant. So we need to redo our instruction
 								//: parsing.
 								:(scenarios parse_instruction_character_by_character)
 								:(scenario instruction_with_string_literal)
 								a "abc  def" z  # two spaces inside string
 								+parse2: word: a
 								+parse2: word: "abc  def"
 								+parse2: word: z
 								# no other words
 								$parse2: 3
 								:(before "End Line Parsing Special-cases(line_data -> l)")
 								if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
 								  parse_instruction_character_by_character(line_data, l);
 								  continue;
 								}
 								:(code)
 								void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
-

											
										
										
											2018-11-25 03:55:59 +00:00
+								  if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
 								    raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
 								    return;
 								  }
-- support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to put in commandline args to compare with manually. We only
support length-prefixed strings, not null-terminated ones.

											
										
										
											2018-09-23 04:56:00 +00:00
+								  // parse literals
 								  istringstream in(line_data);
 								  in >> std::noskipws;
 								  line result;
 								  // add tokens (words or strings) one by one
 								  while (has_data(in)) {
 								    skip_whitespace(in);
 								    if (!has_data(in)) break;
 								    char c = in.get();
 								    if (c == '#') break;  // comment; drop rest of line
 								    if (c == ':') break;  // line metadata; skip for now
 								    if (c == '.') {
 								      if (!has_data(in)) break;  // comment token at end of line
 								      if (isspace(in.peek()))
 								        continue;  // '.' followed by space is comment token; skip
 								    }
-

											
										
										
											2018-11-25 03:55:59 +00:00
+								    result.words.push_back(word());
-- support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to put in commandline args to compare with manually. We only
support length-prefixed strings, not null-terminated ones.

											
										
										
											2018-09-23 04:56:00 +00:00
+								    if (c == '"') {
-

											
										
										
											2018-11-25 03:55:59 +00:00
+								      // slurp word data
 								      ostringstream d;
 								      d << c;
-- support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to put in commandline args to compare with manually. We only
support length-prefixed strings, not null-terminated ones.

											
										
										
											2018-09-23 04:56:00 +00:00
+								      while (has_data(in)) {
 								        in >> c;
-

											
										
										
											2018-11-25 03:55:59 +00:00
+								        d << c;
-- support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to put in commandline args to compare with manually. We only
support length-prefixed strings, not null-terminated ones.

											
										
										
											2018-09-23 04:56:00 +00:00
+								        if (c == '"') break;
 								      }
-

											
										
										
											2018-11-25 03:55:59 +00:00
+								      result.words.back().data = d.str();
 								      // slurp metadata
 								      ostringstream m;
 								      while (!isspace(in.peek()) && has_data(in)) {
 								        in >> c;
 								        if (c == '/') {
 								          if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
 								          m.str("");
 								        }
 								        else {
 								          m << c;
 								        }
 								      }
 								      if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
-- support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to put in commandline args to compare with manually. We only
support length-prefixed strings, not null-terminated ones.

											
										
										
											2018-09-23 04:56:00 +00:00
+								    }
-

											
										
										
											2018-11-25 03:55:59 +00:00
+								    else {
 								      // slurp all characters until whitespace
 								      ostringstream w;
-- support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to put in commandline args to compare with manually. We only
support length-prefixed strings, not null-terminated ones.

											
										
										
											2018-09-23 04:56:00 +00:00
+								      w << c;
-

											
										
										
											2018-11-25 03:55:59 +00:00
+								      while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
 								        in >> c;
 								        w << c;
 								      }
 								      parse_word(w.str(), result.words.back());
-- support string literals directly in code

Doesn't de-duplicate in the data segment, though. If you use the literal
"foo" a hundred times in your code segment you're gonna spend a hundred
times the space you need to.

We can now simplify our test harness a bit in the factorial app, but we
still have to put in commandline args to compare with manually. We only
support length-prefixed strings, not null-terminated ones.

											
										
										
											2018-09-23 04:56:00 +00:00
+								    }
 								    trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
 								  }
 								  if (!result.words.empty())
 								    out.push_back(result);
 								}
 								void skip_whitespace(istream& in) {
 								  while (true) {
 								    if (has_data(in) && isspace(in.peek())) in.get();
 								    else break;
 								  }
 								}
 								void skip_comment(istream& in) {
 								  if (has_data(in) && in.peek() == '#') {
 								    in.get();
 								    while (has_data(in) && in.peek() != '\n') in.get();
 								  }
 								}
 								// helper for tests
 								void parse_instruction_character_by_character(const string& line_data) {
 								  vector<line> out;
 								  parse_instruction_character_by_character(line_data, out);
 								}
 								:(scenario parse2_comment_token_in_middle)
 								a . z
 								+parse2: word: a
 								+parse2: word: z
 								-parse2: word: .
 								# no other words
 								$parse2: 2
 								:(scenario parse2_word_starting_with_dot)
 								a .b c
 								+parse2: word: a
 								+parse2: word: .b
 								+parse2: word: c
 								:(scenario parse2_comment_token_at_start)
 								. a b
 								+parse2: word: a
 								+parse2: word: b
 								-parse2: word: .
 								:(scenario parse2_comment_token_at_end)
 								a b .
 								+parse2: word: a
 								+parse2: word: b
 								-parse2: word: .
 								:(scenario parse2_word_starting_with_dot_at_start)
 								.a b c
 								+parse2: word: .a
 								+parse2: word: b
 								+parse2: word: c
 								:(scenario parse2_metadata)
 								.a b/c d
 								+parse2: word: .a
 								+parse2: word: b /c
 								+parse2: word: d
 								:(scenario parse2_string_with_metadata)
 								a "bc  def"/disp32 g
 								+parse2: word: a
 								+parse2: word: "bc  def" /disp32
 								+parse2: word: g
 								:(scenario parse2_string_with_metadata_at_end)
 								a "bc  def"/disp32
 								+parse2: word: a
 								+parse2: word: "bc  def" /disp32
 								:(code)
 								void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
 								  parse_instruction_character_by_character(
 								      "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
 								  );
 								  CHECK_TRACE_CONTENTS(
 								      "parse2: word: 68 /push"
 								      "parse2: word: \"test\" /f"
 								  );
 								}
-

											
										
										
											2018-11-25 03:55:59 +00:00
 								//: Make sure slashes inside strings don't trigger adding stuff from inside the
 								//: string to metadata.
 								:(scenario parse2_string_containing_slashes)
 								a "bc/def"/disp32
 								+parse2: word: "bc/def" /disp32