mu/subx/038---literal_strings.cc

//: Allow instructions to mention literals directly.
//:
//: This layer will transparently move them to the global segment (assumed to
//: always be the second segment).

:(scenario transform_literal_string)
== code
b8/copy  "test"/imm32
== data  # need to manually create this for now
+transform: -- move literal strings to data segment
+transform: adding global variable '__subx_global_1' containing "test"
+transform: instruction after transform: 'b8 __subx_global_1'

//: We don't rely on any transforms running in previous layers, but this layer
//: knows about labels and global variables and will emit them for previous
//: layers to transform.
:(after "Begin Transforms")
// Begin Level-3 Transforms
Transform.push_back(transform_literal_strings);
// End Level-3 Transforms

:(before "End Globals")
int Next_auto_global = 1;
:(code)
void transform_literal_strings(program& p) {
  trace(99, "transform") << "-- move literal strings to data segment" << end();
  if (p.segments.empty()) return;
  segment& code = p.segments.at(0);
  segment data;
  for (int i = 0;  i < SIZE(code.lines);  ++i) {
    line& inst = code.lines.at(i);
    for (int j = 0;  j < SIZE(inst.words);  ++j) {
      word& curr = inst.words.at(j);
      if (curr.data.at(0) != '"') continue;
      ostringstream global_name;
      global_name << "__subx_global_" << Next_auto_global;
      ++Next_auto_global;
      add_global_to_data_segment(global_name.str(), curr, data);
      curr.data = global_name.str();
    }
    trace(99, "transform") << "instruction after transform: '" << data_to_string(inst) << "'" << end();
  }
  if (data.lines.empty()) return;
  if (SIZE(p.segments) < 2) {
    p.segments.resize(2);
    p.segments.at(1).lines.swap(data.lines);
  }
  vector<line>& existing_data = p.segments.at(1).lines;
  existing_data.insert(existing_data.end(), data.lines.begin(), data.lines.end());
}

void add_global_to_data_segment(const string& name, const word& value, segment& data) {
  trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
  // emit label
  data.lines.push_back(label(name));
  // emit size for size-prefixed array
  data.lines.push_back(line());
  emit_hex_bytes(data.lines.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
  // emit data byte by byte
  data.lines.push_back(line());
  line& curr = data.lines.back();
  for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
    char c = value.data.at(i);
    curr.words.push_back(word());
    curr.words.back().data = hex_byte_to_string(c);
    curr.words.back().metadata.push_back(string(1, c));
  }
}

//: Within strings, whitespace is significant. So we need to redo our instruction
//: parsing.

:(scenarios parse_instruction_character_by_character)
:(scenario instruction_with_string_literal)
a "abc  def" z  # two spaces inside string
+parse2: word: a
+parse2: word: "abc  def"
+parse2: word: z
# no other words
$parse2: 3

:(before "End Line Parsing Special-cases(line_data -> l)")
if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
  parse_instruction_character_by_character(line_data, l);
  continue;
}

:(code)
void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
  if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
    raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
    return;
  }
  // parse literals
  istringstream in(line_data);
  in >> std::noskipws;
  line result;
  // add tokens (words or strings) one by one
  while (has_data(in)) {
    skip_whitespace(in);
    if (!has_data(in)) break;
    char c = in.get();
    if (c == '#') break;  // comment; drop rest of line
    if (c == ':') break;  // line metadata; skip for now
    if (c == '.') {
      if (!has_data(in)) break;  // comment token at end of line
      if (isspace(in.peek()))
        continue;  // '.' followed by space is comment token; skip
    }
    result.words.push_back(word());
    if (c == '"') {
      // slurp word data
      ostringstream d;
      d << c;
      while (has_data(in)) {
        in >> c;
        d << c;
        if (c == '"') break;
      }
      result.words.back().data = d.str();
      // slurp metadata
      ostringstream m;
      while (!isspace(in.peek()) && has_data(in)) {
        in >> c;
        if (c == '/') {
          if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
          m.str("");
        }
        else {
          m << c;
        }
      }
      if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
    }
    else {
      // slurp all characters until whitespace
      ostringstream w;
      w << c;
      while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
        in >> c;
        w << c;
      }
      parse_word(w.str(), result.words.back());
    }
    trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
  }
  if (!result.words.empty())
    out.push_back(result);
}

void skip_whitespace(istream& in) {
  while (true) {
    if (has_data(in) && isspace(in.peek())) in.get();
    else break;
  }
}

void skip_comment(istream& in) {
  if (has_data(in) && in.peek() == '#') {
    in.get();
    while (has_data(in) && in.peek() != '\n') in.get();
  }
}

// helper for tests
void parse_instruction_character_by_character(const string& line_data) {
  vector<line> out;
  parse_instruction_character_by_character(line_data, out);
}

:(scenario parse2_comment_token_in_middle)
a . z
+parse2: word: a
+parse2: word: z
-parse2: word: .
# no other words
$parse2: 2

:(scenario parse2_word_starting_with_dot)
a .b c
+parse2: word: a
+parse2: word: .b
+parse2: word: c

:(scenario parse2_comment_token_at_start)
. a b
+parse2: word: a
+parse2: word: b
-parse2: word: .

:(scenario parse2_comment_token_at_end)
a b .
+parse2: word: a
+parse2: word: b
-parse2: word: .

:(scenario parse2_word_starting_with_dot_at_start)
.a b c
+parse2: word: .a
+parse2: word: b
+parse2: word: c

:(scenario parse2_metadata)
.a b/c d
+parse2: word: .a
+parse2: word: b /c
+parse2: word: d

:(scenario parse2_string_with_metadata)
a "bc  def"/disp32 g
+parse2: word: a
+parse2: word: "bc  def" /disp32
+parse2: word: g

:(scenario parse2_string_with_metadata_at_end)
a "bc  def"/disp32
+parse2: word: a
+parse2: word: "bc  def" /disp32

:(code)
void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
  parse_instruction_character_by_character(
      "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
  );
  CHECK_TRACE_CONTENTS(
      "parse2: word: 68 /push"
      "parse2: word: \"test\" /f"
  );
}

//: Make sure slashes inside strings don't trigger adding stuff from inside the
//: string to metadata.
:(scenario parse2_string_containing_slashes)
a "bc/def"/disp32
+parse2: word: "bc/def" /disp32