mu/013literal_string.cc

//: For convenience, some instructions will take literal arrays of characters (strings).
//:
//: Instead of quotes, we'll use [] to delimit strings. That'll reduce the
//: need for escaping since we can support nested brackets. And we can also
//: imagine that 'recipe' might one day itself be defined in mu, doing its own
//: parsing.

:(scenarios load)
:(scenario string_literal)
recipe main [
  1:address:array:character <- copy [abc def]  # copy can't really take a string
]
+parse:   ingredient: {name: "abc def", properties: [_: "literal-string"]}

:(scenario string_literal_with_colons)
recipe main [
  1:address:array:character <- copy [abc:def/ghi]
]
+parse:   ingredient: {name: "abc:def/ghi", properties: [_: "literal-string"]}

:(before "End Mu Types Initialization")
// Register the type name "literal-string" under ordinal 0 -- the same value
// the reagent-parsing fragment in this layer pushes onto 'types' when it sees
// a bracketed string.
Type_ordinal["literal-string"] = 0;

:(after "string next_word(istream& in)")
  // A word that opens with '[' is a string literal: hand the whole thing to
  // slurp_quoted, then step over whatever whitespace and comment follow it
  // before returning the literal as the word.
  if (in.peek() == '[') {
    string literal = slurp_quoted(in);
    skip_whitespace(in);
    skip_comment(in);
    return literal;
  }

:(code)
// Read one bracket-delimited string from 'in' and return the text gathered
// for it. 'in' must be positioned on the opening '['.
//
// The '[' is copied into the result first. code_string() then decides which
// reader finishes the job: the comment-aware one when a newline follows the
// '[' before any other non-whitespace, the comment-oblivious one otherwise.
string slurp_quoted(istream& in) {
  assert(!in.eof());
  assert(in.peek() == '[');
  ostringstream buffer;
  const char opener = static_cast<char>(in.get());  // slurp the '['
  buffer << opener;
  const bool looks_like_code = code_string(in, buffer);
  if (!looks_like_code) {
    slurp_quoted_comment_oblivious(in, buffer);
    return buffer.str();
  }
  slurp_quoted_comment_aware(in, buffer);
  return buffer.str();
}

// A string is a code string if it contains a newline before any non-whitespace
// todo: support comments before the newline. But that gets messy.
//
// Consumes leading whitespace from 'in', appending each character to 'out'.
// Returns true as soon as a newline has been consumed. Returns false when a
// non-whitespace character shows up first (it is put back, so it stays
// unread) or when input runs out.
bool code_string(istream& in, ostringstream& out) {
  char c = '\0';
  // Looping on the read itself means we never touch the stream again once it
  // is exhausted. In particular we never putback() at end of input: putback()
  // clears eofbit, which would leave callers that loop on !in.eof() spinning.
  while (in.get(c)) {
    // The <cctype> classifiers are only defined for unsigned char values (and
    // EOF); a byte >= 0x80 is negative wherever plain char is signed.
    if (!isspace(static_cast<unsigned char>(c))) {
      in.putback(c);
      return false;
    }
    out << c;
    if (c == '\n') return true;
  }
  return false;
}

// Read a regular string. Regular strings can only contain other regular
// strings.
//
// Copies characters from 'in' to 'out' up to and including the ']' that brings
// the bracket depth back to zero. The depth starts at 1, so the opening '['
// must already have been consumed. A backslash is dropped and the character
// after it is copied verbatim without being counted as a bracket.
// If input runs out before the brackets balance, "unbalanced '['" is raised
// and everything collected in 'out' is discarded.
void slurp_quoted_comment_oblivious(istream& in, ostringstream& out) {
  int brace_depth = 1;
  char c = '\0';
  // Looping on the read (rather than on !in.eof()) stops on any stream
  // failure and never appends the end-of-input sentinel to 'out'.
  while (brace_depth > 0 && in.get(c)) {
    if (c == '\\') {
      // escaped character: copy it as-is, if there is one
      if (in.get(c)) out << c;
      continue;
    }
    out << c;
    if (c == '[') ++brace_depth;
    if (c == ']') --brace_depth;
  }
  // The loop only ends with brace_depth > 0 when the stream gave out.
  if (brace_depth > 0) {
    raise << "unbalanced '['\n" << end();
    // str("") is what empties the buffer; clear() merely resets error flags.
    out.str("");
    out.clear();
  }
}

// Read a code string. Code strings can contain either code or regular strings.
//
// Copies characters from 'in' to 'out' up to and including the closing ']'.
//   - A backslash is dropped and the character after it copied verbatim.
//   - '#' starts a comment: it is copied through to (not including) the end of
//     the line, so brackets inside it are never interpreted.
//   - '[' starts a nested string, which is read by slurp_quoted and appended
//     whole.
// If input runs out before the closing ']', "unbalanced '['" is raised and
// everything collected in 'out' is discarded.
void slurp_quoted_comment_aware(istream& in, ostringstream& out) {
  char c = '\0';
  // Unformatted reads keep whitespace inside the string no matter how the
  // caller configured skipws on the stream.
  while (in.get(c)) {
    if (c == '\\') {
      // escaped character: copy it as-is, if there is one
      if (in.get(c)) out << c;
      continue;
    }
    if (c == '#') {
      out << c;
      // At end of input peek() doesn't return '\n' and the get() fails, so
      // nothing spurious is appended.
      while (in.peek() != '\n' && in.get(c)) out << c;
      continue;
    }
    if (c == '[') {
      in.putback(c);
      // recurse
      out << slurp_quoted(in);
      continue;
    }
    out << c;
    if (c == ']') return;
  }
  raise << "unbalanced '['\n" << end();
  // str("") is what empties the buffer; clear() merely resets error flags.
  out.str("");
  out.clear();
}

:(after "Parsing reagent(string s)")
// A reagent that starts with '[' is a literal string. Its name becomes the
// text between the outer brackets, its type is ordinal 0, and its single
// property is the name tagged with "literal-string". Nothing else about the
// reagent is parsed.
if (s.at(0) == '[') {
  assert(s[s.size()-1] == ']');
  // delete [] delimiters
  s.erase(s.begin());
  s.erase(SIZE(s)-1);
  types.push_back(0);
  name = s;
  vector<string> literal_string_tag;
  literal_string_tag.push_back("literal-string");
  properties.push_back(pair<string, vector<string> >(name, literal_string_tag));
  return;
}

//: Two tweaks to printing literal strings compared to other reagents:
//:   a) Don't print the string twice in the representation, just put '_' in
//:   the property list.
//:   b) Escape newlines in the string to make it more friendly to trace().

:(after "string reagent::to_string()")
  // Literal strings bypass the regular rendering and use the compact form
  // built by emit_literal_string.
  if (is_literal_string(*this)) {
    return emit_literal_string(name);
  }

:(code)
// True when the first property of 'x' has at least one value and the first of
// those values is "literal-string".
bool is_literal_string(const reagent& x) {
  const vector<string>& first_property_values = x.properties.at(0).second;
  if (first_property_values.empty()) return false;
  return first_property_values.at(0) == "literal-string";
}

// Build the trace representation of a literal string: its contents with every
// newline rewritten as the two characters '\' 'n', followed by a fixed
// property list that shows '_' in place of repeating the name.
string emit_literal_string(string name) {
  // Each call rewrites the next newline at or after 'at' and returns where it
  // was found. The replacement contains no newline, so resuming from that
  // index moves on to the following one.
  for (size_t at = 0;  at != string::npos;  )
    at = replace(name, "\n", "\\n", at);
  string rendered = "{name: \"";
  rendered += name;
  rendered += "\", properties: [_: \"literal-string\"]}";
  return rendered;
}

// Replace the first occurrence of 'from' in 'str' at or after index 'n' with
// 'to', editing 'str' in place.
// Returns the index where 'from' was found, or string::npos (leaving 'str'
// untouched) if there is no such occurrence.
size_t replace(string& str, const string& from, const string& to, size_t n) {
  const size_t hit = str.find(from, n);
  if (hit == string::npos) return string::npos;
  str.replace(hit, from.length(), to);
  return hit;
}

:(scenario string_literal_nested)
recipe main [
  1:address:array:character <- copy [abc [def]]
]
+parse:   ingredient: {name: "abc [def]", properties: [_: "literal-string"]}

:(scenario string_literal_escaped)
recipe main [
  1:address:array:character <- copy [abc \[def]
]
+parse:   ingredient: {name: "abc [def", properties: [_: "literal-string"]}

:(scenario string_literal_escaped_comment_aware)
recipe main [
  1:address:array:character <- copy [
abc \\\[def]
]
+parse:   ingredient: {name: "\nabc \[def", properties: [_: "literal-string"]}

:(scenario string_literal_and_comment)
recipe main [
  1:address:array:character <- copy [abc]  # comment
]
+parse: instruction: copy
+parse:   ingredient: {name: "abc", properties: [_: "literal-string"]}
+parse:   product: {name: "1", properties: ["1": "address":"array":"character"]}
# no other ingredients
$parse: 3

:(scenario string_literal_escapes_newlines_in_trace)
recipe main [
  copy [abc
def]
]
+parse:   ingredient: {name: "abc\ndef", properties: [_: "literal-string"]}

:(scenario string_literal_can_skip_past_comments)
recipe main [
  copy [
    # ']' inside comment
    bar
  ]
]
+parse:   ingredient: {name: "\n    # ']' inside comment\n    bar\n  ", properties: [_: "literal-string"]}

:(scenario string_literal_empty)
recipe main [
  copy []
]
+parse:   ingredient: {name: "", properties: [_: "literal-string"]}
1077 2015-04-17 18:22:59 +00:00			`//: For convenience, some instructions will take literal arrays of characters (strings).`
998 - convenient string initialization 2015-03-31 06:15:03 +00:00			`//:`
			`//: Instead of quotes, we'll use [] to delimit strings. That'll reduce the`
999 2015-03-31 17:17:19 +00:00			`//: need for escaping since we can support nested brackets. And we can also`
			`//: imagine that 'recipe' might one day itself be defined in mu, doing its own`
			`//: parsing.`
998 - convenient string initialization 2015-03-31 06:15:03 +00:00
1155 - three phases of mu: load, transform, run Each phase implicitly calls previous phases. Most C++ scenarios implicitly call one, two or three of the phases. More clear now that 'load' does more than just add recipes. 2015-04-24 07:28:24 +00:00			`:(scenarios load)`
1162 2015-04-24 17:19:03 +00:00			`:(scenario string_literal)`
996 - string literals 2015-03-31 04:22:29 +00:00			`recipe main [`
1216 2015-04-29 05:42:54 +00:00			`1:address:array:character <- copy [abc def] # copy can't really take a string`
996 - string literals 2015-03-31 04:22:29 +00:00			`]`
1557 More concise traces for literal strings. 2015-06-14 06:08:46 +00:00			`+parse: ingredient: {name: "abc def", properties: [_: "literal-string"]}`
996 - string literals 2015-03-31 04:22:29 +00:00
1162 2015-04-24 17:19:03 +00:00			`:(scenario string_literal_with_colons)`
1066 - bugfix: support string literals with colons 2015-04-15 17:27:16 +00:00			`recipe main [`
1216 2015-04-29 05:42:54 +00:00			`1:address:array:character <- copy [abc:def/ghi]`
1066 - bugfix: support string literals with colons 2015-04-15 17:27:16 +00:00			`]`
1557 More concise traces for literal strings. 2015-06-14 06:08:46 +00:00			`+parse: ingredient: {name: "abc:def/ghi", properties: [_: "literal-string"]}`
1066 - bugfix: support string literals with colons 2015-04-15 17:27:16 +00:00
996 - string literals 2015-03-31 04:22:29 +00:00			`:(before "End Mu Types Initialization")`
1702 - experiment: start using 'ordinal' in names It comes up pretty early in the codebase, but hopefully won't come up in the mu level until we get to higher-order recipes. Potentially intimidating name, but such prime real estate with no confusing overloadings in other projects! 2015-07-04 16:40:50 +00:00			`Type_ordinal["literal-string"] = 0;`
996 - string literals 2015-03-31 04:22:29 +00:00
			`:(after "string next_word(istream& in)")`
1217 - string literals weren't handling later comments 2015-04-29 05:45:38 +00:00			`if (in.peek() == '[') {`
			`string result = slurp_quoted(in);`
			`skip_whitespace(in);`
			`skip_comment(in);`
			`return result;`
			`}`
996 - string literals 2015-03-31 04:22:29 +00:00
			`:(code)`
			`string slurp_quoted(istream& in) {`
			`ostringstream out;`
1564 - a better way to support string literals Our new heuristic is: all string literals are the same. If they contain newline before non-whitespace, we scan for comments assuming there might be code inside: foofoofoo [ ... # ']' inside comment ignored ] If they contain non-whitespace first, then we ignore comments assuming it's just a regular string: foofoofoo [abc#def] # valid string literal The big hole in this approach: foofoofoo [ # what about comments here containing ']'? ... # abc ] Currently this reads as a 'code comment' and terminates before the newline or '?' and will probably trigger errors down the line. Temporary workaround: don't start code strings with a comment on the same line as the '['. Eventually we'll tighten up the logic. We're still not using the new heuristic in scenarios, but that's up next. 2015-06-14 23:11:47 +00:00			`assert(!in.eof()); assert(in.peek() == '['); out << static_cast<char>(in.get()); // slurp the '['`
			`if (code_string(in, out))`
			`slurp_quoted_comment_aware(in, out);`
			`else`
			`slurp_quoted_comment_oblivious(in, out);`
			`return out.str();`
			`}`

			`// A string is a code string if it contains a newline before any non-whitespace`
			`// todo: support comments before the newline. But that gets messy.`
			`bool code_string(istream& in, ostringstream& out) {`
			`while (!in.eof()) {`
			`char c = in.get();`
			`if (!isspace(c)) {`
			`in.putback(c);`
			`return false;`
			`}`
			`out << c;`
			`if (c == '\n') {`
			`return true;`
			`}`
			`}`
			`return false;`
			`}`

			`// Read a regular string. Regular strings can only contain other regular`
			`// strings.`
1598 Some tests weren't actually running for the past 5 days. Performed 5 why's. 2015-06-19 20:37:11 +00:00			`void slurp_quoted_comment_oblivious(istream& in, ostringstream& out) {`
1564 - a better way to support string literals Our new heuristic is: all string literals are the same. If they contain newline before non-whitespace, we scan for comments assuming there might be code inside: foofoofoo [ ... # ']' inside comment ignored ] If they contain non-whitespace first, then we ignore comments assuming it's just a regular string: foofoofoo [abc#def] # valid string literal The big hole in this approach: foofoofoo [ # what about comments here containing ']'? ... # abc ] Currently this reads as a 'code comment' and terminates before the newline or '?' and will probably trigger errors down the line. Temporary workaround: don't start code strings with a comment on the same line as the '['. Eventually we'll tighten up the logic. We're still not using the new heuristic in scenarios, but that's up next. 2015-06-14 23:11:47 +00:00			`int brace_depth = 1;`
996 - string literals 2015-03-31 04:22:29 +00:00			`while (!in.eof()) {`
			`char c = in.get();`
1497 - 2 boneheaded bugs in parsing scenarios 2015-05-28 18:28:15 +00:00			`if (c == '\\') {`
1563 2015-06-14 19:57:51 +00:00			`out << static_cast<char>(in.get());`
1497 - 2 boneheaded bugs in parsing scenarios 2015-05-28 18:28:15 +00:00			`continue;`
			`}`
996 - string literals 2015-03-31 04:22:29 +00:00			`out << c;`
1502 2015-05-28 20:31:20 +00:00			`if (c == '[') ++brace_depth;`
			`if (c == ']') --brace_depth;`
			`if (brace_depth == 0) break;`
996 - string literals 2015-03-31 04:22:29 +00:00			`}`
1515 - complain on unbalanced brackets 2015-05-30 19:34:40 +00:00			`if (in.eof() && brace_depth > 0) {`
1844 - explicitly end each trace line More verbose, but it saves trouble when debugging; there's never something you thought should be traced but just never came out the other end. Also got rid of fatal errors entirely. Everything's a warning now, and code after a warning isn't guaranteed to run. 2015-07-25 07:02:20 +00:00			`raise << "unbalanced '['\n" << end();`
1564 - a better way to support string literals Our new heuristic is: all string literals are the same. If they contain newline before non-whitespace, we scan for comments assuming there might be code inside: foofoofoo [ ... # ']' inside comment ignored ] If they contain non-whitespace first, then we ignore comments assuming it's just a regular string: foofoofoo [abc#def] # valid string literal The big hole in this approach: foofoofoo [ # what about comments here containing ']'? ... # abc ] Currently this reads as a 'code comment' and terminates before the newline or '?' and will probably trigger errors down the line. Temporary workaround: don't start code strings with a comment on the same line as the '['. Eventually we'll tighten up the logic. We're still not using the new heuristic in scenarios, but that's up next. 2015-06-14 23:11:47 +00:00			`out.clear();`
			`}`
			`}`

			`// Read a code string. Code strings can contain either code or regular strings.`
1598 Some tests weren't actually running for the past 5 days. Performed 5 why's. 2015-06-19 20:37:11 +00:00			`void slurp_quoted_comment_aware(istream& in, ostringstream& out) {`
1564 - a better way to support string literals Our new heuristic is: all string literals are the same. If they contain newline before non-whitespace, we scan for comments assuming there might be code inside: foofoofoo [ ... # ']' inside comment ignored ] If they contain non-whitespace first, then we ignore comments assuming it's just a regular string: foofoofoo [abc#def] # valid string literal The big hole in this approach: foofoofoo [ # what about comments here containing ']'? ... # abc ] Currently this reads as a 'code comment' and terminates before the newline or '?' and will probably trigger errors down the line. Temporary workaround: don't start code strings with a comment on the same line as the '['. Eventually we'll tighten up the logic. We're still not using the new heuristic in scenarios, but that's up next. 2015-06-14 23:11:47 +00:00			`char c;`
			`while (in >> c) {`
1598 Some tests weren't actually running for the past 5 days. Performed 5 why's. 2015-06-19 20:37:11 +00:00			`if (c == '\\') {`
			`out << static_cast<char>(in.get());`
			`continue;`
			`}`
1564 - a better way to support string literals Our new heuristic is: all string literals are the same. If they contain newline before non-whitespace, we scan for comments assuming there might be code inside: foofoofoo [ ... # ']' inside comment ignored ] If they contain non-whitespace first, then we ignore comments assuming it's just a regular string: foofoofoo [abc#def] # valid string literal The big hole in this approach: foofoofoo [ # what about comments here containing ']'? ... # abc ] Currently this reads as a 'code comment' and terminates before the newline or '?' and will probably trigger errors down the line. Temporary workaround: don't start code strings with a comment on the same line as the '['. Eventually we'll tighten up the logic. We're still not using the new heuristic in scenarios, but that's up next. 2015-06-14 23:11:47 +00:00			`if (c == '#') {`
			`out << c;`
			`while (!in.eof() && in.peek() != '\n') out << static_cast<char>(in.get());`
			`continue;`
			`}`
			`if (c == '[') {`
			`in.putback(c);`
			`// recurse`
			`out << slurp_quoted(in);`
			`continue;`
			`}`
			`out << c;`
1598 Some tests weren't actually running for the past 5 days. Performed 5 why's. 2015-06-19 20:37:11 +00:00			`if (c == ']') return;`
1515 - complain on unbalanced brackets 2015-05-30 19:34:40 +00:00			`}`
1844 - explicitly end each trace line More verbose, but it saves trouble when debugging; there's never something you thought should be traced but just never came out the other end. Also got rid of fatal errors entirely. Everything's a warning now, and code after a warning isn't guaranteed to run. 2015-07-25 07:02:20 +00:00			`raise << "unbalanced '['\n" << end();`
1598 Some tests weren't actually running for the past 5 days. Performed 5 why's. 2015-06-19 20:37:11 +00:00			`out.clear();`
996 - string literals 2015-03-31 04:22:29 +00:00			`}`
998 - convenient string initialization 2015-03-31 06:15:03 +00:00
1876 2015-07-28 23:38:37 +00:00			`:(after "Parsing reagent(string s)")`
			`if (s.at(0) == '[') {`
			`assert(*s.rbegin() == ']');`
			`// delete [] delimiters`
			`s.erase(0, 1);`
			`s.erase(SIZE(s)-1);`
			`name = s;`
			`types.push_back(0);`
			`properties.push_back(pair<string, vector<string> >(name, vector<string>()));`
			`properties.back().second.push_back("literal-string");`
			`return;`
			`}`
1066 - bugfix: support string literals with colons 2015-04-15 17:27:16 +00:00
1562 2015-06-14 18:30:32 +00:00			`//: Two tweaks to printing literal strings compared to other reagents:`
			`//: a) Don't print the string twice in the representation, just put '_' in`
			`//: the property list.`
			`//: b) Escape newlines in the string to make it more friendly to trace().`

1560 2015-06-14 06:17:13 +00:00			`:(after "string reagent::to_string()")`
1913 - save expected response for each sandbox 2015-08-02 05:16:09 +00:00			`if (is_literal_string(*this))`
1560 2015-06-14 06:17:13 +00:00			`return emit_literal_string(name);`

			`:(code)`
1913 - save expected response for each sandbox 2015-08-02 05:16:09 +00:00			`bool is_literal_string(const reagent& x) {`
			`return !x.properties.at(0).second.empty() && x.properties.at(0).second.at(0) == "literal-string";`
			`}`

1560 2015-06-14 06:17:13 +00:00			`string emit_literal_string(string name) {`
			`size_t pos = 0;`
			`while (pos != string::npos)`
			`pos = replace(name, "\n", "\\n", pos);`
			`return "{name: \""+name+"\", properties: [_: \"literal-string\"]}";`
			`}`

			`size_t replace(string& str, const string& from, const string& to, size_t n) {`
			`size_t result = str.find(from, n);`
			`if (result != string::npos)`
			`str.replace(result, from.length(), to);`
			`return result;`
			`}`

1162 2015-04-24 17:19:03 +00:00			`:(scenario string_literal_nested)`
999 2015-03-31 17:17:19 +00:00			`recipe main [`
1216 2015-04-29 05:42:54 +00:00			`1:address:array:character <- copy [abc [def]]`
999 2015-03-31 17:17:19 +00:00			`]`
1557 More concise traces for literal strings. 2015-06-14 06:08:46 +00:00			`+parse: ingredient: {name: "abc [def]", properties: [_: "literal-string"]}`
1217 - string literals weren't handling later comments 2015-04-29 05:45:38 +00:00
1497 - 2 boneheaded bugs in parsing scenarios 2015-05-28 18:28:15 +00:00			`:(scenario string_literal_escaped)`
			`recipe main [`
			`1:address:array:character <- copy [abc \[def]`
			`]`
1557 More concise traces for literal strings. 2015-06-14 06:08:46 +00:00			`+parse: ingredient: {name: "abc [def", properties: [_: "literal-string"]}`
1497 - 2 boneheaded bugs in parsing scenarios 2015-05-28 18:28:15 +00:00
1598 Some tests weren't actually running for the past 5 days. Performed 5 why's. 2015-06-19 20:37:11 +00:00			`:(scenario string_literal_escaped_comment_aware)`
			`recipe main [`
			`1:address:array:character <- copy [`
			`abc \\\[def]`
			`]`
			`+parse: ingredient: {name: "\nabc \[def", properties: [_: "literal-string"]}`

1217 - string literals weren't handling later comments 2015-04-29 05:45:38 +00:00			`:(scenario string_literal_and_comment)`
			`recipe main [`
			`1:address:array:character <- copy [abc] # comment`
			`]`
1223 - more stable traces for parse scenarios 2015-04-30 04:49:09 +00:00			`+parse: instruction: copy`
1557 More concise traces for literal strings. 2015-06-14 06:08:46 +00:00			`+parse: ingredient: {name: "abc", properties: [_: "literal-string"]}`
1414 - traces now robust to new recipes/types 2015-05-21 19:36:59 +00:00			`+parse: product: {name: "1", properties: ["1": "address":"array":"character"]}`
1220 - permit mu comments in tangle scenarios 2015-04-29 18:45:43 +00:00			`# no other ingredients`
1217 - string literals weren't handling later comments 2015-04-29 05:45:38 +00:00			`$parse: 3`
1561 2015-06-14 17:07:00 +00:00
			`:(scenario string_literal_escapes_newlines_in_trace)`
			`recipe main [`
			`copy [abc`
			`def]`
			`]`
			`+parse: ingredient: {name: "abc\ndef", properties: [_: "literal-string"]}`
1564 - a better way to support string literals Our new heuristic is: all string literals are the same. If they contain newline before non-whitespace, we scan for comments assuming there might be code inside: foofoofoo [ ... # ']' inside comment ignored ] If they contain non-whitespace first, then we ignore comments assuming it's just a regular string: foofoofoo [abc#def] # valid string literal The big hole in this approach: foofoofoo [ # what about comments here containing ']'? ... # abc ] Currently this reads as a 'code comment' and terminates before the newline or '?' and will probably trigger errors down the line. Temporary workaround: don't start code strings with a comment on the same line as the '['. Eventually we'll tighten up the logic. We're still not using the new heuristic in scenarios, but that's up next. 2015-06-14 23:11:47 +00:00
			`:(scenario string_literal_can_skip_past_comments)`
			`recipe main [`
			`copy [`
			`# ']' inside comment`
			`bar`
			`]`
			`]`
			`+parse: ingredient: {name: "\n # ']' inside comment\n bar\n ", properties: [_: "literal-string"]}`

			`:(scenario string_literal_empty)`
			`recipe main [`
			`copy []`
			`]`
			`+parse: ingredient: {name: "", properties: [_: "literal-string"]}`