mu/014literal_string.cc

226 lines
5.8 KiB
C++
Raw Normal View History

2016-03-14 05:03:52 +00:00
//: For convenience, some instructions will take literal arrays of characters
//: (text or strings).
2015-03-31 06:15:03 +00:00
//:
//: Instead of quotes, we'll use [] to delimit strings. That'll reduce the
2015-03-31 17:17:19 +00:00
//: need for escaping since we can support nested brackets. And we can also
2016-10-22 23:56:07 +00:00
//: imagine that 'recipe' might one day itself be defined in Mu, doing its own
2015-03-31 17:17:19 +00:00
//: parsing.
2015-03-31 06:15:03 +00:00
:(scenarios load)
2015-04-24 17:19:03 +00:00
:(scenario string_literal)
def main [
2016-08-31 16:53:11 +00:00
1:address:array:character <- copy [abc def]
2015-03-31 04:22:29 +00:00
]
+parse: ingredient: {"abc def": "literal-string"}
2015-03-31 04:22:29 +00:00
2015-04-24 17:19:03 +00:00
:(scenario string_literal_with_colons)
def main [
2015-04-29 05:42:54 +00:00
1:address:array:character <- copy [abc:def/ghi]
]
+parse: ingredient: {"abc:def/ghi": "literal-string"}
2015-03-31 04:22:29 +00:00
:(before "End Mu Types Initialization")
// Map the type name "literal-string" to 0 in Type_ordinal.
// NOTE(review): presumably 0 is the ordinal shared by all literal types --
// confirm against the other Type_ordinal entries.
put(Type_ordinal, "literal-string", 0);
2015-03-31 04:22:29 +00:00
2015-10-27 18:31:05 +00:00
:(before "End next_word Special-cases")
2015-11-17 06:39:14 +00:00
// A word starting with '[' is a literal string: read the whole bracketed
// text as one word, then skip what follows it on the same line.
if (in.peek() == '[') {
  string literal = slurp_quoted(in);
  skip_whitespace_and_comments_but_not_newline(in);
  return literal;
}
2015-03-31 04:22:29 +00:00
:(code)
string slurp_quoted(istream& in) {
ostringstream out;
assert(has_data(in)); assert(in.peek() == '['); out << static_cast<char>(in.get()); // slurp the '['
2015-09-15 04:38:58 +00:00
if (is_code_string(in, out))
slurp_quoted_comment_aware(in, out);
else
slurp_quoted_comment_oblivious(in, out);
return out.str();
}
2016-08-14 04:21:08 +00:00
// A string is a code string (ignores comments when scanning for matching
// brackets) if it contains a newline at the start before any non-whitespace.
2015-10-27 21:49:36 +00:00
bool is_code_string(istream& in, ostream& out) {
while (has_data(in)) {
char c = in.get();
if (!isspace(c)) {
in.putback(c);
return false;
}
out << c;
if (c == '\n') {
return true;
}
}
return false;
}
// Read a regular string. Regular strings can only contain other regular
// strings.
2015-10-27 21:49:36 +00:00
void slurp_quoted_comment_oblivious(istream& in, ostream& out) {
int brace_depth = 1;
while (has_data(in)) {
2015-03-31 04:22:29 +00:00
char c = in.get();
if (c == '\\') {
slurp_one_past_backslashes(in, out);
continue;
}
2015-03-31 04:22:29 +00:00
out << c;
2015-05-28 20:31:20 +00:00
if (c == '[') ++brace_depth;
if (c == ']') --brace_depth;
if (brace_depth == 0) break;
2015-03-31 04:22:29 +00:00
}
if (!has_data(in) && brace_depth > 0) {
2016-02-26 21:04:55 +00:00
raise << "unbalanced '['\n" << end();
out.clear();
}
}
// Read a code string. Code strings can contain either code or regular strings.
2015-10-27 21:49:36 +00:00
void slurp_quoted_comment_aware(istream& in, ostream& out) {
char c;
while (in >> c) {
if (c == '\\') {
slurp_one_past_backslashes(in, out);
continue;
}
if (c == '#') {
out << c;
while (has_data(in) && in.peek() != '\n') out << static_cast<char>(in.get());
continue;
}
if (c == '[') {
in.putback(c);
// recurse
out << slurp_quoted(in);
continue;
}
out << c;
if (c == ']') return;
2015-05-30 19:34:40 +00:00
}
2016-02-26 21:04:55 +00:00
raise << "unbalanced '['\n" << end();
out.clear();
2015-03-31 04:22:29 +00:00
}
2015-03-31 06:15:03 +00:00
2015-07-28 23:38:37 +00:00
:(after "Parsing reagent(string s)")
2016-09-12 01:07:29 +00:00
// A reagent that starts with '[' is a literal string: its name is the text
// between the outermost brackets and its type is "literal-string".
if (starts_with(s, "[")) {
  // unbalanced bracket; handled elsewhere
  if (*s.rbegin() != ']') return;
  // delete [] delimiters (s is at least "[]" here, so the two trims are
  // independent)
  name = s;
  strip_last(name);
  name.erase(0, 1);
  type = new type_tree("literal-string", 0);
  return;
}
//: Unlike other reagents, escape newlines in literal strings to make them
//: more friendly to trace().
2015-06-14 18:30:32 +00:00
:(after "string to_string(const reagent& r)")
2016-09-17 06:52:15 +00:00
// Literal strings get their own rendering, with newlines escaped.
if (is_literal_text(r)) {
  return emit_literal_string(r.name);
}
2015-06-14 06:17:13 +00:00
:(code)
2016-09-17 06:52:15 +00:00
// Does reagent 'x' have a type, and is that type named "literal-string"?
bool is_literal_text(const reagent& x) {
  if (!x.type) return false;
  return x.type->name == "literal-string";
}
2015-06-14 06:17:13 +00:00
// Render the text of a literal string as {"<text>": "literal-string"}, with
// every newline in the text replaced by the two characters '\' and 'n'.
// 'name' is taken by value so the escaping happens on a private copy.
string emit_literal_string(string name) {
  // replace() escapes one newline per call and returns npos once none remain
  size_t cursor = replace(name, "\n", "\\n", 0);
  while (cursor != string::npos)
    cursor = replace(name, "\n", "\\n", cursor);
  string result = "{\"";
  result += name;
  result += "\": \"literal-string\"}";
  return result;
}
// Replace the first occurrence of 'from' in 'str' at or after index 'n' with
// 'to'. Returns the index where the occurrence was found, or string::npos
// (leaving 'str' untouched) if there was none.
size_t replace(string& str, const string& from, const string& to, size_t n) {
  const size_t match = str.find(from, n);
  if (match == string::npos) return string::npos;
  str.replace(match, from.length(), to);
  return match;
}
// Drop the final character of 's'. An empty string is left alone.
void strip_last(string& s) {
  if (s.empty()) return;
  s.erase(SIZE(s)-1);
}
// Called after a backslash has been read (and dropped) by the caller: copy
// any following run of backslashes from 'in' to 'out', plus the one
// character after the run. Stops early if 'in' runs out of data.
//
// When you encounter a backslash, strip it out and pass through any
// following run of backslashes. If we 'escaped' a single following
// character, then the character '\' would be:
//   '\\' escaped once
//   '\\\\' escaped twice
//   '\\\\\\\\' escaped thrice (8 backslashes)
// ..and so on. With our approach it'll be:
//   '\\' escaped once
//   '\\\' escaped twice
//   '\\\\' escaped thrice
// This only works as long as backslashes aren't also overloaded to create
// special characters. So Mu doesn't follow C's approach of overloading
// backslashes both to escape quote characters and also as a notation for
// unprintable characters like '\n'.
void slurp_one_past_backslashes(istream& in, ostream& out) {
  char last = '\\';  // stands in for the backslash the caller consumed
  while (last == '\\' && has_data(in)) {
    last = in.get();
    out << last;
  }
}
2015-04-24 17:19:03 +00:00
:(scenario string_literal_nested)
def main [
2015-04-29 05:42:54 +00:00
1:address:array:character <- copy [abc [def]]
2015-03-31 17:17:19 +00:00
]
+parse: ingredient: {"abc [def]": "literal-string"}
:(scenario string_literal_escaped)
def main [
1:address:array:character <- copy [abc \[def]
]
+parse: ingredient: {"abc [def": "literal-string"}
:(scenario string_literal_escaped_twice)
def main [
1:address:array:character <- copy [
abc \\[def]
]
+parse: ingredient: {"\nabc \[def": "literal-string"}
:(scenario string_literal_and_comment)
def main [
1:address:array:character <- copy [abc] # comment
]
+parse: --- defining main
+parse: instruction: copy
2015-11-01 04:56:17 +00:00
+parse: number of ingredients: 1
+parse: ingredient: {"abc": "literal-string"}
+parse: product: {1: ("address" "array" "character")}
2015-06-14 17:07:00 +00:00
:(scenario string_literal_escapes_newlines_in_trace)
def main [
2015-06-14 17:07:00 +00:00
copy [abc
def]
]
+parse: ingredient: {"abc\ndef": "literal-string"}
:(scenario string_literal_can_skip_past_comments)
def main [
copy [
# ']' inside comment
bar
]
]
+parse: ingredient: {"\n # ']' inside comment\n bar\n ": "literal-string"}
:(scenario string_literal_empty)
def main [
copy []
]
+parse: ingredient: {"": "literal-string"}
:(scenario multiple_unfinished_recipes)
% Hide_errors = true;
2016-09-15 20:23:00 +00:00
def f1 [
def f2 [
+error: unbalanced '['