mu/archive/1.vm/014literal_string.cc

275 lines
6.8 KiB
C++
Raw Permalink Normal View History

2016-03-14 05:03:52 +00:00
//: For convenience, some instructions will take literal arrays of characters
//: (text or strings).
//:
//: Instead of quotes, we'll use [] to delimit strings. That'll reduce the
//: need for escaping since we can support nested brackets. And we can also
//: imagine that 'recipe' might one day itself be defined in Mu, doing its own
//: parsing.
5001 - drop the :(scenario) DSL I've been saying for a while[1][2][3] that adding extra abstractions makes things harder for newcomers, and adding new notations doubly so. And then I notice this DSL in my own backyard. Makes me feel like a hypocrite. [1] https://news.ycombinator.com/item?id=13565743#13570092 [2] https://lobste.rs/s/to8wpr/configuration_files_are_canary_warning [3] https://lobste.rs/s/mdmcdi/little_languages_by_jon_bentley_1986#c_3miuf2 The implementation of the DSL was also highly hacky: a) It was happening in the tangle/ tool, but was utterly unrelated to tangling layers. b) There were several persnickety constraints on the different kinds of lines and the specific order they were expected in. I kept finding bugs where the translator would silently do the wrong thing. Or the error messages sucked, and readers may be stuck looking at the generated code to figure out what happened. Fixing error messages would require a lot more code, which is one of my arguments against DSLs in the first place: they may be easy to implement, but they're hard to design to go with the grain of the underlying platform. They require lots of iteration. Is that effort worth prioritizing in this project? On the other hand, the DSL did make at least some readers' life easier, the ones who weren't immediately put off by having to learn a strange syntax. There were fewer quotes to parse, fewer backslash escapes. Anyway, since there are also people who dislike having to put up with strange syntaxes, we'll call that consideration a wash and tear this DSL out. --- This commit was sheer drudgery. Hopefully it won't need to be redone with a new DSL because I grow sick of backslashes.
2019-03-13 01:56:55 +00:00
// A bracketed run of text is parsed as a single ingredient of type
// "literal-string"; the brackets are not part of its name, the space is.
void test_string_literal() {
load(
"def main [\n"
" 1:address:array:character <- copy [abc def]\n"
"]\n"
);
CHECK_TRACE_CONTENTS(
"parse: ingredient: {\"abc def\": \"literal-string\"}\n"
);
}
// ':' and '/' inside a bracketed literal are kept verbatim in the
// ingredient's name.
void test_string_literal_with_colons() {
load(
"def main [\n"
" 1:address:array:character <- copy [abc:def/ghi]\n"
"]\n"
);
CHECK_TRACE_CONTENTS(
"parse: ingredient: {\"abc:def/ghi\": \"literal-string\"}\n"
);
}
2015-03-31 04:22:29 +00:00
:(before "End Mu Types Initialization")
// Register the type name "literal-string" in Type_ordinal with value 0.
// NOTE(review): presumably 0 is the ordinal shared by literal types --
// confirm against the layer that defines Type_ordinal.
put(Type_ordinal, "literal-string", 0);
2015-03-31 04:22:29 +00:00
2015-10-27 18:31:05 +00:00
:(before "End next_word Special-cases")
2015-11-17 06:39:14 +00:00
// A word that opens with '[' is a string literal. Slurp it whole (brackets
// included), then let skip_whitespace_and_comments_but_not_newline() advance
// the stream before handing the literal back.
if (in.peek() == '[') {
  string literal = slurp_quoted(in);
  skip_whitespace_and_comments_but_not_newline(in);
  return literal;
}
2015-03-31 04:22:29 +00:00
:(code)
// Read one bracket-delimited string from 'in', which must be positioned at
// its opening '['. Returns the text accumulated while scanning, starting
// with that '['.
//
// The characters right after the '[' decide which scanner finishes the job;
// see is_code_string().
string slurp_quoted(istream& in) {
  assert(has_data(in));
  assert(in.peek() == '[');
  ostringstream result;
  const char opening_bracket = static_cast<char>(in.get());  // slurp the '['
  result << opening_bracket;
  const bool code_string = is_code_string(in, result);
  if (code_string) {
    slurp_quoted_comment_aware(in, result);
  }
  else {
    slurp_quoted_comment_oblivious(in, result);
  }
  return result.str();
}
2016-08-14 04:21:08 +00:00
// A string is a code string (ignores comments when scanning for matching
// brackets) if it contains a newline at the start before any non-whitespace.
2015-10-27 21:49:36 +00:00
bool is_code_string(istream& in, ostream& out) {
while (has_data(in)) {
char c = in.get();
if (!isspace(c)) {
in.putback(c);
return false;
}
out << c;
if (c == '\n') {
return true;
}
}
return false;
}
// Read a regular string. Regular strings can only contain other regular
// strings.
2015-10-27 21:49:36 +00:00
void slurp_quoted_comment_oblivious(istream& in, ostream& out) {
int brace_depth = 1;
while (has_data(in)) {
2015-03-31 04:22:29 +00:00
char c = in.get();
if (c == '\\') {
slurp_one_past_backslashes(in, out);
continue;
}
2015-03-31 04:22:29 +00:00
out << c;
2015-05-28 20:31:20 +00:00
if (c == '[') ++brace_depth;
if (c == ']') --brace_depth;
if (brace_depth == 0) break;
2015-03-31 04:22:29 +00:00
}
if (!has_data(in) && brace_depth > 0) {
2016-02-26 21:04:55 +00:00
raise << "unbalanced '['\n" << end();
out.clear();
}
}
// Read a code string. Code strings can contain either code or regular strings.
2015-10-27 21:49:36 +00:00
void slurp_quoted_comment_aware(istream& in, ostream& out) {
char c;
while (in >> c) {
if (c == '\\') {
slurp_one_past_backslashes(in, out);
continue;
}
if (c == '#') {
out << c;
while (has_data(in) && in.peek() != '\n') out << static_cast<char>(in.get());
continue;
}
if (c == '[') {
in.putback(c);
// recurse
out << slurp_quoted(in);
continue;
}
out << c;
if (c == ']') return;
2015-05-30 19:34:40 +00:00
}
2016-02-26 21:04:55 +00:00
raise << "unbalanced '['\n" << end();
out.clear();
2015-03-31 04:22:29 +00:00
}
2015-03-31 06:15:03 +00:00
2015-07-28 23:38:37 +00:00
:(after "Parsing reagent(string s)")
2016-09-12 01:07:29 +00:00
// A reagent spelled with a leading '[' is a literal string: its name is the
// text between the outermost brackets and its type is "literal-string".
if (starts_with(s, "[")) {
  // unbalanced bracket; handled elsewhere
  if (*s.rbegin() != ']') return;
  // keep everything after the opening '[' ...
  name = s.substr(1);
  // ... and drop the closing ']'
  strip_last(name);
  type = new type_tree("literal-string", 0);
  return;
}
//: Unlike other reagents, escape newlines in literal strings to make them
//: more friendly to trace().
2015-06-14 18:30:32 +00:00
:(after "string to_string(const reagent& r)")
2016-09-17 06:52:15 +00:00
if (is_literal_text(r))
return emit_literal_string(r.name);
2015-06-14 06:17:13 +00:00
:(code)
2016-09-17 06:52:15 +00:00
// True when 'x' has a type and that type's name is "literal-string".
bool is_literal_text(const reagent& x) {
  if (!x.type) return false;
  return x.type->name == "literal-string";
}
2015-06-14 06:17:13 +00:00
// Render a literal string's text the way it is shown by to_string():
//   {"<text>": "literal-string"}
// with every newline in <text> spelled as the two characters '\' and 'n'.
string emit_literal_string(string name) {
  // replace() reports where it found a newline, or npos once none are left.
  for (size_t pos = 0;  pos != string::npos;  )
    pos = replace(name, "\n", "\\n", pos);
  string result = "{\"";
  result += name;
  result += "\": \"literal-string\"}";
  return result;
}
// Replace the first occurrence of 'from' in 'str' at or after index 'n' with
// 'to'. Returns the index where that occurrence was found, or string::npos
// (leaving 'str' untouched) if there was none.
size_t replace(string& str, const string& from, const string& to, size_t n) {
  const size_t match = str.find(from, n);
  if (match == string::npos) return string::npos;
  str.replace(match, from.length(), to);
  return match;
}
// Drop the final character of 's'; an empty string is left alone.
void strip_last(string& s) {
  if (s.empty()) return;
  s.erase(SIZE(s)-1);
}
// Called right after a backslash has been read from 'in' (and not copied to
// 'out'). Copies the rest of the backslash run, plus the one character that
// ends it, from 'in' to 'out'. Stops early if the input runs out.
void slurp_one_past_backslashes(istream& in, ostream& out) {
  // When you encounter a backslash, strip it out and pass through any
  // following run of backslashes. If we 'escaped' a single following
  // character, then the character '\' would be:
  //   '\\' escaped once
  //   '\\\\' escaped twice
  //   '\\\\\\\\' escaped thrice (8 backslashes)
  // ..and so on. With our approach it'll be:
  //   '\\' escaped once
  //   '\\\' escaped twice
  //   '\\\\' escaped thrice
  // This only works as long as backslashes aren't also overloaded to create
  // special characters. So Mu doesn't follow C's approach of overloading
  // backslashes both to escape quote characters and also as a notation for
  // unprintable characters like '\n'.
  while (has_data(in)) {
    char c = '\0';
    // Check the read itself: when get() hits end of input it yields no
    // character, and nothing must be written to 'out' for it.
    if (!in.get(c)) break;
    out << c;
    if (c != '\\') break;
  }
}
5001 - drop the :(scenario) DSL I've been saying for a while[1][2][3] that adding extra abstractions makes things harder for newcomers, and adding new notations doubly so. And then I notice this DSL in my own backyard. Makes me feel like a hypocrite. [1] https://news.ycombinator.com/item?id=13565743#13570092 [2] https://lobste.rs/s/to8wpr/configuration_files_are_canary_warning [3] https://lobste.rs/s/mdmcdi/little_languages_by_jon_bentley_1986#c_3miuf2 The implementation of the DSL was also highly hacky: a) It was happening in the tangle/ tool, but was utterly unrelated to tangling layers. b) There were several persnickety constraints on the different kinds of lines and the specific order they were expected in. I kept finding bugs where the translator would silently do the wrong thing. Or the error messages sucked, and readers may be stuck looking at the generated code to figure out what happened. Fixing error messages would require a lot more code, which is one of my arguments against DSLs in the first place: they may be easy to implement, but they're hard to design to go with the grain of the underlying platform. They require lots of iteration. Is that effort worth prioritizing in this project? On the other hand, the DSL did make at least some readers' life easier, the ones who weren't immediately put off by having to learn a strange syntax. There were fewer quotes to parse, fewer backslash escapes. Anyway, since there are also people who dislike having to put up with strange syntaxes, we'll call that consideration a wash and tear this DSL out. --- This commit was sheer drudgery. Hopefully it won't need to be redone with a new DSL because I grow sick of backslashes.
2019-03-13 01:56:55 +00:00
// Balanced brackets nested inside a literal are kept as part of its text.
void test_string_literal_nested() {
load(
"def main [\n"
" 1:address:array:character <- copy [abc [def]]\n"
"]\n"
);
CHECK_TRACE_CONTENTS(
"parse: ingredient: {\"abc [def]\": \"literal-string\"}\n"
);
}
// A '[' preceded by one backslash does not open a nested string: the
// backslash is dropped and the bracket becomes ordinary text.
void test_string_literal_escaped() {
load(
"def main [\n"
" 1:address:array:character <- copy [abc \\[def]\n"
"]\n"
);
CHECK_TRACE_CONTENTS(
"parse: ingredient: {\"abc [def\": \"literal-string\"}\n"
);
}
// Of two backslashes before a '[', only the first is dropped; the second
// stays in the text along with the bracket. The literal here starts with a
// newline, which shows up escaped in the trace.
void test_string_literal_escaped_twice() {
load(
"def main [\n"
" 1:address:array:character <- copy [\n"
"abc \\\\[def]\n"
"]\n"
);
CHECK_TRACE_CONTENTS(
"parse: ingredient: {\"\\nabc \\[def\": \"literal-string\"}\n"
);
}
// A '#' comment following a literal on the same line does not disturb the
// parse: the instruction still has exactly one ingredient and one product.
void test_string_literal_and_comment() {
load(
"def main [\n"
" 1:address:array:character <- copy [abc] # comment\n"
"]\n"
);
CHECK_TRACE_CONTENTS(
"parse: --- defining main\n"
"parse: instruction: copy\n"
"parse: number of ingredients: 1\n"
"parse: ingredient: {\"abc\": \"literal-string\"}\n"
"parse: product: {1: (\"address\" \"array\" \"character\")}\n"
);
}
// A newline inside a literal is printed in the trace as the two characters
// '\' and 'n' rather than as a line break.
void test_string_literal_escapes_newlines_in_trace() {
load(
"def main [\n"
" copy [abc\n"
"def]\n"
"]\n"
);
CHECK_TRACE_CONTENTS(
"parse: ingredient: {\"abc\\ndef\": \"literal-string\"}\n"
);
}
// In a literal that starts with a newline, a ']' inside a '#' comment does
// not end the literal; the comment text is still kept as part of the string.
void test_string_literal_can_skip_past_comments() {
load(
"def main [\n"
" copy [\n"
" # ']' inside comment\n"
" bar\n"
" ]\n"
"]\n"
);
CHECK_TRACE_CONTENTS(
"parse: ingredient: {\"\\n # ']' inside comment\\n bar\\n \": \"literal-string\"}\n"
);
}
// '[]' parses as a literal string with empty text.
void test_string_literal_empty() {
load(
"def main [\n"
" copy []\n"
"]\n"
);
CHECK_TRACE_CONTENTS(
"parse: ingredient: {\"\": \"literal-string\"}\n"
);
}
// Two definitions whose '[' is never closed produce an "unbalanced '['"
// error in the trace.
void test_multiple_unfinished_recipes() {
// NOTE(review): presumably keeps the expected error from being printed while
// the test runs -- confirm where Hide_errors is defined.
Hide_errors = true;
load(
"def f1 [\n"
"def f2 [\n"
);
CHECK_TRACE_CONTENTS(
"error: unbalanced '['\n"
);
}