https://github.com/akkartik/mu/blob/master/038---literal_strings.cc
  1 //: Allow instructions to mention literals directly.
  2 //:
  3 //: This layer will transparently move them to the segment named 'data' (which
  4 //: must already exist, even if it's empty).
  5 
  6 void test_transform_literal_string() {
  7   run(
  8       "== code 0x1\n"
  9       "b8/copy  \"test\"/imm32\n"
 10       "== data 0x2000\n"  // need an empty segment
 11   );
 12   CHECK_TRACE_CONTENTS(
 13       "transform: -- move literal strings to data segment\n"
 14       "transform: adding global variable '__subx_global_1' containing \"test\"\n"
 15       "transform: line after transform: 'b8 __subx_global_1'\n"
 16   );
 17 }
 18 
 19 //: We don't rely on any transforms running in previous layers, but this layer
 20 //: knows about labels and global variables and will emit them for previous
 21 //: layers to transform.
 22 :(after "Begin Transforms")
 23 // Begin Level-3 Transforms
 24 Transform.push_back(transform_literal_strings);  // register the string-literal pass; the labels/globals it emits are left for earlier layers' transforms
 25 // End Level-3 Transforms
 26 
 27 :(before "End Globals")
 28 int Next_auto_global = 1;  // suffix N for the next generated '__subx_global_N' name
 29 :(before "End Reset")
 30 Next_auto_global = 1;  // restart numbering from 1
 31 :(code)
 32 void transform_literal_strings(program& p) {
 33   trace(3, "transform") << "-- move literal strings to data segment" << end();
 34   if (p.segments.empty()) return;
 35   vector<line> new_lines;
 36   for (int s = 0;  s < SIZE(p.segments);  ++s) {
 37     segment& seg = p.segments.at(s);
 38     trace(99, "transform") << "segment '" << seg.name << "'" << end();
 39     for (int i = 0;  i < SIZE(seg.lines);  ++i) {
 40 //?       cerr << seg.name << '/' << i << '\n';
 41       line& line = seg.lines.at(i);
 42       for (int j = 0;  j < SIZE(line.words);  ++j) {
 43         word& curr = line.words.at(j);
 44         if (curr.data.at(0) != '"') continue;
 45         ostringstream global_name;
 46         global_name << "__subx_global_" << Next_auto_global;
 47         ++Next_auto_global;
 48         add_global_to_data_segment(global_name.str(), curr, new_lines);
 49         curr.data = global_name.str();
 50       }
 51       trace(99, "transform") << "line after transform: '" << data_to_string(line) << "'" << end();
 52     }
 53   }
 54   segment* data = find(p, "data");
 55   if (data)
 56     data->lines.insert(data->lines.end(), new_lines.begin(), new_lines.end());
 57 }
 58 
 59 void add_global_to_data_segment(const string& name, const word& value, vector<line>& out) {
 60   trace(99, "transform") << "adding global variable '" << name << "' containing " << value.data << end();
 61   // emit label
 62   out.push_back(label(name));
 63   // emit size for size-prefixed array
 64   out.push_back(line());
 65   emit_hex_bytes(out.back(), SIZE(value.data)-/*skip quotes*/2, 4/*bytes*/);
 66   // emit data byte by byte
 67   out.push_back(line());
 68   line& curr = out.back();
 69   for (int i = /*skip start quote*/1;  i < SIZE(value.data)-/*skip end quote*/1;  ++i) {
 70     char c = value.data.at(i);
 71     curr.words.push_back(word());
 72     curr.words.back().data = hex_byte_to_string(c);
 73     curr.words.back().metadata.push_back(string(1, c));
 74   }
 75 }
 76 
 77 //: Within strings, whitespace is significant. So we need to redo our instruction
 78 //: parsing.
 79 
 80 void test_instruction_with_string_literal() {
 81   parse_instruction_character_by_character(
 82       "a \"abc  def\" z\n"  // two spaces inside string
 83   );
 84   CHECK_TRACE_CONTENTS(
 85       "parse2: word: a\n"
 86       "parse2: word: \"abc  def\"\n"
 87       "parse2: word: z\n"
 88   );
 89   // no other words
 90   CHECK_TRACE_COUNT("parse2", 3);
 91 }
 92 
 93 void test_string_literal_in_data_segment() {
 94   run(
 95       "== code 0x1\n"
 96       "b8/copy  X/imm32\n"
 97       "== data 0x2000\n"
 98       "X:\n"
 99       "\"test\"/imm32\n"
100   );
101   CHECK_TRACE_CONTENTS(
102       "transform: -- move literal strings to data segment\n"
103       "transform: adding global variable '__subx_global_1' containing \"test\"\n"
104       "transform: line after transform: '__subx_global_1'\n"
105   );
106 }
107 
108 :(before "End Line Parsing Special-cases(line_data -> l)")
109 if (line_data.find('"') != string::npos) {  // can cause false-positives, but we can handle them
110   parse_instruction_character_by_character(line_data, l);  // the parsed line is appended to l if it contains any words
111   continue;  // NOTE(review): presumably skips the default parsing of this line -- the enclosing loop isn't visible here
112 }
113 
114 :(code)
115 void parse_instruction_character_by_character(const string& line_data, vector<line>& out) {
116   if (line_data.find('\n') != string::npos  && line_data.find('\n') != line_data.size()-1) {
117     raise << "parse_instruction_character_by_character: should receive only a single line\n" << end();
118     return;
119   }
120   // parse literals
121   istringstream in(line_data);
122   in >> std::noskipws;
123   line result;
124   result.original = line_data;
125   // add tokens (words or strings) one by one
126   while (has_data(in)) {
127     skip_whitespace(in);
128     if (!has_data(in)) break;
129     char c = in.get();
130     if (c == '#') break;  // comment; drop rest of line
131     if (c == ':') break;  // line metadata; skip for now
132     if (c == '.') {
133       if (!has_data(in)) break;  // comment token at end of line
134       if (isspace(in.peek()))
135         continue;  // '.' followed by space is comment token; skip
136     }
137     result.words.push_back(word());
138     if (c == '"') {
139       // string literal; slurp everything between quotes into data
140       ostringstream d;
141       d << c;
142       while (has_data(in)) {
143         in >> c;
144         if (c == '\\') {
145           in >> c;
146           if (c == 'n') d << '\n';
147           else if (c == '"') d << '"';
148           else if (c == '\\') d << '\\';
149           else {
150             raise << "parse_instruction_character_by_character: unknown escape sequence '\\" << c << "'\n" << end();
151             return;
152           }
153           continue;
154         } else {
155           d << c;
156         }
157         if (c == '"') break;
158       }
159       result.words.back().data = d.str();
160       result.words.back().original = d.str();
161       // slurp metadata
162       ostringstream m;
163       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
164         in >> c;
165         if (c == '/') {
166           if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
167           m.str("");
168         }
169         else {
170           m << c;
171         }
172       }
173       if (!m.str().empty()) result.words.back().metadata.push_back(m.str());
174     }
175     else {
176       // not a string literal; slurp all characters until whitespace
177       ostringstream w;
178       w << c;
179       while (!isspace(in.peek()) && has_data(in)) {  // peek can sometimes trigger eof(), so do it first
180         in >> c;
181         w << c;
182       }
183       parse_word(w.str(), result.words.back());
184     }
185     trace(99, "parse2") << "word: " << to_string(result.words.back()) << end();
186   }
187   if (!result.words.empty())
188     out.push_back(result);
189 }
190 
191 void skip_whitespace(istream& in) {
192   while (true) {
193     if (has_data(in) && isspace(in.peek())) in.get();
194     else break;
195   }
196 }
197 
198 void skip_comment(istream& in) {
199   if (has_data(in) && in.peek() == '#') {
200     in.get();
201     while (has_data(in) && in.peek() != '\n') in.get();
202   }
203 }
204 
205 line label(string s) {
206   line result;
207   result.words.push_back(word());
208   result.words.back().data = (s+":");
209   return result;
210 }
211 
212 // helper for tests
213 void parse_instruction_character_by_character(const string& line_data) {
214   vector<line> out;
215   parse_instruction_character_by_character(line_data, out);
216 }
217 
218 void test_parse2_comment_token_in_middle() {
219   parse_instruction_character_by_character(
220       "a . z\n"
221   );
222   CHECK_TRACE_CONTENTS(
223       "parse2: word: a\n"
224       "parse2: word: z\n"
225   );
226   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
227   // no other words
228   CHECK_TRACE_COUNT("parse2", 2);
229 }
230 
231 void test_parse2_word_starting_with_dot() {
232   parse_instruction_character_by_character(
233       "a .b c\n"
234   );
235   CHECK_TRACE_CONTENTS(
236       "parse2: word: a\n"
237       "parse2: word: .b\n"
238       "parse2: word: c\n"
239   );
240 }
241 
242 void test_parse2_comment_token_at_start() {
243   parse_instruction_character_by_character(
244       ". a b\n"
245   );
246   CHECK_TRACE_CONTENTS(
247       "parse2: word: a\n"
248       "parse2: word: b\n"
249   );
250   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
251 }
252 
253 void test_parse2_comment_token_at_end() {
254   parse_instruction_character_by_character(
255       "a b .\n"
256   );
257   CHECK_TRACE_CONTENTS(
258       "parse2: word: a\n"
259       "parse2: word: b\n"
260   );
261   CHECK_TRACE_DOESNT_CONTAIN("parse2: word: .");
262 }
263 
264 void test_parse2_word_starting_with_dot_at_start() {
265   parse_instruction_character_by_character(
266       ".a b c\n"
267   );
268   CHECK_TRACE_CONTENTS(
269       "parse2: word: .a\n"
270       "parse2: word: b\n"
271       "parse2: word: c\n"
272   );
273 }
274 
275 void test_parse2_metadata() {
276   parse_instruction_character_by_character(
277       ".a b/c d\n"
278   );
279   CHECK_TRACE_CONTENTS(
280       "parse2: word: .a\n"
281       "parse2: word: b /c\n"
282       "parse2: word: d\n"
283   );
284 }
285 
286 void test_parse2_string_with_metadata() {
287   parse_instruction_character_by_character(
288       "a \"bc  def\"/disp32 g\n"
289   );
290   CHECK_TRACE_CONTENTS(
291       "parse2: word: a\n"
292       "parse2: word: \"bc  def\" /disp32\n"
293       "parse2: word: g\n"
294   );
295 }
296 
297 void test_parse2_string_with_metadata_at_end() {
298   parse_instruction_character_by_character(
299       "a \"bc  def\"/disp32\n"
300   );
301   CHECK_TRACE_CONTENTS(
302       "parse2: word: a\n"
303       "parse2: word: \"bc  def\" /disp32\n"
304   );
305 }
306 
307 void test_parse2_string_with_metadata_at_end_of_line_without_newline() {
308   parse_instruction_character_by_character(
309       "68/push \"test\"/f"  // no newline, which is how calls from parse() will look
310   );
311   CHECK_TRACE_CONTENTS(
312       "parse2: word: 68 /push\n"
313       "parse2: word: \"test\" /f\n"
314   );
315 }
316 
317 //: Make sure slashes inside strings don't trigger adding stuff from inside the
318 //: string to metadata.
319 
320 void test_parse2_string_containing_slashes() {
321   parse_instruction_character_by_character(
322       "a \"bc/def\"/disp32\n"
323   );
324   CHECK_TRACE_CONTENTS(
325       "parse2: word: \"bc/def\" /disp32\n"
326   );
327 }
328 
329 void test_instruction_with_string_literal_with_escaped_quote() {
330   parse_instruction_character_by_character(
331       "\"a\\\"b\"\n"  // escaped quote inside string
332   );
333   CHECK_TRACE_CONTENTS(
334       "parse2: word: \"a\"b\"\n"
335   );
336   // no other words
337   CHECK_TRACE_COUNT("parse2", 1);
338 }
339 
340 void test_instruction_with_string_literal_with_escaped_backslash() {
341   parse_instruction_character_by_character(
342       "\"a\\\\b\"\n"  // escaped backslash inside string
343   );
344   CHECK_TRACE_CONTENTS(
345       "parse2: word: \"a\\b\"\n"
346   );
347   // no other words
348   CHECK_TRACE_COUNT("parse2", 1);
349 }