https://github.com/akkartik/mu/blob/master/subx/030---operands.cc
  1 //: Beginning of "level 2": tagging bytes with metadata around what field of
  2 //: an x86 instruction they're for.
  3 //:
  4 //: The x86 instruction set is variable-length, and how a byte is interpreted
  5 //: affects later instruction boundaries. A lot of the pain in programming
  6 //: machine code stems from computer and programmer going out of sync on what
  7 //: a byte means. The miscommunication is usually not immediately caught, and
  8 //: metastasizes at runtime into kilobytes of misinterpreted instructions.
  9 //:
 10 //: To mitigate these issues, we'll start programming in terms of logical
 11 //: operands rather than physical bytes. Some operands are smaller than a
 12 //: byte, and others may consist of multiple bytes. This layer will correctly
 13 //: pack and order the bytes corresponding to the operands in an instruction.
 14 
 15 :(before "End Help Texts")
 16 put_new(Help, "instructions",
 17   "Each x86 instruction consists of an instruction or opcode and some number\n"
 18   "of operands.\n"
 19   "Each operand has a type. An instruction won't have more than one operand of\n"
 20   "any type.\n"
 21   "Each instruction has some set of allowed operand types. It'll reject others.\n"
 22   "The complete list of operand types: mod, subop, r32 (register), rm32\n"
 23   "(register or memory), scale, index, base, disp8, disp16, disp32, imm8,\n"
 24   "imm32.\n"
 25   "Each of these has its own help page. Try reading 'subx help mod' next.\n"
 26 );
 27 :(before "End Help Contents")
 28 cerr << "  instructions\n";
 29 
 30 :(code)
 31 void test_pack_immediate_constants() {
 32   run(
 33       "== code 0x1\n"
 34       "bb  0x2a/imm32\n"
 35   );
 36   CHECK_TRACE_CONTENTS(
 37       "transform: packing instruction 'bb 0x2a/imm32'\n"
 38       "transform: instruction after packing: 'bb 2a 00 00 00'\n"
 39       "run: copy imm32 0x0000002a to EBX\n"
 40   );
 41 }
 42 
 43 //: complete set of valid operand types
 44 
 45 :(before "End Globals")
 46 set<string> Instruction_operands;
 47 :(before "End One-time Setup")
 48 Instruction_operands.insert("subop");
 49 Instruction_operands.insert("mod");
 50 Instruction_operands.insert("rm32");
 51 Instruction_operands.insert("base");
 52 Instruction_operands.insert("index");
 53 Instruction_operands.insert("scale");
 54 Instruction_operands.insert("r32");
 55 Instruction_operands.insert("disp8");
 56 Instruction_operands.insert("disp16");
 57 Instruction_operands.insert("disp32");
 58 Instruction_operands.insert("imm8");
 59 Instruction_operands.insert("imm32");
 60 
 61 :(before "End Help Texts")
 62 init_operand_type_help();
 63 :(code)
 64 void init_operand_type_help() {
 65   put(Help, "mod",
 66     "2-bit operand controlling the _addressing mode_ of many instructions,\n"
 67     "to determine how to compute the _effective address_ to look up memory at\n"
 68     "based on the 'rm32' operand and potentially others.\n"
 69     "\n"
 70     "If mod = 3, just operate on the contents of the register specified by rm32\n"
 71     "            (direct mode).\n"
 72     "If mod = 2, effective address is usually* rm32 + disp32\n"
 73     "            (indirect mode with displacement).\n"
 74     "If mod = 1, effective address is usually* rm32 + disp8\n"
 75     "            (indirect mode with displacement).\n"
 76     "If mod = 0, effective address is usually* rm32 (indirect mode).\n"
 77     "(* - The exception is when rm32 is '4'. Register 4 is the stack pointer (ESP).\n"
 78     "     Using it as an address gets more involved. For more details,\n"
 79     "     try reading the help pages for 'base', 'index' and 'scale'.)\n"
 80     "\n"
 81     "For complete details, spend some time with two tables in the IA-32 software\n"
 82     "developer's manual that are also included in this repo:\n"
 83     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
 84     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
 85   );
 86   put(Help, "subop",
 87     "Additional 3-bit operand for determining the instruction when the opcode is 81, 8f or ff.\n"
 88     "Can't coexist with operand of type 'r32' in a single instruction, because the two use the same bits.\n"
 89   );
 90   put(Help, "r32",
 91     "3-bit operand specifying a register operand used directly, without any further addressing modes.\n"
 92   );
 93   put(Help, "rm32",
 94     "32-bit value in register or memory. The precise details of its construction\n"
 95     "depend on the eponymous 3-bit 'rm32' operand, the 'mod' operand, and also\n"
 96     "potentially the 'SIB' operands ('scale', 'index' and 'base') and a displacement\n"
 97     "('disp8' or 'disp32').\n"
 98     "\n"
 99     "For complete details, spend some time with two tables in the IA-32 software\n"
100     "developer's manual that are also included in this repo:\n"
101     "  - modrm.pdf: volume 2, table 2-2, \"32-bit addressing with the ModR/M byte.\".\n"
102     "  - sib.pdf: volume 2, table 2-3, \"32-bit addressing with the SIB byte.\".\n"
103   );
104   put(Help, "base",
105     "Additional 3-bit operand (when 'rm32' is 4, unless 'mod' is 3) specifying the\n"
106     "register containing an address to look up.\n"
107     "This address may be further modified by 'index' and 'scale' operands.\n"
108     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
109     "For complete details, spend some time with the IA-32 software developer's manual,\n"
110     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
111     "It is included in this repository as 'sib.pdf'.\n"
112   );
113   put(Help, "index",
114     "Optional 3-bit operand (when 'rm32' is 4 unless 'mod' is 3) that can be added to\n"
115     "the 'base' operand to compute the 'effective address' at which to look up memory.\n"
116     "  effective address = base + index*scale + displacement (disp8 or disp32)\n"
117     "For complete details, spend some time with the IA-32 software developer's manual,\n"
118     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
119     "It is included in this repository as 'sib.pdf'.\n"
120   );
121   put(Help, "scale",
122     "Optional 2-bit operand (when 'rm32' is 4 unless 'mod' is 3) that encodes a\n"
123     "power of 2 to be multiplied to the 'index' operand before adding the result to\n"
124     "the 'base' operand to compute the _effective address_ to operate on.\n"
125     "  effective address = base + index * scale + displacement (disp8 or disp32)\n"
126     "\n"
127     "When scale is 0, use index unmodified.\n"
128     "When scale is 1, multiply index by 2.\n"
129     "When scale is 2, multiply index by 4.\n"
130     "When scale is 3, multiply index by 8.\n"
131     "\n"
132     "For complete details, spend some time with the IA-32 software developer's manual,\n"
133     "volume 2, table 2-3, \"32-bit addressing with the SIB byte\".\n"
134     "It is included in this repository as 'sib.pdf'.\n"
135   );
136   put(Help, "disp8",
137     "8-bit value to be added in many instructions.\n"
138   );
139   put(Help, "disp16",
140     "16-bit value to be added in many instructions.\n"
141     "Currently not used in any SubX instructions.\n"
142   );
143   put(Help, "disp32",
144     "32-bit value to be added in many instructions.\n"
145   );
146   put(Help, "imm8",
147     "8-bit value for many instructions.\n"
148   );
149   put(Help, "imm32",
150     "32-bit value for many instructions.\n"
151   );
152 }
153 
154 //:: transform packing operands into bytes in the right order
155 
156 :(after "Begin Transforms")
157 // Begin Level-2 Transforms
158 Transform.push_back(pack_operands);
159 // End Level-2 Transforms
160 
161 :(code)
162 void pack_operands(program& p) {
163   if (p.segments.empty()) return;
164   segment& code = *find(p, "code");
165   // Pack Operands(segment code)
166   trace(3, "transform") << "-- pack operands" << end();
167   for (int i = 0;  i < SIZE(code.lines);  ++i) {
168     line& inst = code.lines.at(i);
169     if (all_hex_bytes(inst)) continue;
170     trace(99, "transform") << "packing instruction '" << to_string(/*with metadata*/inst) << "'" << end();
171     pack_operands(inst);
172     trace(99, "transform") << "instruction after packing: '" << to_string(/*without metadata*/inst.words) << "'" << end();
173   }
174 }
175 
176 void pack_operands(line& inst) {
177   line new_inst;
178   add_opcodes(inst, new_inst);
179   add_modrm_byte(inst, new_inst);
180   add_sib_byte(inst, new_inst);
181   add_disp_bytes(inst, new_inst);
182   add_imm_bytes(inst, new_inst);
183   inst.words.swap(new_inst.words);
184 }
185 
186 void add_opcodes(const line& in, line& out) {
187   out.words.push_back(in.words.at(0));
188   if (in.words.at(0).data == "0f" || in.words.at(0).data == "f2" || in.words.at(0).data == "f3")
189     out.words.push_back(in.words.at(1));
190   if (in.words.at(0).data == "f3" && in.words.at(1).data == "0f")
191     out.words.push_back(in.words.at(2));
192   if (in.words.at(0).data == "f2" && in.words.at(1).data == "0f")
193     out.words.push_back(in.words.at(2));
194 }
195 
196 void add_modrm_byte(const line& in, line& out) {
197   uint8_t mod=0, reg_subop=0, rm32=0;
198   bool emit = false;
199   for (int i = 0;  i < SIZE(in.words);  ++i) {
200     const word& curr = in.words.at(i);
201     if (has_operand_metadata(curr, "mod")) {
202       mod = hex_byte(curr.data);
203       emit = true;
204     }
205     else if (has_operand_metadata(curr, "rm32")) {
206       rm32 = hex_byte(curr.data);
207       emit = true;
208     }
209     else if (has_operand_metadata(curr, "r32")) {
210       reg_subop = hex_byte(curr.data);
211       emit = true;
212     }
213     else if (has_operand_metadata(curr, "subop")) {
214       reg_subop = hex_byte(curr.data);
215       emit = true;
216     }
217   }
218   if (emit)
219     out.words.push_back(hex_byte_text((mod << 6) | (reg_subop << 3) | rm32));
220 }
221 
222 void add_sib_byte(const line& in, line& out) {
223   uint8_t scale=0, index=0, base=0;
224   bool emit = false;
225   for (int i = 0;  i < SIZE(in.words);  ++i) {
226     const word& curr = in.words.at(i);
227     if (has_operand_metadata(curr, "scale")) {
228       scale = hex_byte(curr.data);
229       emit = true;
230     }
231     else if (has_operand_metadata(curr, "index")) {
232       index = hex_byte(curr.data);
233       emit = true;
234     }
235     else if (has_operand_metadata(curr, "base")) {
236       base = hex_byte(curr.data);
237       emit = true;
238     }
239   }
240   if (emit)
241     out.words.push_back(hex_byte_text((scale << 6) | (index << 3) | base));
242 }
243 
244 void add_disp_bytes(const line& in, line& out) {
245   for (int i = 0;  i < SIZE(in.words);  ++i) {
246     const word& curr = in.words.at(i);
247     if (has_operand_metadata(curr, "disp8"))
248       emit_hex_bytes(out, curr, 1);
249     if (has_operand_metadata(curr, "disp16"))
250       emit_hex_bytes(out, curr, 2);
251     else if (has_operand_metadata(curr, "disp32"))
252       emit_hex_bytes(out, curr, 4);
253   }
254 }
255 
256 void add_imm_bytes(const line& in, line& out) {
257   for (int i = 0;  i < SIZE(in.words);  ++i) {
258     const word& curr = in.words.at(i);
259     if (has_operand_metadata(curr, "imm8"))
260       emit_hex_bytes(out, curr, 1);
261     else if (has_operand_metadata(curr, "imm32"))
262       emit_hex_bytes(out, curr, 4);
263   }
264 }
265 
266 void emit_hex_bytes(line& out, const word& w, int num) {
267   assert(num <= 4);
268   bool is_number = looks_like_hex_int(w.data);
269   if (num == 1 || !is_number) {
270     out.words.push_back(w);  // preserve existing metadata
271     if (is_number)
272       out.words.back().data = hex_byte_to_string(parse_int(w.data));
273     return;
274   }
275   emit_hex_bytes(out, static_cast<uint32_t>(parse_int(w.data)), num);
276 }
277 
278 void emit_hex_bytes(line& out, uint32_t val, int num) {
279   assert(num <= 4);
280   for (int i = 0;  i < num;  ++i) {
281     out.words.push_back(hex_byte_text(val & 0xff));
282     val = val >> 8;
283   }
284 }
285 
286 word hex_byte_text(uint8_t val) {
287   word result;
288   result.data = hex_byte_to_string(val);
289   result.original = result.data+"/auto";
290   return result;
291 }
292 
293 string hex_byte_to_string(uint8_t val) {
294   ostringstream out;
295   // uint8_t prints without padding, but int8_t will expand to 32 bits again
296   out << HEXBYTE << NUM(val);
297   return out.str();
298 }
299 
300 string to_string(const vector<word>& in) {
301   ostringstream out;
302   for (int i = 0;  i < SIZE(in);  ++i) {
303     if (i > 0) out << ' ';
304     out << in.at(i).data;
305   }
306   return out.str();
307 }
308 
309 :(before "End Unit Tests")
310 void test_preserve_metadata_when_emitting_single_byte() {
311   word in;
312   in.data = "f0";
313   in.original = "f0/foo";
314   line out;
315   emit_hex_bytes(out, in, 1);
316   CHECK_EQ(out.words.at(0).data, "f0");
317   CHECK_EQ(out.words.at(0).original, "f0/foo");
318 }
319 
320 :(code)
321 void test_pack_disp8() {
322   run(
323       "== code 0x1\n"
324       "74 2/disp8\n"  // jump 2 bytes away if ZF is set
325   );
326   CHECK_TRACE_CONTENTS(
327       "transform: packing instruction '74 2/disp8'\n"
328       "transform: instruction after packing: '74 02'\n"
329   );
330 }
331 
332 void test_pack_disp8_negative() {
333   transform(
334       "== code 0x1\n"
335       // running this will cause an infinite loop
336       "74 -1/disp8\n"  // jump 1 byte before if ZF is set
337   );
338   CHECK_TRACE_CONTENTS(
339       "transform: packing instruction '74 -1/disp8'\n"
340       "transform: instruction after packing: '74 ff'\n"
341   );
342 }
343 
344 //: helper for scenario
345 void transform(const string& text_bytes) {
346   program p;
347   istringstream in(text_bytes);
348   parse(in, p);
349   if (trace_contains_errors()) return;
350   transform(p);
351 }
352 
353 void test_pack_modrm_imm32() {
354   run(
355       "== code 0x1\n"
356       // instruction                     effective address                                                   operand     displacement    immediate\n"
357       // op          subop               mod             rm32          base        index         scale       r32\n"
358       // 1-3 bytes   3 bits              2 bits          3 bits        3 bits      3 bits        2 bits      2 bits      0/1/2/4 bytes   0/1/2/4 bytes\n"
359       "  81          0/add/subop         3/mod/direct    3/ebx/rm32                                                                      1/imm32      \n"  // add 1 to EBX
360   );
361   CHECK_TRACE_CONTENTS(
362       "transform: packing instruction '81 0/add/subop 3/mod/direct 3/ebx/rm32 1/imm32'\n"
363       "transform: instruction after packing: '81 c3 01 00 00 00'\n"
364   );
365 }
366 
367 void test_pack_imm32_large() {
368   run(
369       "== code 0x1\n"
370       "b9  0x080490a7/imm32\n"
371   );
372   CHECK_TRACE_CONTENTS(
373       "transform: packing instruction 'b9 0x080490a7/imm32'\n"
374       "transform: instruction after packing: 'b9 a7 90 04 08'\n"
375   );
376 }
377 
378 void test_pack_immediate_constants_hex() {
379   run(
380       "== code 0x1\n"
381       "b9  0x2a/imm32\n"
382   );
383   CHECK_TRACE_CONTENTS(
384       "transform: packing instruction 'b9 0x2a/imm32'\n"
385       "transform: instruction after packing: 'b9 2a 00 00 00'\n"
386       "run: copy imm32 0x0000002a to ECX\n"
387   );
388 }
389 
390 void test_pack_silently_ignores_non_hex() {
391   Hide_errors = true;
392   transform(
393       "== code 0x1\n"
394       "b9  foo/imm32\n"
395   );
396   CHECK_TRACE_CONTENTS(
397       "transform: packing instruction 'b9 foo/imm32'\n"
398       // no change (we're just not printing metadata to the trace)
399       "transform: instruction after packing: 'b9 foo'\n"
400   );
401 }
402 
403 void test_pack_flags_bad_hex() {
404   Hide_errors = true;
405   run(
406       "== code 0x1\n"
407       "b9  0xfoo/imm32\n"
408   );
409   CHECK_TRACE_CONTENTS(
410       "error: not a number: 0xfoo\n"
411   );
412 }
413 
414 //:: helpers
415 
416 bool all_hex_bytes(const line& inst) {
417   for (int i = 0;  i < SIZE(inst.words);  ++i)
418     if (!is_hex_byte(inst.words.at(i)))
419       return false;
420   return true;
421 }
422 
423 bool is_hex_byte(const word& curr) {
424   if (contains_any_operand_metadata(curr))
425     return false;
426   if (SIZE(curr.data) != 2)
427     return false;
428   if (curr.data.find_first_not_of("0123456789abcdefABCDEF") != string::npos)
429     return false;
430   return true;
431 }
432 
433 bool contains_any_operand_metadata(const word& word) {
434   for (int i = 0;  i < SIZE(word.metadata);  ++i)
435     if (Instruction_operands.find(word.metadata.at(i)) != Instruction_operands.end())
436       return true;
437   return false;
438 }
439 
440 bool has_operand_metadata(const line& inst, const string& m) {
441   bool result = false;
442   for (int i = 0;  i < SIZE(inst.words);  ++i) {
443     if (!has_operand_metadata(inst.words.at(i), m)) continue;
444     if (result) {
445       raise << "'" << to_string(inst) << "' has conflicting " << m << " operands\n" << end();
446       return false;
447     }
448     result = true;
449   }
450   return result;
451 }
452 
453 bool has_operand_metadata(const word& w, const string& m) {
454   bool result = false;
455   bool metadata_found = false;
456   for (int i = 0;  i < SIZE(w.metadata);  ++i) {
457     const string& curr = w.metadata.at(i);
458     if (Instruction_operands.find(curr) == Instruction_operands.end()) continue;  // ignore unrecognized metadata
459     if (metadata_found) {
460       raise << "'" << w.original << "' has conflicting operand types; it should have only one\n" << end();
461       return false;
462     }
463     metadata_found = true;
464     result = (curr == m);
465   }
466   return result;
467 }
468 
469 word metadata(const line& inst, const string& m) {
470   for (int i = 0;  i < SIZE(inst.words);  ++i)
471     if (has_operand_metadata(inst.words.at(i), m))
472       return inst.words.at(i);
473   assert(false);
474 }
475 
476 bool looks_like_hex_int(const string& s) {
477   if (s.empty()) return false;
478   if (s.at(0) == '-' || s.at(0) == '+') return true;
479   if (isdigit(s.at(0))) return true;  // includes '0x' prefix
480   // End looks_like_hex_int(s) Detectors
481   return false;
482 }
483 
484 string to_string(const line& inst) {
485   ostringstream out;
486   for (int i = 0;  i < SIZE(inst.words);  ++i) {
487     if (i > 0) out << ' ';
488     out << inst.words.at(i).original;
489   }
490   return out.str();
491 }