Contents Introduction n Computer Architecture n ARM Architecture

  • Slides: 66
Download presentation

Contents Introduction n Computer Architecture n ARM Architecture n Development Tools : GNU Development

Contents • Introduction • Computer Architecture • ARM Architecture • Development Tools: GNU Development Tools • ARM Instruction Set • ARM Assembly Language • ARM Assembly Programming: GNU ARM Toolchain • Interrupts and Monitor 2

Lecture 8 ARM Assembly Programming

Lecture 8 ARM Assembly Programming

Outline n n n Assembly Programming Assembly C Interface Peephole Optimization 4

Outline • Assembly Programming • Assembly-C Interface • Peephole Optimization 4

Example #4: String Length #include <stdio. h> extern int mystrlen(char *s); int main() {

Example #4: String Length
#include <stdio.h>
extern int mystrlen(char *s);
int main() { char s[20] = "Hello, World!\n"; printf("The length of the string is %d\n", mystrlen(s)); }
int mystrlen(char *s1) { char *s2; s2 = s1; while (*s2 != 0) { s2++; } return (s2 - s1); }
5

Example #4: Pseudo Code int mystrlen(char *s 1) { char *s 2; s 2

Example #4: Pseudo Code
C source:
int mystrlen(char *s1) { char *s2; s2 = s1; while (*s2 != 0) { s2++; } return (s2 - s1); }
Pseudo code:
mystrlen:
    s2 = s1
start_loop:
    if (*s2 == 0) goto end_loop
    s2 = s2 + 1
    goto start_loop
end_loop:
    return (s2 - s1)
6

Example #4: Storage Assignment mystrlen: s 2 = s 1 start_loop: if (*s 2

Example #4: Storage Assignment
Pseudo code:
mystrlen:
    s2 = s1
start_loop:
    if (*s2 == 0) goto end_loop
    s2 = s2 + 1
    goto start_loop
end_loop:
    return (s2 - s1)
With registers assigned:
mystrlen:
    r4 = r0
start_loop:
    r5 = *r4
    if (r5 == 0) goto end_loop
    r4 = r4 + 1
    goto start_loop
end_loop:
    return (r4 - r0)
7

Example #4: Final Assembly Code mystrlen: r 4 = r 0 start_loop: r 5

Example #4: Final Assembly Code
Pseudo code with registers:
mystrlen:
    r4 = r0
start_loop:
    r5 = *r4
    if (r5 == 0) goto end_loop
    r4 = r4 + 1
    goto start_loop
end_loop:
    return (r4 - r0)
Assembly:
    .text
    .align 2
    .global mystrlen
mystrlen:
    mov r4, r0
start_loop:
    ldrb r5, [r4]
    cmp r5, #0
    beq end_loop
    add r4, #1
    b start_loop
end_loop:
    sub r0, r4, r0
    mov pc, lr
8

Example #5: Summation #include <stdio. h> extern int mysum(int n, int *array); int main()

Example #5: Summation
#include <stdio.h>
extern int mysum(int n, int *array);
int main() { int a[5] = {1, 3, 5, 7, 9}; printf("The summation of the array is %d\n", mysum(5, a)); }
int mysum(int n, int *array) { int i, sum; sum = 0; for (i = 0; i < n; i++) { sum += array[i]; } return sum; }
9

Example #5: Pseudo Code int mysum(int n, int *array) { int i, sum; sum

Example #5: Pseudo Code
C source:
int mysum(int n, int *array) { int i, sum; sum = 0; for (i = 0; i < n; i++) { sum += array[i]; } return sum; }
Pseudo code:
mysum:
    sum = 0
    i = 0
start_loop:
    if (i >= n) goto end_loop
    sum = sum + array[i]
    i = i + 1
    goto start_loop
end_loop:
    return sum
10

Example #5: Storage Assignment mysum: sum = 0 i = 0 start_loop: if (i

Example #5: Storage Assignment
Pseudo code:
mysum:
    sum = 0
    i = 0
start_loop:
    if (i >= n) goto end_loop
    sum = sum + array[i]
    i = i + 1
    goto start_loop
end_loop:
    return sum
With registers assigned:
mysum:
    r5 = 0
    r4 = 0
start_loop:
    if (r4 >= r0) goto end_loop
    r6 = r1[r4]
    r5 = r5 + r6
    r4 = r4 + 1
    goto start_loop
end_loop:
    return r5
11

Example #5: Final Assembly Code mysum: r 5 = 0 r 4 = 0

Example #5: Final Assembly Code
Pseudo code with registers:
mysum:
    r5 = 0
    r4 = 0
start_loop:
    if (r4 >= r0) goto end_loop
    r6 = r1[r4]
    r5 = r5 + r6
    r4 = r4 + 1
    goto start_loop
end_loop:
    return r5
Assembly:
    .text
    .align 2
    .global mysum
mysum:
    mov r5, #0
    mov r4, #0
start_loop:
    cmp r4, r0
    bge end_loop
    ldr r6, [r1, r4, LSL #2]
    add r5, r6
    add r4, #1
    b start_loop
end_loop:
    mov r0, r5
    mov pc, lr
12

Example #6: Bubble Sort 1 #include <stdio. h> extern void bubble(int n, int *a);

Example #6: Bubble Sort 1
#include <stdio.h>
extern void bubble(int n, int *a);
int main() { int i; int a[5] = {9, 7, 5, 3, 1}; bubble(5, a); printf("The sorted array: \n"); for (i = 0; i < 5; i++) { printf("a[%d] = %d\n", i, a[i]); } }
13

Example #6: Bubble Sort 2 void sort 2(int *a, int *b) { int tmp;

Example #6: Bubble Sort 2
void sort2(int *a, int *b) { int tmp; if (*b < *a) { tmp = *a; *a = *b; *b = tmp; } }
void bubble(int n, int *a) { int i, j; for (i = 0; i < n - 1; i++) { for (j = 0; j < n - 1 - i; j++) { sort2(&a[j], &a[j+1]); } } }
14

Example #6: Pseudo Code void bubble(int n, int *a); { int i, j; for

Example #6: Pseudo Code
C source:
void bubble(int n, int *a) { int i, j; for (i = 0; i < n - 1; i++) { for (j = 0; j < n - 1 - i; j++) { sort2(&a[j], &a[j+1]); } } }
Pseudo code:
bubble:
    i = 0
start_outer:
    if (i >= n - 1) goto end_outer
    j = 0
start_inner:
    if (j >= n - 1 - i) goto end_inner
    sort2(&a[j], &a[j+1])
    j = j + 1
    goto start_inner
end_inner:
    i = i + 1
    goto start_outer
end_outer:
    return
15

Example #6: Storage Assignment bubble: i = 0 start_outer: if (i >= n 1)

Example #6: Storage Assignment
Pseudo code:
bubble:
    i = 0
start_outer:
    if (i >= n - 1) goto end_outer
    j = 0
start_inner:
    if (j >= n - 1 - i) goto end_inner
    sort2(&a[j], &a[j+1])
    j = j + 1
    goto start_inner
end_inner:
    i = i + 1
    goto start_outer
end_outer:
    return
With registers assigned:
bubble:
    r2 = 0
start_outer:
    r4 = r0 - 1
    if (r2 >= r4) goto end_outer
    r3 = 0
start_inner:
    r5 = r4 - r2
    if (r3 >= r5) goto end_inner
    sort2(r1+r3*4, r1+r3*4+4)
    r3 = r3 + 1
    goto start_inner
end_inner:
    r2 = r2 + 1
    goto start_outer
end_outer:
    return
16

Example #6: Assembly Code? bubble: r 2 = 0 start_outer: r 4 = r

Example #6: Assembly Code?
Pseudo code with registers:
bubble:
    r2 = 0
start_outer:
    r4 = r0 - 1
    if (r2 >= r4) goto end_outer
    r3 = 0
start_inner:
    r5 = r4 - r2
    if (r3 >= r5) goto end_inner
    sort2(r1+r3*4, r1+r3*4+4)
    r3 = r3 + 1
    goto start_inner
end_inner:
    r2 = r2 + 1
    goto start_outer
end_outer:
    return
Assembly:
bubble:
    mov r2, #0
start_outer:
    sub r4, r0, #1
    cmp r2, r4
    bge end_outer
    mov r3, #0
start_inner:
    sub r5, r4, r2
    cmp r3, r5
    bge end_inner
    add r0, r1, r3, LSL #2
    add r1, r0, #4
    bl sort2
    add r3, #1
    b start_inner
end_inner:
    add r2, #1
    b start_outer
end_outer:
    mov pc, lr
17

Example #6: Final Assembly Code bubble: mov r 2, #0 start_outer: sub r 4,

Example #6: Final Assembly Code
Without saving registers around the call:
bubble:
    mov r2, #0
start_outer:
    sub r4, r0, #1
    cmp r2, r4
    bge end_outer
    mov r3, #0
start_inner:
    sub r5, r4, r2
    cmp r3, r5
    bge end_inner
    add r0, r1, r3, LSL #2
    add r1, r0, #4
    bl sort2
    add r3, #1
    b start_inner
end_inner:
    add r2, #1
    b start_outer
end_outer:
    mov pc, lr
With registers saved and restored around the call:
bubble:
    mov r2, #0
start_outer:
    sub r4, r0, #1
    cmp r2, r4
    bge end_outer
    mov r3, #0
start_inner:
    sub r5, r4, r2
    cmp r3, r5
    bge end_inner
    stmfd sp!, {r0-r3, lr}
    add r0, r1, r3, LSL #2
    add r1, r0, #4
    bl sort2
    ldmfd sp!, {r0-r3, lr}
    add r3, #1
    b start_inner
end_inner:
    add r2, #1
    b start_outer
end_outer:
    mov pc, lr
18

Outline n n n Assembly Programming Assembly C Interface Peephole Optimization 19

Outline • Assembly Programming • Assembly-C Interface • Peephole Optimization 19

Generating Assembly Code from C n n In this course, we will be using

Generating Assembly Code from C
• In this course, we will be using the GNU ARM Toolchain.
• To compile a C program to assembly code:
  – arm-elf-gcc -S filename.c
  – When you compile a .c file, you get a .s file
  – This .s file contains the assembly language code
    · When assembled, this code can potentially be linked and loaded as an executable
• To display information from an object file:
  – arm-elf-objdump -S -r filename
20

Example #7: A Simple Program int a, b; int main() { a = 3;

Example #7: A Simple Program
int a, b;
int main() { a = 3; b = 4; } /* end main() */
Generated assembly:
    .file "example4.c"
    .text
    .align 2
    .global main
    .type main, %function
main:
    mov ip, sp
    stmfd sp!, {fp, ip, lr, pc}
    sub fp, ip, #4
    ldr r2, .L3
    mov r3, #3
    str r3, [r2, #0]
    ldr r2, .L3+4
    mov r3, #4
    str r3, [r2, #0]
    ldmfd sp, {fp, sp, pc}
.L4:
    .align 2
.L3:
    .word a          <- Loader will put addresses of a and b in this memory location
    .word b
    .size main, .-main
    .comm a,4,4      <- Declare storage for a and b
    .comm b,4,4
    .ident "GCC: (GNU) 4.0.0"
21

Example #7: Object File example 1. o: file format elf 32 littlearm Disassembly of

Example #7: Object File
example1.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <main>:
   0: e1a0c00d  mov ip, sp
   4: e92dd800  stmdb sp!, {fp, ip, lr, pc}
   8: e24cb004  sub fp, ip, #4 ; 0x4
   c: e59f2014  ldr r2, [pc, #20] ; 28 <.text+0x28>
  10: e3a03003  mov r3, #3 ; 0x3
  14: e5823000  str r3, [r2]
  18: e59f200c  ldr r2, [pc, #12] ; 2c <.text+0x2c>
  1c: e3a03004  mov r3, #4 ; 0x4
  20: e5823000  str r3, [r2]
  24: e89da800  ldmia sp, {fp, sp, pc}
  ...
  28: R_ARM_ABS32 a
  2c: R_ARM_ABS32 b
22

Example #7: Executable File 00008208 <main>: 8208: e 1 a 0 c 00 d

Example #7: Executable File
00008208 <main>:
  8208: e1a0c00d  mov ip, sp
  820c: e92dd800  stmdb sp!, {fp, ip, lr, pc}
  8210: e24cb004  sub fp, ip, #4 ; 0x4
  8214: e59f2014  ldr r2, [pc, #20] ; 8230 <.text+0x210>
  8218: e3a03003  mov r3, #3 ; 0x3
  821c: e5823000  str r3, [r2]
  8220: e59f200c  ldr r2, [pc, #12] ; 8234 <.text+0x214>
  8224: e3a03004  mov r3, #4 ; 0x4
  8228: e5823000  str r3, [r2]
  822c: e89da800  ldmia sp, {fp, sp, pc}
  8230: 0000adc4  andeq sl, r0, r4, asr #27
  8234: 0000adc0  andeq sl, r0, r0, asr #27
23

Example #8: Calling A Function int tmp; void swap(int a, int b); int main()

Example #8: Calling A Function int tmp; void swap(int a, int b); int main() { int a, b; a = 3; b = 4; swap(a, b); } /* end main() */ void swap(int a, int b) { tmp = a; a = b; b = tmp; } /* end swap() */ 24

Example #8: Assembly Listing main: mov stmfd pc} sub mov str ldr bl sub

Example #8: Assembly Listing main: mov stmfd pc} sub mov str ldr bl sub ldmfd swap: ip, sp sp!, {fp, ip, lr, fp, ip, #4 sp, #8 r 3, #3 r 3, [fp, # 20] r 3, #4 r 3, [fp, # 16] r 0, [fp, # 20] r 1, [fp, # 16] swap sp, fp, #12 sp, {fp, sp, pc} mov stmfd pc} sub str ldr ldr str sub ldmfd. L 6: . align. L 5: . word. comm ip, sp sp!, {fp, ip, lr, fp, sp, r 0, r 1, r 2, r 3, r 3, sp, ip, #4 sp, #8 [fp, # 16] [fp, # 20]. L 5 [fp, # 16] [r 2, #0] [fp, # 20] [fp, # 16]. L 5 [r 3, #0] [fp, # 20] fp, #12 {fp, sp, pc} 2 tmp, 4, 4 25

Example #9: Manipulating Pointers int tmp; int *pa, *pb; void swap(int a, int b);

Example #9: Manipulating Pointers int tmp; int *pa, *pb; void swap(int a, int b); int main() { int a, b; pa = &a; pb = &b; *pa = 3; *pb = 4; swap(*pa, *pb); } /* end main() */ void swap(int a, int b) { tmp = a; a = b; b = tmp; } /* end swap() */ 26

Example #9: Assembly Listing main: mov stmfd pc} sub ldr sub str ldr ldr

Example #9: Assembly Listing main: mov stmfd pc} sub ldr sub str ldr ldr mov str ldr ldr ldr mov bl sub ldmfd ip, sp sp!, {fp, ip, lr, fp, sp, r 2, r 3, r 3, r 2, r 3, ip, #4 sp, #8. L 3 fp, #16 [r 2, #0]. L 3+4 fp, #20 [r 2, #0]. L 3 [r 3, #0] #3 [r 2, #0]. L 3+4 [r 3, #0] #4 [r 2, #0] r 3, . L 3 r 3, [r 3, #0] r 2, [r 3, #0] r 3, . L 3+4 r 3, [r 3, #0] r 0, r 2 r 1, r 3 swap sp, fp, #12 sp, {fp, sp, pc} . L 4: . align 2 . word pa pb . L 3: 27

Example #10: Dealing with struct typedef struct test. Struct { unsigned int a; unsigned

Example #10: Dealing with struct
typedef struct testStruct { unsigned int a; unsigned int b; char c; } testStruct;
testStruct *ptest;
int main() { ptest->a = 4; ptest->b = 10; ptest->c = 'A'; } /* end main() */
Generated assembly:
main:
    mov ip, sp
    stmfd sp!, {fp, ip, lr, pc}
    sub fp, ip, #4
    ldr r3, .L3
    ldr r2, [r3, #0]
    mov r3, #4
    str r3, [r2, #0]
    ldr r3, .L3
    ldr r2, [r3, #0]
    mov r3, #10
    str r3, [r2, #4]
    ldr r3, .L3
    ldr r2, [r3, #0]
    mov r3, #65
    strb r3, [r2, #8]
    ldmfd sp, {fp, sp, pc}
.L4:
    .align 2
.L3:
    .word ptest
28

Example #11: Passing Arguments int tmp; void test(int a, int b, int c, int

Example #11: Passing Arguments int tmp; void test(int a, int b, int c, int d, int *e); int main() { int a, b, c, d, e; a = 3; b = 4; c = 5; d = 6; e = 7; test(a, b, c, d, &e); } /* end main() */ void test(int a, int b, int c, int d, int *e) { tmp = a; a = b; b = tmp; c = b; b = d; *e = d; } /* end test() */ 29

Example #11: Assembly Listing 1 main: mov stmfd pc} sub mov str mov str

Example #11: Assembly Listing 1 main: mov stmfd pc} sub mov str mov str ip, sp sp!, {fp, ip, lr, fp, sp, r 3, r 3, r 3, ip, #4 sp, #24 #3 [fp, # 28] #4 [fp, # 24] #5 [fp, # 20] #6 [fp, # 16] #7 [fp, # 32] sub str ldr ldr bl sub ldmfd r 3, fp, #32 r 3, [sp, #0] r 0, [fp, # 28] r 1, [fp, # 24] r 2, [fp, # 20] r 3, [fp, # 16] test sp, fp, #12 sp, {fp, sp, pc} 30

Example #11: Assembly Listing 2 test: mov stmfd pc} sub str str ldr ldr

Example #11: Assembly Listing 2 test: mov stmfd pc} sub str str ldr ldr str ip, sp sp!, {fp, ip, lr, fp, sp, r 0, r 1, r 2, r 3, r 3, ip, #4 sp, #16 [fp, # 16] [fp, # 20] [fp, # 24] [fp, # 28]. L 5 [fp, # 16] [r 2, #0] [fp, # 20] [fp, # 16]. L 5 [r 3, #0] [fp, # 20] ldr str sub ldmfd r 3, r 2, r 3, sp, . align 2 . word tmp [fp, # 20] [fp, # 24] [fp, # 28] [fp, # 20] [fp, #4] [fp, # 28] [r 2, #0] fp, #12 {fp, sp, pc} . L 6: . L 5: 31

Interfacing C and Assembly n ARM has developed a standard called the “ARM Procedure

Interfacing C and Assembly
• ARM has developed a standard called the "ARM Procedure Call Standard" (APCS) which defines:
  – constraints on the use of registers
  – stack conventions
  – format of a stack backtrace data structure
  – argument passing and result return
  – support for ARM shared library mechanism
• Compiler-generated code conforms to the APCS
  – It's just a standard, not an architectural requirement
  – Cannot avoid the standard when interfacing C and assembly code
  – Can avoid the standard when just writing assembly code or when writing assembly code that isn't called by C code
32

Register Names and Use Register # R 0 a 1 R 1 a 2

Register Names and Use
Register # | APCS Name | APCS Role
R0         | a1        | argument 1
R1         | a2        | argument 2
R2         | a3        | argument 3
R3         | a4        | argument 4
R4..R8     | v1..v5    | register variables
R9         | sb/v6     | static base/register variable
R10        | sl/v7     | stack limit/register variable
R11        | fp        | frame pointer
R12        | ip        | scratch reg/new-sb in inter-link-unit calls
R13        | sp        | low end of current stack frame
R14        | lr        | link address/scratch register
R15        | pc        | program counter
33

How Does STM Work on Memory ? STM sp!, {r 0 r 15} l

How Does STM Work on Memory?
STM sp!, {r0-r15}
• The ARM processor uses a bit vector to represent each register to be saved
• The architecture places the lowest number register into the lowest address
• Default STM == STMDB == STMFD
Stack diagram (SPbefore at the top, SPafter at the bottom), contents from high to low address: pc lr sp ip fp v7 v6 v5 v4 v3 v2 v1 a4 a3 a2 a1
address: 0x90 0x8c 0x88 0x84 0x80 0x7c 0x78 0x74 0x70 0x6c 0x68 0x64 0x60 0x5c 0x58 0x54 0x50
34

Passing and Returning Structures n Structures are usually passed in registers (and overflow onto

Passing and Returning Structures
• Structures are usually passed in registers (and overflow onto the stack when necessary)
• When a function returns a struct, a pointer to where the struct result is to be placed is passed in a1 (first argument)
• Example: struct s f(int x); is compiled as void f(struct s *result, int x);
35

Example #12: Passing Structures typedef struct two_ch_struct{ char ch 1; char ch 2; }

Example #12: Passing Structures typedef struct two_ch_struct{ char ch 1; char ch 2; } two_ch; two_ch max(two_ch a, two_ch b){ return((a. ch 1 > b. ch 1)? a: b); } /* end max() */ max: mov stmfd pc} sub str str ldrb cmp bls ldr str b. L 2: ldr str. L 1: ldr sub ldmfd ip, sp sp!, {fp, ip, lr, fp, sp, r 0, r 1, r 2, r 3, r 2, . L 2 r 3, r 2, r 3, . L 1 ip, #4 sp, #12 [fp, # 24] [fp, # 16] [fp, # 20] r 3 [fp, # 16] [fp, # 24] [r 2, #0] r 3, [fp, # 20] r 2, [fp, # 24] r 3, [r 2, #0] r 0, [fp, # 24] sp, fp, #12 sp, {fp, sp, pc} 36

The Frame Pointer • Frame pointer (fp) points to the top of stack for

The Frame Pointer
• Frame pointer (fp) points to the top of stack for function
• By using the frame pointer and storing it at the same offset for every function call, it creates a singly linked list of activation records
foo:
    mov ip, sp
    stmfd sp!, {a1-a3, fp, ip, lr, pc}
    sub fp, ip, #4
    <computations go here>
    sub fp, #12
    ldmfd fp, {fp, sp, pc}
Stack diagram: pointer labels ip, fp, sp; contents from high to low address: pc lr ip fp a3 a2 a1
address: 0x90 0x8c 0x88 0x84 0x80 0x7c 0x78 0x74 0x70
37

Backtrace n n n The fp register points to the stack backtrace structure for

Backtrace n n n The fp register points to the stack backtrace structure for the currently executing function. The saved fp value is (zero or) a pointer to a stack backtrace structure created by the function which called the current function. The saved fp value in this structure is a pointer to the stack backtrace structure for the function that called the current function; and so on back until the first function. 38

Creating the “Backtrace” Structure IPcurrent MOV STMFD SUB … … sub LDMFD SPbefore FPafter

Creating the “Backtrace” Structure IPcurrent MOV STMFD SUB … … sub LDMFD SPbefore FPafter ip, sp sp!, {a 1 a 4, v 1 v 7, fp, ip, sp, lr, pc} fp, ip, #4 fp, #16 fp, {fp, sb, pc} SPcurrent address 0 x 90 (saved) pc 0 x 8 c (saved) lr 0 x 88 (saved) sp 0 x 84 (saved) ip 0 x 80 (saved) fp 0 x 7 c v 7 0 x 78 0 x 74 v 6 0 x 70 v 5 0 x 6 c v 4 0 x 68 v 3 0 x 64 v 2 0 x 60 v 1 0 x 5 c a 4 0 x 58 a 3 0 x 54 a 2 0 x 50 a 1 39

Example Backtrace bar’s frame fp (saved) pc (saved) lr (saved) sp (saved) ip (saved)

Example Backtrace bar’s frame fp (saved) pc (saved) lr (saved) sp (saved) ip (saved) fp v 7 v 6 v 5 v 4 v 3 v 2 v 1 a 4 a 3 a 2 a 1 foo’s frame (saved) pc (saved) lr (saved) sp (saved) ip (saved) fp v 7 v 6 v 5 v 4 v 3 v 2 v 1 a 4 a 3 a 2 a 1 main’s frame (saved) pc (saved) lr (saved) sp (saved) ip (saved) fp v 7 v 6 v 5 v 4 v 3 v 2 v 1 a 4 a 3 a 2 a 1 40

Exercise #1 n Write an assembly subroutine that implements the quicksort algorithm to sort

Exercise #1
• Write an assembly subroutine that implements the quicksort algorithm to sort a list of unsigned integer values.
  – The first entry in the list is the list's length.
  – void quickSort(unsigned int *list);
Input  list: 0x00000005 0xA356A101 0xE235C203 0x7A35B310 0x09250037 0x29567322
Output list: 0x00000005 0x09250037 0x29567322 0x7A35B310 0xA356A101 0xE235C203
41

Exercise #2 n Write an assembly subroutine that deletes an item from an ordered

Exercise #2
• Write an assembly subroutine that deletes an item from an ordered list of unsigned values if it is present in the list.
  – The first entry in the list is the list's length.
  – void removeItem(unsigned int item, unsigned int *list);
Input  item: 0x7A35B310
Input  list: 0x00000005 0x09250037 0x29567322 0x7A35B310 0xA356A101 0xE235C203
Output list: 0x00000004 0x09250037 0x29567322 0xA356A101 0xE235C203
42

Outline n n n Assembly Programming Assembly C Interface Peephole Optimization 43

Outline • Assembly Programming • Assembly-C Interface • Peephole Optimization 43

Peephole Optimization n Final pass over generated code: l n See if an obvious

Peephole Optimization
• Final pass over generated code:
  – Examine a few consecutive instructions: 2 to 4
  – See if an obvious replacement is possible: store/load pairs
        MOV %eax => mema
        MOV mema => %eax
    Can eliminate the second instruction without needing any global knowledge of mema
• Use algebraic identities
• Special case individual instructions
44

Algebraic Identities n Worth recognizing single instructions with a constant operand: l l n

Algebraic Identities
• Worth recognizing single instructions with a constant operand:
  – A * 2 = A + A
  – A * 1 = A
  – A * 0 = 0
  – A / 1 = A
• More delicate with floating point
45

Is this ever helpful? n n n Why would anyone write X * 1?

Is this ever helpful?
• Why would anyone write X * 1?
• Why bother to correct such obvious junk code?
• In fact one might write
      #define MAX_TASKS 1
      ...
      a = b * MAX_TASKS;
• Also, seemingly redundant code can be produced by other optimizations. This is an important effect.
46

Replace Multiply by Shift n A : = A * 4; l l n

Replace Multiply by Shift
• A := A * 4;
  – Can be replaced by a 2-bit left shift (signed/unsigned)
  – But must worry about overflow if the language does
• A := A / 4;
  – If unsigned, can replace with shift right
  – But shift right arithmetic is a well-known problem
  – Language may allow it anyway (traditional C)
47

Addition Chains for Multiplication n If multiply is very slow (or on a machine

Addition Chains for Multiplication n If multiply is very slow (or on a machine with no multiply instruction like the original SPARC), decomposing a constant operand into sum of powers of two can be effective: l l l X * 125 = x * 128 – x * 4 + x Two shifts, one subtract and one add, which may be faster than one multiply Note similarity with efficient exponentiation method 48

The Right Shift Problem n Arithmetic Right shift: l l l Shift right and

The Right Shift Problem
• Arithmetic right shift:
  – Shift right and use sign bit to fill most significant bits
  – -5 = 111111...1111111011; SAR gives 111111...1111111101, which is -3, not -2
  – In most languages -5/2 = -2
• Prior to C99, implementations were allowed to truncate towards or away from zero if either operand was negative
49

Folding Jumps to Jumps n A jump to an unconditional jump can copy the

Folding Jumps to Jumps
• A jump to an unconditional jump can copy the target address
            JNE lab1
            ...
      lab1: JMP lab2
  – Can be replaced by JNE lab2
  – As a result, lab1 may become dead (unreferenced)
50

Jump to Return n A jump to a return can be replaced by a

Jump to Return
• A jump to a return can be replaced by a return
            JMP lab1
            ...
      lab1: RET
  – Can be replaced by RET
  – lab1 may become dead code
51

Tail Recursion Elimination 1 n A subprogram is tail recursive if the last computation

Tail Recursion Elimination 1
• A subprogram is tail recursive if the last computation is a call to itself:
      function last (lis : list_type) return list_type is
      begin
         if lis.next = null then
            return lis;
         else
            return last (lis.next);
         end if;
      end;
  – Recursive call can be replaced with
      lis := lis.next;
      goto start;   -- start is an added label
52

Tail Recursion Elimination 2 n n n Saves time: an assignment and jump is

Tail Recursion Elimination 2 n n n Saves time: an assignment and jump is faster than a call with one parameter Saves stack space: converts linear stack usage to constant usage. In languages with no loops, this may be a required optimization: specified in Scheme standard. 53

Tail Recursion Elimination 3 n Consider the sequence on the x 86: l l

Tail Recursion Elimination 3
• Consider the sequence on the x86:
      CALL func
      RET
  – CALL pushes return point on stack, RET in body of func removes it, RET in caller returns
  – Can generate instead:
      JMP func
  – Now RET in func returns to original caller, because there is a single return address on the stack
54

The REALIA COBOL Compiler 1 n n n Full compiler for Standard COBOL, targeted

The REALIA COBOL Compiler 1 n n n Full compiler for Standard COBOL, targeted to the IBM PC. Now distributed by Computer Associates Runs in 150 K bytes, but must be able to handle very large programs that run on mainframes 55

The REALIA COBOL Compiler 2 n n No global optimization possible: multiple linear passes

The REALIA COBOL Compiler 2
• No global optimization possible: multiple linear passes over code, no global data structures, no flow graph.
• Multiple peephole optimizations; compiler iterates until code is stable. Each pass scans code backwards to minimize address recomputations.
56

Typical COBOL Code Process Balance. if Balance is negative then perform Send Bill else

Typical COBOL Code
Process-Balance.
    if Balance is negative then
        perform Send-Bill
    else
        perform Record-Credit
    end-if.
Send-Bill. ...
Record-Credit. ...
57

Simple Assembly Pb: jnl call jmp L 1: L 2: Sb: ret Rc: ret

Simple Assembly
Pb: cmp  balance, 0
    jnl  L1
    call Sb
    jmp  L2        <- jump to return
L1: call Rc
L2: ret
Sb: …
    ret
Rc: …
    ret
58

Fold Jump to Return Statement Pb: jnl call ret L 1: L 2: Sb:

Fold Jump to Return Statement
Pb: cmp  balance, 0
    jnl  L1
    call Sb        <- tail recursion
    ret            <- folded
L1: call Rc        <- tail recursion
L2: ret
Sb: …
    ret
Rc: …
    ret
59

Eliminate Tail Recursion Pb: jnl jmp ret L 1: L 2: Sb: ret Rc:

Eliminate Tail Recursion
Pb: cmp  balance, 0
    jnl  L1        <- jump to unconditional jump
    jmp  Sb
    ret
L1: jmp  Rc
L2: ret            <- will become useless
Sb: …
    ret
Rc: …
    ret
60

Corresponding Assembly Pb: jnl jmp ret L 1: L 2: Sb: ret Rc: ret

Corresponding Assembly
Pb: cmp  balance, 0
    jnl  Rc        <- folded
    jmp  Sb
    ret
L1: jmp  Rc        <- unreachable
L2: ret
Sb: …
    ret
Rc: …
    ret
61

Remove Dead Code Pb: jnl jmp Sb: ret Rc: ret cmp Rc Sb …

Remove Dead Code
Pb: cmp  balance, 0
    jnl  Rc
    jmp  Sb        <- jump to next instruction
Sb: …
    ret
Rc: …
    ret
62

Final Code Pb: jnl Sb: ret Rc: ret l l l cmp Rc …

Final Code
Pb: cmp  balance, 0
    jnl  Rc
Sb: …
    ret
Rc: …
    ret
• Final code as efficient as inlining.
• All transformations are local.
• Each optimization may yield further optimization opportunities. Iterate till no further change.
63

Arcane Tricks n Consider typical maximum computation l if A >= B then C

Arcane Tricks
• Consider typical maximum computation
      if A >= B then
         C := A;
      else
         C := B;
      end if;
• For simplicity assume all unsigned, and all in registers
64

Eliminating Max Jump on x 86 n Simple minded assembly code l CMP A,

Eliminating Max Jump on x86
• Simple-minded assembly code
          CMP  A, B
          JNAE L1
          MOV  A=>C
          JMP  L2
      L1: MOV  B=>C
      L2:
• One jump in either case
65

Computing Max without Jumps n Architecture specific trick: use subtract with borrow instruction and

Computing Max without Jumps
• Architecture-specific trick: use subtract-with-borrow instruction and carry flag
      CMP A, B          ; CF=1 if B > A, CF = 0 if A >= B
      SBB %eax, %eax    ; all 1's if B > A, all 0's if A >= B
      MOV %eax, C
      NOT C             ; all 0's if B > A, all 1's if A >= B
      AND B=>%eax       ; B if B>A, 0 if A>=B
      AND A=>C          ; 0 if B >A, A if A>=B
      OR %eax=>C        ; B if B>A, A if A>=B
• More instructions, but NO JUMPS
• Supercompiler: exhaustive search of instruction patterns to uncover similar tricks
66