File: joshi.html

package info (click to toggle)
lg-issue71 1-2
  • links: PTS
  • area: main
  • in suites: woody
  • size: 2,104 kB
  • ctags: 218
  • sloc: perl: 89; makefile: 58; sh: 45; tcl: 14
file content (869 lines) | stat: -rw-r--r-- 40,973 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
<!-- saved from url=(0022)http://internet.e-mail -->
<!--startcut  ==============================================-->
<!-- *** BEGIN HTML header *** -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML><HEAD>
<title>Code Optimization Using the GNU C Compiler LG #71</title>
</HEAD>
<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#0000FF" VLINK="#0000AF"
ALINK="#FF0000">
<!-- *** END HTML header *** -->

<CENTER>
<A HREF="http://www.linuxgazette.com/">
<IMG ALT="LINUX GAZETTE" SRC="../gx/lglogo.png" 
    WIDTH="600" HEIGHT="124" border="0"></A> 
<BR>

<!-- *** BEGIN navbar *** -->
<IMG ALT="" SRC="../gx/navbar/left.jpg" WIDTH="14" HEIGHT="45" BORDER="0" ALIGN="bottom"><A HREF="arndt.html"><IMG ALT="[ Prev ]" SRC="../gx/navbar/prev.jpg" WIDTH="16" HEIGHT="45" BORDER="0" ALIGN="bottom"></A><A HREF="lg_toc71.html"><IMG ALT="[ Table of Contents ]" SRC="../gx/navbar/toc.jpg" WIDTH="220" HEIGHT="45" BORDER="0" ALIGN="bottom" ></A><A HREF="../lg_frontpage.html"><IMG ALT="[ Front Page ]" SRC="../gx/navbar/frontpage.jpg" WIDTH="137" HEIGHT="45" BORDER="0" ALIGN="bottom"></A><A HREF="http://www.linuxgazette.com/cgi-bin/talkback/all.py?site=LG&article=http://www.linuxgazette.com/issue71/joshi.html"><IMG ALT="[ Talkback ]" SRC="../gx/navbar/talkback.jpg" WIDTH="121" HEIGHT="45" BORDER="0" ALIGN="bottom"  ></A><A HREF="../lg_faq.html"><IMG ALT="[ FAQ ]" SRC="./../gx/navbar/faq.jpg"WIDTH="62" HEIGHT="45" BORDER="0" ALIGN="bottom"></A><A HREF="orr.html"><IMG ALT="[ Next ]" SRC="../gx/navbar/next.jpg" WIDTH="15" HEIGHT="45" BORDER="0" ALIGN="bottom"  ></A><IMG ALT="" SRC="../gx/navbar/right.jpg" WIDTH="15" HEIGHT="45" ALIGN="bottom">
<!-- *** END navbar *** -->
<P>
</CENTER>

<!--endcut ============================================================-->

<H4 ALIGN="center">
"Linux Gazette...<I>making Linux just a little more fun!</I>"
</H4>

<P> <HR> <P> 
<!--===================================================================-->

<center>
<H1><font color="maroon">Code Optimization Using the GNU C Compiler</font></H1>
<H4>By <a href="mailto:rahulj@veritas.com">Rahul U Joshi</a></H4>
</center>
<P> <HR> <P>  

<!-- END header -->




<p>
<i>
This article describes some of the code optimization techniques used by the
GNU C Compiler, in order to give the reader a feel of what code optimization is
and how it can increase the efficiency of the generated object code.
</i></p>

<h2>Contents</h2>
<OL>
<LI><a href="#sect1">Introduction</a>
<LI><a href="#sect2">Assembly Language Code for a C Program</a>
<LI><a href="#sect3">Constant Folding</a>
<LI><a href="#sect4">Common Subexpression Elimination</a>
<LI><a href="#sect5">Dead Code Elimination</a>
<LI><a href="#sect6">Strength Reduction using Induction Variable</a>
<LI><a href="#sect7">Conclusion</a>
<LI><a href="#sect8">Acknowledgments</a>
<LI><a href="#sect9">References</a>
</OL>

<h2><A NAME="sect1">1. Introduction</A></h2>
<p>
    As we all know, a compiler is a program that reads the source program in
a high-level language and translates it into (typically) machine language. This
is a complicated process involving a number of stages. If the compiler is
an <i>optimizing compiler,</i> one of these stages "optimizes" the 
machine language code so that it either takes less time to run or occupies
less memory or sometimes both. Of course, whatever optimizations the compiler
does, it must not affect the logic of the program i.e. the optimization must
preserve the meaning of the program. One might wonder what type of optimizations
the compiler uses to produce efficient machine code? Since in no case the 
meaning of the program being compiled should be changed, the compiler must
inspect the program very thoroughly and find out the suitable optimizations
that can be applied. As we may wonder, such a thorough analysis of the program
and then finding and applying suitable optimizations is a complex and time
consuming process, the details of which are beyond the scope of this article.
</p>

<p>
    What I am going to describe in this article are a few type of code
optimizations that the GNU C Compiler uses so that we can understand how the
code is optimized and appreciate the complexity of the optimization process.
The GNU C Compiler is a sophisticated optimizing C compiler that uses a large
array of optimization techniques to produce efficient code. It may not be
possible to describe all of them, so I have chosen some that are interesting
and also easy to understand. A complete list of the optimization techniques
that are used by the GNU C compiler is available at <a href="http://www.redhat.com/products/support/gnupro/gnupro_gcc.html">http://www.redhat.com/products/support/gnupro/gnupro_gcc.html</a>.
Instead of simply describing what a particular optimization technique does,
I will describe them using the assembly language code that is generated by
the GNU C Compiler. This method will also enable the readers to further
explore code optimization carried out by the compiler and also the advanced
optimization techniques, in case they are interested.
</p>


<h2><A name="sect2">2. Assembly Language Code for a C Program</A></h2>
<p>
    The GNU C Compiler (called GCC henceforth) reads a C program source file and
translates it into an object program that contains the machine code for the
program in binary form. However, it can also produce assembly language source
code for the program instead of the object code, so that we can read it and
understand how the assembly language program looks. We will be generating such
assembly language files to see the optimizations being used by the compiler,
so it would be beneficial if we see how the assembly language code for a simple
C program looks like. However, also note that we need not understand <i>each</i>
assembly language statement in the generated code, so some statements that are
not crucial towards understanding the code optimization will not be explained
for simplicity. To generate the assembly language code, create a file
<tt>test1.c</tt> as shown below and give the following command:</p>

    <PRE>$ gcc -c -S test1.c</PRE>

<p>
This will generate a file <tt>test1.s</tt>, which has the assembly language listing of
the generated code for the C program. The file <tt>test1.c</tt> along with the assembly
language code is shown below.</p>

<PRE><TT>
 1 : /* test1.c */
 2 : /* This first file simply demonstrates how the assembly program that the
 3 :    compiler produces looks like and some peculiarities of the GNU 
 4 :    assembler that follows some different conventions from MASM/TASM.
 5 :  */
 6 : #include &lt;stdio.h&gt;
 7 : 
 8 : int main()
 9 : {
10 :     printf("Hello, World\n");
11 :     return 0;
12 : }
13 : 
14 : /* end test1.c */
15 : /* ----------------------------------------------------------------- */
16 : /* generated assembly language file */
17 :     .file   "test1.c"           /* some assembler directives to be */
18 :     .version    "01.01"         /* ignored */
19 : gcc2_compiled.:
20 : .section    .rodata             /* this segment has read-only data */
21 : .LC0:
22 :     .string "Hello, World\n"
23 : .text
24 :     .align 4
25 : .globl main
26 :     .type    main,@function
27 : main:                           /* main function begins */
28 :     pushl $.LC0                 /* push parameter for printf() on stack */
29 :     call printf                 /* call the function */
30 :     addl $4,%esp                /* clear the stack */
31 : 
32 :     xorl  %eax,%eax             /* make EAX = 0, functions use register */
33 :     jmp .L1                     /* EAX to return values */
34 :     .p2align 4,,7               /* this is an alignment directive */
35 : .L1:
36 :     ret                         /* return from main, done */
37 : .Lfe1:
38 :     /* other assembler directives to be ignored */
39 :     .size    main,.Lfe1-main        
40 :     .ident  "GCC: (GNU) egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)"
41 : /* end of the generated assembly language file */
42 : /* ---------------------------------------------------------------- */
</TT></PRE>

<p>
In case you know about the architecture
of the 80386 or higher microprocessors and have experience with assembly
language programming, you will find that the assembly language that is produced
is only partly familiar to you. This assembly language follows the AT&amp;T
assembler syntax that is different from the Intel/Microsoft
Assembler/Turbo Assembler syntax that most of us are probably familiar with.
Let us go through the assembly language code. We will ignore the assembler
directives as they are not crucial towards understanding the generated code.
On line 20 a read-only data segment has been defined with the string <tt>"Hello,
World\n"</tt> in it. A label <tt>.LC0</tt> has been assigned to the string. On line 27, the code for the <tt>main()</tt>
function begins. As we know, in C language the parameters to functions are
passed by pushing them on the stack, and the parameters are pushed in the
reverse order i.e. the last parameter is pushed first on the stack. In this
case, the <tt>printf()</tt> function is being called with a single parameter, the
string <tt>"Hello, World\n"</tt>. The statement <tt>pushl $.LC0</tt> pushes the address of the
string <tt>"Hello, World\n"</tt> on the stack. The "<tt>l</tt>" in <tt>pushl</tt> stands for "long" as we
are dealing with 32 bit variables. In all the assembly language programs that
we will see, the mnemonics will be followed by an "<tt>l</tt>" to indicate that we are
dealing with 32 bit variables. The "<tt>$</tt>" preceding <tt>.LC0</tt> means the address of the
string. The next statement calls the <tt>printf()</tt> function. After the <tt>printf()</tt>
function finishes execution, we need to cleanup the stack, so we need to add 4
to ESP. (Why 4? That's because we pushed 4 bytes on the stack before calling
<tt>printf()</tt>.)
To do so, we would normally write, <tt>ADD ESP, 4</tt>. The Intel convention is
<tt>&lt;instruction&gt; dest, src</tt>. However, the AT&amp;T convention is <tt>&lt;instruction&gt; src, dest</tt>, so you can
see that the instruction on line 30 is <tt>addl $4, %esp</tt>. Immediate operands like
4 are prefixed by a $ and register names are prefixed by a %. This convention
was followed to maintain compatibility with the BSD assembler. The next
statement XOR's EAX with itself so that EAX = 0 afterwards. This is because
the return value from a function is stored in the EAX register. After that,
you will see a jump instruction and an alignment directive. These will not be
explained and it will suffice to know that <tt>main()</tt> function will return back at
this point. Now that we understand how the assembly language code looks like,
let us move towards the optimizations.</p>

<h2><A name="sect3">3. Constant Folding</a></h2>
<p>
    Constant folding is the simplest code optimization to understand. Let
us suppose that you write the statement <tt>x = 45 * 88;</tt> in your C program. A non-optimizing
compiler will generate code to multiply 45 by 88 and store the
value in x. An optimizing compiler will detect that both 45 and 88 are
constants, so their product will also be a constant. Hence it will find 45 *
88 = 3960 and generate code that simply copies 3960 into x. This is 
<i>constant folding</i>, and means the calculation is done just once, at
compile time, rather than every time the program is run. To illustrate this,
create the file <tt>test2.c</tt> as shown below. Generate the assembly code
for this program as was shown earlier. Since by default, GCC does not
optimize the program, you will see that the assembly code is a
straightforward translation of the C program (Lines 18 to 62).</p>

<PRE><TT>
 1 : /* test2.c */
 2 : /* Demonstration of constant propagation */
 3 : #include &lt;stdio.h&gt;
 4 : 
 5 : int main()
 6 : {
 7 :     int x, y, z;
 8 :     x = 10;
 9 :     y = x + 45;
10 :     z = y + 4;
11 :     printf("The value of z = %d", z);
12 :     return 0;
13 : }
14 : /* end of test2.c */
15 : 
16 : /* ---------------------------------------------------------------- */
17 : /* assembly language file without any optimizations */
18 :     .file   "test2.c"
19 :     .version    "01.01"
20 : gcc2_compiled.:
21 : .section    .rodata
22 : .LC0:
23 :     .string "The value of z = %d"
24 : .text
25 :     .align 4
26 : .globl main
27 :     .type    main,@function
28 : main:
29 :     pushl %ebp             /* save EBP register on stack */
30 :     movl %esp,%ebp         /* EBP = ESP */
31 :     subl $12,%esp          /* Create stack frame. 3 variables x 4 bytes */
32 : 
33 :     /* x = 10; */
34 :     movl $10,-4(%ebp)      /* x = 10. x is topmost on the stack */
35 : 
36 :     /* y = x + 45; */
37 :     movl -4(%ebp),%edx     /* EDX = x */
38 :     addl $45,%edx          /* EDX = EDX + 45 */
39 :     movl %edx,-8(%ebp)     /* y = EDX. y is second from top of stack */
40 : 
41 :     /* z = y + 4 */
42 :     movl -8(%ebp),%edx     /* EDX = y */
43 :     addl $4,%edx           /* EDX = EDX + 4 */
44 :     movl %edx,-12(%ebp)    /* z = EDX. z is third from top of stack */
45 : 
46 :     /* printf("The value of z = ", z); */
47 :     movl -12(%ebp),%eax    /* EAX = z */
48 :     pushl %eax             /* push EAX(=z) as first parameter of printf */
49 :     pushl $.LC0            /* second parameter for printf */
50 :     call printf                 
51 :     addl $8,%esp           /* clear stack */
52 : 
53 :     /* return 0; */
54 :     xorl %eax,%eax         /* for return 0 */
55 :     jmp .L1
56 :     .p2align 4,,7
57 : .L1:
58 :     leave
59 :     ret
60 : .Lfe1:
61 :     .size    main,.Lfe1-main
62 :     .ident  "GCC: (GNU) egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)"
63 : /* end of assembly language code */
64 : /* ---------------------------------------------------------------- */
65 : 
66 : /* ---------------------------------------------------------------- */
67 : /* generated assembly language code after optimization */
68 : 
69 :     .file   "test2.c"
70 :     .version    "01.01"
71 : gcc2_compiled.:
72 : .section    .rodata
73 : .LC0:
74 :     .string "The value of z = %d"
75 : .text
76 :     .align 4
77 : .globl main
78 :     .type    main,@function
79 : main:
80 :     pushl %ebp             /* Save EBP register on stack */
81 :     movl %esp,%ebp         /* EBP = ESP */
82 : 
83 :     /* by constant propagation, z will always be 59 */
84 :     /* printf("The value of z = %d", z); */
85 :     pushl $59              /* first printf parameter */
86 :     pushl $.LC0            /* second printf parameter */
87 :     call printf
88 :                            /* no need of cleanup, we are exiting anyway */
89 :     /* return 0; */
90 :     xorl %eax,%eax              
91 :     leave
92 :     ret
93 : .Lfe1:
94 :     .size    main,.Lfe1-main
95 :     .ident  "GCC: (GNU) egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)"
96 : /* end of assembly language code */
97 : /* ----------------------------------------------------------------- */
</TT></PRE>

<p>
    There are some new things in the assembly code that we need to
understand. First of all, the <tt>main()</tt> function defines 3 local variables for
whom space will be allocated on the stack. Also, to access these variables, we
will be using the indexed addressing mode and we will be using EBP for that.
So the first statement saves the current value of EBP onto the stack. Then
the stack pointer ESP is copied into EBP (note that the source ESP is first)
so that EBP can be used for accessing the local variables. Finally, we need
to create space for three 4-byte variables on the stack, so we subtract 12
from ESP; i.e., we grow the stack by 12 bytes. <tt>x</tt> will be the
bottom-most stack element and at an offset of -4 from EBP. See the diagram
below.</p>

<pre><tt>
             EBP-----> ---------------
                       |      x      |       4 Bytes
                       ---------------
                       |      y      |       4 Bytes
                       ---------------
                       |      z      |       4 Bytes
             ESP-----> ---------------
                       |             |
                       |             |
                   ^   |             |
                   |         ...
Address increases  |   |             |
as we go up        | 0 ---------------  
</tt></pre>

<p>
Similarly, the offset of y from EBP is -8 and that of z is -12. Now consider the
assignment <tt>x = 10;</tt>. The statement to implement it on line 34 is 
<tt>movl $10, -4(%ebp)</tt>. The indexed addressing mode is written as 
<tt>&lt;offset&gt;(&lt;base register&gt;)</tt>. Thus the above statement will copy the constant 10
to the address (EBP - 4) i.e. the address of <tt>x</tt> on the stack. In a similar
manner, <tt>-8(%ebp)</tt> is used to access <tt>y</tt> and <tt>-12(%ebp)</tt> is used to access <tt>z</tt>. Lines
37, 38 and 39 evaluate <tt>x + 45</tt> and assign the result to <tt>y</tt>. To do so, the value
of <tt>x</tt> is first copied into the EDX register, 45 is added to it and the result
which is in EDX is copied to <tt>y</tt>. The code for <tt>z = y + 4</tt> is similar. In line 47
the parameters for <tt>printf()</tt> are pushed on the stack. The last parameter (<tt>z</tt>) is
pushed first and then the address of the string is pushed. In line 51, we
cleanup the stack by adding 8 to ESP. I guess that since we pushed two 4-byte
parameters, we had to add 8 to ESP. In the first example, we pushed only one
parameter and hence we had to add just 4 to ESP. The rest of the code is easy
to understand.</p>

<p>
    Now if we look at the above program, when we evaluate <tt>x + 45</tt>, the
value of <tt>x</tt> is always going to be 10 because of the assignment <tt>x = 10</tt>.
Therefore <tt>x + 45</tt> will always evaluate to 55. So the statement always assigns
55 to <tt>y</tt>. Similarly, when evaluating <tt>y + 4</tt>, the result will always be 59. If
the optimization is enabled, the compiler will be in a position to detect
this. To enable optimization, use the -O2 option. Enter the following command:</p>

    <PRE>gcc -c -S -O2 test2.c</PRE>

<p>
The assembly code generated with optimization enabled is shown in lines 69 to
95. Notice that on line 85, the compiler directly pushes the constant 59 on
the stack followed by the address of the string and calls <tt>printf()</tt>. Thus, by
constant folding, the compiler evaluates the various expressions in the
program only once and plugs the final value into the generated code. One more
interesting thing to observe is that after constant folding, there is no need
of the variables <tt>x</tt>, <tt>y</tt> and <tt>z</tt>. Therefore no space for them is allocated on the
stack, thus reducing the memory requirement of the program. This brings out
the fact that one optimization may lead to another one. In the above case
constant folding lead to a decrease in run time (since all the computations
are done at compile time) and also to a decrease in the space requirements.</p>

<h2><a name="sect4">4. Common Subexpression Elimination</a></h2>
<p>
    Many a times, it happens that the same expression is evaluated at
different places in the same program and the values of the operands in the
expression do not change in between the two evaluations of that expression.
For example, the program may evaluate say <tt>a * b</tt> at the beginning and at the
end. If the values of <tt>a</tt> and <tt>b</tt> do not change in between these two evaluations
of <tt>a * b</tt>, then instead of evaluating <tt>a * b</tt> again at the end, we can save the
result of the evaluation of <tt>a * b</tt> at the beginning in some temporary variable
and use it at the end. This will help eliminate redundant computations in the
program. This optimization is called as <i>common subexpression elimination</i>.
Consider the following program that demonstrates it.</p>

<PRE><TT>
 1 : /* test3.c */
 2 : /* common subexpression elimination, and also of constant propagation */
 3 : #include &lt;stdio.h&gt;
 4 : 
 5 : int main()
 6 : {
 7 :     int a, b;
 8 :     int x, y, z;
 9 :     scanf("%d %d", &amp;a, &amp;b);
10 :     x = a * b;
11 : 
12 :     if(b &gt;= 4)
13 :     {
14 :         y = a * b;
15 :         z = 0;
16 :     }
17 :     else
18 :     {
19 :         z = a * b * 4;
20 :         y = 0;
21 :     }
22 : 
23 :     printf("x = %d, y = %d, z = %d\n", x, y, z);
24 :     return 0;
25 : }
26 : /* end of test3.c */    
27 : 
28 : /* ----------------------------------------------------------------- */
29 : /* generated unoptimized assembly language code */
30 :     .file   "test3.c"
31 :     .version    "01.01"
32 : gcc2_compiled.:
33 : .section    .rodata
34 : .LC0:
35 :     .string "%d %d"
36 : .LC1:
37 :     .string "x = %d, y = %d, z = %d\n"
38 : .text
39 :     .align 4
40 : .globl main
41 :     .type    main,@function
42 : main:
43 :     pushl %ebp                  /* save EBP */
44 :     movl %esp,%ebp              /* EBP = ESP */
45 :     subl $20,%esp               /* Create space for 5 variables */
46 : 
47 :     /* scanf("%d %d". &amp;a, &amp;b); */
48 :     leal -8(%ebp),%eax
49 :     pushl %eax                  /* push address of b on stack */
50 :     leal -4(%ebp),%eax
51 :     pushl %eax                  /* push address of a on stack */
52 :     pushl $.LC0                 /* push address of string */
53 :     call scanf
54 :     addl $12,%esp               /* stack cleanup after scanf */
55 : 
56 :     /* x = a * b; */
57 :     movl -4(%ebp),%eax          /* EAX = a */
58 :     imull -8(%ebp),%eax         /* EAX = EAX * b = a * b */
59 :     movl %eax,-12(%ebp)         /* x = EAX = a * b */
60 : 
61 :     /* if( b &gt;= 4)... */
62 :     cmpl $3,-8(%ebp)            /* compare b with 3 */
63 :     jle .L2                     /* else part at label .L2, if follows */
64 : 
65 :     /* y = a * b; */
66 :     movl -4(%ebp),%eax          /* EAX = a */
67 :     imull -8(%ebp),%eax         /* EAX = EAX * b = a * b */
68 :     movl %eax,-16(%ebp)         /* y = EAX = a * b */
69 :     /* z = 0; */
70 :     movl $0,-20(%ebp)
71 :     jmp .L3                     /* jump over the else part */
72 : 
73 :     .p2align 4,,7
74 : .L2:
75 :     /* else part begins here */
76 : 
77 :     /* z = a * b * 4; */
78 :     movl -4(%ebp),%eax          /* EAX = a */
79 :     imull -8(%ebp),%eax         /* EAX = EAX * b = a * b */
80 :     leal 0(,%eax,4),%edx        /* EDX = EAX*4 + 0 */
81 :     movl %edx,-20(%ebp)         /* z = EDX */
82 :     /* y = 0; */
83 :     movl $0,-16(%ebp)
84 : .L3:
85 :     /* if..else is over here */
86 : 
87 :     /* printf("x = %d, y = %d, z = %d\n", x, y, x); */
88 :     movl -20(%ebp),%eax
89 :     pushl %eax                  /* push address of z on stack */
90 :     movl -16(%ebp),%eax
91 :     pushl %eax                  /* push address of y on stack */
92 :     movl -12(%ebp),%eax
93 :     pushl %eax                  /* push address of x on stack */
94 :     pushl $.LC1                 /* address of string */
95 :     call printf
96 :     addl $16,%esp               /* stack cleanup after printf */
97 : 
98 :     /* return 0 */
99 :     xorl %eax,%eax
100 :     jmp .L1
101 :     .p2align 4,,7
102 : .L1:
103 :     leave
104 :     ret
105 : .Lfe1:
106 :     .size    main,.Lfe1-main
107 :     .ident  "GCC: (GNU) egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)"
108 : /* end of unoptimized assembly language code */
109 : /* --------------------------------------------------------------- */
110 : 
111 : /* --------------------------------------------------------------- */
112 : /* generated optimized assembly language code */
113 :     .file   "test3.c"
114 :     .version    "01.01"
115 : gcc2_compiled.:
116 : .section    .rodata
117 : .LC0:
118 :     .string "%d %d"
119 : .LC1:
120 :     .string "x = %d, y = %d, z = %d\n"
121 : .text
122 :     .align 4
123 : .globl main
124 :     .type    main,@function
125 : main:
126 :     pushl %ebp               /* save EBP */
127 :     movl %esp,%ebp           /* EBP = ESP */
128 :     subl $8,%esp             /* space for just 2 variables on stack */
129 : 
130 :     /* scanf("%d %d", &amp;a, &amp;b); */
131 :     leal -4(%ebp),%eax
132 :     pushl %eax               /* push address of b on stack */
133 :     leal -8(%ebp),%eax
134 :     pushl %eax               /* push address of a on stack */
135 :     pushl $.LC0              /* address of string */
136 :     call scanf
137 : 
138 :     /* x = a * b; */
139 :     movl -4(%ebp),%eax       /* EAX = b */
140 :     movl %eax,%edx           /* EDX = EAX = b */
141 :     imull -8(%ebp),%edx      /* EDX = EDX * a = b * a = a * b */
142 : 
143 :     addl $12,%esp            /* delayed stack cleanup */
144 :     /* if( b >= 4).... */
145 :     cmpl $3,%eax             /* compare EAX = b with 3 */
146 :     jle .L17                 /* else part from label .L17 */
147 : 
148 :                              /* y stored in ECX, z in EAX, x in EDX */   
149 :     /* y = a * b; */
150 :     movl %edx,%ecx
151 :     /* z = 0; */
152 :     xorl %eax,%eax
153 :     jmp .L18                 /* jump over the else part */
154 :     .p2align 4,,7
155 : .L17:
156 :     /* z = a * b * 4; */
157 :     leal 0(,%edx,4),%eax     /* LEA EAX, [EDX*4]+0 */
158 :     /* y = 0; */
159 :     xorl %ecx,%ecx
160 : .L18:
161 :     pushl %eax               /* push value of z */
162 :     pushl %ecx               /* push value of y */
163 :     pushl %edx               /* push value of x */
164 :     pushl $.LC1              /* push address of string */
165 :     call printf
166 :     /* stack cleanup after printf not necessary */
167 : 
168 :     /* return 0; */
169 :     xorl %eax,%eax
170 :     leave
171 :     ret
172 : .Lfe1:
173 :     .size    main,.Lfe1-main
174 :     .ident  "GCC: (GNU) egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)"
175 : /* end optimized assembly code */
176 : /* -------------------------------------------------------------- */
</TT></PRE>

<p>
    This program has an example of a common subexpression. On line 10, the expression
<tt>a * b</tt> is evaluated the first time, and then again on lines 14 and 19. The last two evaluations
of <tt>a * b</tt> are redundant since the value of neither <tt>a</tt> nor <tt>b</tt> changes after the first evaluation.
Thus these two evaluations are common subexpressions that can be eliminated. Lines 30 to 108
show the unoptimized assembly code that is a straightforward translation of the C code and
should be easy to understand. Now consider the optimized assembly language code from lines 113
to 176. The first thing to notice is that now only variables <tt>a</tt> and <tt>b</tt> are stored on the stack,
hence a stack of just 8 bytes is required as opposed to a 20 byte stack for the unoptimized
version. You may wonder what's so special about variables <tt>a</tt> and <tt>b</tt>. The specialty is that the
address of variables <tt>a</tt> and <tt>b</tt> is used in the program (for <tt>scanf()</tt>) and variables that reside in
the registers cannot have a memory address. Hence the compiler is unable to put them inside
the registers. The registers that the compiler chooses for other variables are as follows:
<tt>x</tt> in EDX, <tt>y</tt> in ECX and <tt>z</tt> in EAX. Statements 139 to 141 evaluate <tt>a * b</tt> and the value is stored
in EDX (which holds <tt>x</tt>). Also, the value of <tt>b</tt> is available in the register EAX.
One more thing to notice is the delayed cleanup of stack after the call to
<tt>scanf()</tt> on line 143. May be there is some advantage in doing so, I don't know.</p>

<p>
    Next starts the <tt>if</tt> statement. In the <tt>if</tt> part, the expression <tt>a * b</tt> is reevaluated and
we expect that the compiler will use the value stored in EDX. That is exactly
what the compiler does. For the statement <tt>y = a * b</tt>, the generated code on
line 150 simply copies the value of <tt>a * b</tt> in register EDX into the register
ECX (which holds <tt>y</tt>). In the <tt>else</tt> part again, there is a common subexpression
in the statement <tt>z = a * b * 4</tt>. Thus we expect that the compiler will take
the value of <tt>a * b</tt> in EDX, multiply it by 4 and then store the result in EAX
(which holds <tt>z</tt>). Now there are many ways in which this can be done. One is
to use a sequence of <tt>movl</tt>, <tt>imull</tt> instructions as in</p>
<pre><tt>
    movl %edx, %eax         /* EAX = EDX  = a * b */
    imull $4, %eax          /* EAX = EAX * 4 */
</tt></pre>
<p>
The second choice is to use a shift (<tt>sall</tt>) instead of <tt>imull</tt> in the above
sequence of instructions. However, it turns out that the GCC uses an obscure
speed trick to optimize the code. It uses the instruction</p>
<pre><tt>
    leal 0(,%edx,4), %eax
</tt></pre>
<p>
This instruction uses the scaling and indexing capabilities of the 80386
processors. The above instruction takes EDX as the index register, 4 as the
scale and 0 as the offset, calculates the effective address and stores it
in the EAX register. Thus the register EAX gets a value EDX(=a*b)*4 + 0 i.e.
<tt>a * b * 4</tt>. The <tt>leal</tt> instruction always executes in 2 clock cycles on an 80386,
whereas a simple shift will take around 7 clock cycles. We see
that the compiler knows about processor peculiarities and uses
it when it generates the code.</p>

<p>
    Thus it can be seen that common subexpressions save a lot of computations and also
space that is used up by the code for these redundant computations. At this stage, one can
understand that the compiler when analyzing the program for optimization must keep a track
of how and when the variables change, the expressions that are evaluated and also
which registers are allocated to variables. In general such a kind of analysis that the
compiler performs on the program is called as <i>data flow analysis</i>.</p>


<h2><a name="sect5">5. Dead Code Elimination</a></h2>
<p>
    Dead code is the code in the program that will never be executed for any input or other
conditions. A common example is an <tt>if</tt> statement. If the compiler finds out that the condition
inside the <tt>if</tt> is never going to be true, then the body of the <tt>if</tt> statement will never be
executed. In that case, the compiler can completely eliminate this dead code, thus saving the
memory space occupied by the code. The following program demonstrates dead code elimination.</p>

<pre><tt>
 1 : /* test4.c */
 2 : /* demonstration of dead code elimination */
 3 : #include &lt;stdio.h&gt;
 4 :
 5 : int main()
 6 : {
 7 :     int x;
 8 :
 9 :     scanf("%d", &amp;x);
10 :
11 :     if(x &lt; 0 && x &gt; 0)
12 :     {
13 :         x = 99;
14 :         printf("Hello. Inside the if!!!");
15 :     }
16 :
17 :     return 0;
18 : }
19 : /* end of test4.c */
20 :
21 : /* --------------------------------------------------------------- */
22 : /* optimized assembly code */
23 :     .file   "test4.c"
24 :     .version    "01.01"
25 : gcc2_compiled.:
26 : .section    .rodata
27 : .LC0:
28 :     .string "%d"
29 : .LC1:
30 :     .string "Hello. Inside the if!!!"
31 : .text
32 :     .align 4
33 : .globl main
34 :     .type    main,@function
35 : main:
36 :     pushl %ebp             /* save EBP on stack */
37 :     movl %esp,%ebp         /* EBP = ESP */
38 :     subl $4,%esp           /* create space for x on the stack */
39 :
40 :     /* scanf("%d", &amp;x); */
41 :     leal -4(%ebp),%eax
42 :     pushl %eax             /* push address of a on stack */
43 :     pushl $.LC0            /* push string on stack */
44 :     call scanf
45 :
46 :     /* the entire body of the if and the condition checking is dead code */
47 :     /* return 0; */
48 :     xorl %eax,%eax         /* no stack cleanup, we are exiting anyway */
49 :     leave
50 :     ret
51 : .Lfe1:
52 :     .size    main,.Lfe1-main
53 :     .ident  "GCC: (GNU) egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)"
54 : /* end optimized assembly code */
55 : /* ---------------------------------------------------------------- */
</tt></pre>

<p>
    Since the unoptimized assembly language code serves no purpose and also its easy
to understand, I have omitted it. In the program, the condition on line 11 is <tt>x &lt; 0 &amp;&amp; x &gt; 0</tt>,
which can never be true. The compiler finds this and concludes that the body of the <tt>if</tt>
statement forms dead code, so it does not generate any code for that. One interesting
thing to notice is that after the body of <tt>if</tt> has been eliminated, the string
<tt>"Hello. Inside the if!!!"</tt> is no longer needed and hence can be eliminated from the read-only
data section. However, detection of such things probably will increase the complexity of the
compiler and hence is not done.</p>

<h2><a name="sect6">6. Strength Reduction using Induction Variable</a></h2>
<p>
    One type of code optimization is strength reduction in which a "costly" operation is
replaced by a less expensive one. For example, the evaluation of x<SUP>2</SUP> is much more efficient if
we multiply x by x rather than call the exponentiation routine. One place where this optimization
can be applied is in loops. Many times in a loop, one variable changes in synchronization with
the loop variable. Such a variable is called as <i>induction variable</i>. Induction variables give
the compiler a chance to apply strength reduction, as can be seen from the following program.</p>

<PRE><TT>
 1 : /* test5.c */
 2 : /* demonstration of induction variable elimination */
 3 : int main()
 4 : {
 5 :     int i, j;
 6 :
 7 :     for(i = 0; i &lt; 10; i++)
 8 :     {
 9 :         j = i * 7;
10 :         printf("i = %d, j = %d", i, j);
11 :     }
12 :     return 0;
13 : }
14 : /* end test5.c */
15 :
16 : /* --------------------------------------------------------------- */
17 : /* optimized assembly language code */
18 :     .file   "test5.c"
19 :     .version    "01.01"
20 : gcc2_compiled.:
21 : .section    .rodata
22 : .LC0:
23 :     .string "i = %d, j = %d"
24 : .text
25 :     .align 4
26 : .globl main
27 :     .type    main,@function
28 : main:
29 :     pushl %ebp                  /* save EBP on stack */
30 :     movl %esp,%ebp              /* ESP = EBP */
31 :
32 :     pushl %esi                  /* ESI will hold 'j' */
33 :     pushl %ebx                  /* EBX will hold 'i' */
34 :     xorl %ebx,%ebx              /* both are initialized to zero */
35 :     xorl %esi,%esi
36 :     .p2align 4,,7
37 : .L5:
38 :     /* printf("i = %d, j = %d", i, j); */
39 :     pushl %esi                  /* push value of j */
40 :     pushl %ebx                  /* push value of i */
41 :     pushl $.LC0                 /* push address of string */
42 :     call printf
43 :     addl $12,%esp               /* stack cleanup */
44 :
45 :     /* instead of saying j = i * 7, its efficient to say j = j + 7 */
46 :     addl $7,%esi
47 :     incl %ebx                   /* i++ */
48 :     cmpl $9,%ebx                /* is i &lt;= 9, then repeat the loop */
49 :     jle .L5
50 :
51 :     /* return 0; */
52 :     xorl %eax,%eax
53 :     leal -8(%ebp),%esp
54 :     popl %ebx
55 :     popl %esi
56 :     leave
57 :     ret
58 : .Lfe1:
59 :     .size    main,.Lfe1-main
60 :     .ident  "GCC: (GNU) egcs-2.91.66 19990314/Linux (egcs-1.1.2 release)"
61 :
62 : /* end optimized assembly code */
63 : /* ----------------------------------------------------------------- */
</TT></PRE>

<p>
    Here <tt>i</tt> is the loop variable whereas <tt>j</tt> is the induction variable as <tt>j</tt> always changes
in lock step with <tt>i</tt>. In the generated assembly language code, the compiler has decided to
use registers to store both <tt>i</tt> and <tt>j</tt> with <tt>i</tt> in EBX and <tt>j</tt> in ESI. Line 34 initializes EBX(=<tt>i</tt>) to
zero, as is indicated in the initialization part of the for loop. The compiler sees that during
the first pass, <tt>j</tt> will be assigned a value of 0, so it also initialized ESI to 0 in line 35.
The loop starts at line 37 and continues till line 49. Since <tt>j</tt> already has its value assigned
to it, the first thing the compiler does inside the loop is to call the <tt>printf()</tt> function. Here,
the compiler has adjusted the order of the statements in the loop so as to enable it to use
strength reduction. After analyzing the program, the compiler sees that inside the loop,
the value of <tt>i</tt> is always going to increase by 1 and since <tt>j</tt> is an induction variable, its
value is always going to increase by 7. Therefore, instead of multiplying <tt>i</tt> by 7 (which is costly),
it adds 7 to the old value of <tt>j</tt> before going on to the next iteration. Thus a costly operation
of multiplication has been replaced by addition which is cheaper (in terms of time).</p>

<h2><a name="sect7">7. Conclusion</a></h2>

<p>
    In this article we have seen some very basic code optimization techniques that the
GNU C Compiler uses to optimize the generated code. From these optimization techniques and
the various pieces of information that are required to apply them, the reader can appreciate
the type of sophisticated and complex analysis that the compiler must carry out on the
program. It has to keep track of various things like variables, their storage places (memory
or registers), expression evaluations, constant expressions, dead code etc. It is because of
this high complexity of the process that the compiler takes much longer to compile a program
with optimization enabled. There are many details that are involved in code
optimization and code generation that I have not explained and that I don't know.
Code optimization is a field of active research and interested
readers can refer to [1] for additional information.</p>

<h2><a name="sect8">8. Acknowledgments</a></h2>
<p>
I would like to thank my Bachelor's project guide Dr. Uday Khedker for creating my
interest in compilers and code optimization. I would also like to thank the <i>Linux
Gazette, Linux Documentation Project, PC Quest</I> and the <i>Pune Linux User's Group</i>
(<a href="http://www.pluggies.org">http://www.pluggies.org</a>)
which have earlier accepted and published my contributions that inspires me to write more.
</p>
<h2><a name="sect9">9. References</a></h2>

<OL>
<LI><i>Compilers: Principles, Techniques, and Tools</i>, A.V.Aho, Ravi Sethi and J.D.Ullman, Addison Wesley.
<LI><i>Systems Programming and Operating Systems</i>, D.M.Dhamdhere, Tata McGraw-Hill.
<LI><A HREF="http://www.linuxdoc.org/HOWTO/Assembly-HOWTO/index.html"><i>Assembly HOWTO</i></A>, Franois-Ren Rideau, Linux Documentation Project.
<LI><i>Advanced 80386 Programming Techniques</i>, James L. Turley, Osborne McGraw-Hill.
</OL>





<!-- *** BEGIN bio *** -->
<SPACER TYPE="vertical" SIZE="30">
<P> 
<H4><IMG ALIGN=BOTTOM ALT="" SRC="../gx/note.gif">Rahul U Joshi</H4>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<EM>I have recently joined Veritas Software India Pvt. Ltd. as an Associate
Software Engineer. I have worked on a research project entitled "Node Listing
Based Global Data Flow Analysis" during my Bachelor's degree. I am interested
in compilers, programming languages, operating systems and parallel/distributed
processing. I have contributed articles to Linux Gazette about
parallel processing and about optimizing C.</EM>

<!-- *** END bio *** -->

<!-- *** BEGIN copyright *** -->
<P> <hr> <!-- P --> 
<H5 ALIGN=center>

Copyright &copy; 2001, Rahul U Joshi.<BR>
Copying license <A HREF="../copying.html">http://www.linuxgazette.com/copying.html</A><BR> 
Published in Issue 71 of <i>Linux Gazette</i>, October 2001</H5>
<!-- *** END copyright *** -->

<!--startcut ==========================================================-->
<HR><P>
<CENTER>
<!-- *** BEGIN navbar *** -->
<IMG ALT="" SRC="../gx/navbar/left.jpg" WIDTH="14" HEIGHT="45" BORDER="0" ALIGN="bottom"><A HREF="arndt.html"><IMG ALT="[ Prev ]" SRC="../gx/navbar/prev.jpg" WIDTH="16" HEIGHT="45" BORDER="0" ALIGN="bottom"></A><A HREF="lg_toc71.html"><IMG ALT="[ Table of Contents ]" SRC="../gx/navbar/toc.jpg" WIDTH="220" HEIGHT="45" BORDER="0" ALIGN="bottom" ></A><A HREF="../lg_frontpage.html"><IMG ALT="[ Front Page ]" SRC="../gx/navbar/frontpage.jpg" WIDTH="137" HEIGHT="45" BORDER="0" ALIGN="bottom"></A><A HREF="http://www.linuxgazette.com/cgi-bin/talkback/all.py?site=LG&article=http://www.linuxgazette.com/issue71/joshi.html"><IMG ALT="[ Talkback ]" SRC="../gx/navbar/talkback.jpg" WIDTH="121" HEIGHT="45" BORDER="0" ALIGN="bottom"  ></A><A HREF="../lg_faq.html"><IMG ALT="[ FAQ ]" SRC="./../gx/navbar/faq.jpg"WIDTH="62" HEIGHT="45" BORDER="0" ALIGN="bottom"></A><A HREF="orr.html"><IMG ALT="[ Next ]" SRC="../gx/navbar/next.jpg" WIDTH="15" HEIGHT="45" BORDER="0" ALIGN="bottom"  ></A><IMG ALT="" SRC="../gx/navbar/right.jpg" WIDTH="15" HEIGHT="45" ALIGN="bottom">
<!-- *** END navbar *** -->
</CENTER>
</BODY></HTML>
<!--endcut ============================================================-->