03 March 2019

Different forms of application code

I've used the LLVM compiler infrastructure to show the different forms that application code takes on its way from source to executable:

1. Source code


A basic C source file:

$cat test.c
#include <stdio.h>

/* Entry point: print the greeting on stdout, then report success to the OS. */
int main(void)
{
    printf("Hello World\n");
    return 0;
}

2.  Intermediate code:

Some platforms, such as the JVM, the .NET CLR and LLVM, have an intermediate representation that the compiler compiles the source code into. In the JVM it is called bytecode, while in LLVM it is called Intermediate Representation (IR).

Here's the LLVM IR of the above program (it can be generated with, e.g., `clang -S -emit-llvm test.c -o test.ll`):

$cat test.ll
; Module header: records the source file this IR was produced from and the target it is built for.
; ModuleID = 'test.c'
source_filename = "test.c"
; NOTE(review): the datalayout string describes the target's data layout; see the LLVM
; Language Reference for the meaning of each field.
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; Target triple: x86_64 architecture, apple vendor, macosx10.14.0 operating system.
target triple = "x86_64-apple-macosx10.14.0"

; The string literal from test.c as a private constant: the 11 characters of "Hello World",
; then \0A (newline) and \00 (NUL terminator), 13 bytes in total, matching [13 x i8].
@.str = private unnamed_addr constant [13 x i8] c"Hello World\0A\00", align 1

; Function Attrs: noinline nounwind optnone ssp uwtable
; IR for main() from test.c; its attribute group #0 is listed further down in the module.
define i32 @main() #0 {
  ; Reserve a 4-byte-aligned i32 stack slot and store 0 into it. Nothing in this function
  ; reads the slot back (presumably a return-value slot emitted because of optnone; TODO confirm).
  %1 = alloca i32, align 4
  store i32 0, i32* %1, align 4
  ; Call the variadic printf with a pointer to the first byte of @.str; the i32 result %2 is unused.
  %2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
  ; Return 0, mirroring `return 0;` in the C source.
  ret i32 0
}

; Declaration only (no body in this module) of the variadic printf: one i8* argument followed by "...".
declare i32 @printf(i8*, ...) #1

; Attribute group #0 is attached to @main. Besides the string attributes it shares with #1, it
; carries noinline, nounwind, optnone, ssp, uwtable and "no-jump-tables".
attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
; Attribute group #1 is attached to the printf declaration.
attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

; Named module-level metadata: !llvm.module.flags refers to !0 and !1, !llvm.ident to !2.
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

; Module flags: "wchar_size" with value 4 and "PIC Level" with value 2.
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
; Compiler identification string referenced by !llvm.ident.
!2 = !{!"Apple LLVM version 10.0.0 (clang-1000.10.44.4)"}

3. Assembly code:


The generated bytecode or intermediate format is usually either interpreted or compiled.
The compilation, in turn, is either JIT (just in time) or AOT (ahead of time).

In the case of the JVM, the bytecode is both interpreted and JIT compiled.
In the case of LLVM, the IR is AOT compiled into assembly code.

Here's the previous IR translated into assembly code (as part of the AOT compilation process):

$cat test.s
## AT&T-syntax x86-64 assembly for main(); the directives below target macOS 10.14.
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 10, 14
.globl _main                   ## -- Begin function main
.p2align 4, 0x90
_main:                                  ## @main
.cfi_startproc
## %bb.0:
## Prologue: save the caller's frame pointer, establish our own, reserve 16 bytes for locals.
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
subq $16, %rsp
## Argument for printf: RIP-relative address of the format string goes in %rdi.
leaq L_.str(%rip), %rdi
## Store 0 into the local at -4(%rbp); it is never read back here (presumably the IR's stack slot %1).
movl $0, -4(%rbp)
## Variadic call: %al carries the number of vector registers used for arguments (none).
movb $0, %al
callq _printf
## Zero %ecx for the return value, park printf's result in -8(%rbp) (never reloaded),
## then move the 0 into %eax as main's return value.
xorl %ecx, %ecx
movl %eax, -8(%rbp)          ## 4-byte Spill
movl %ecx, %eax
## Epilogue: release the locals, restore the caller's frame pointer and return.
addq $16, %rsp
popq %rbp
retq
.cfi_endproc
                                        ## -- End function
## C-string literal section holding the text whose address main loads into %rdi.
.section __TEXT,__cstring,cstring_literals
L_.str:                                 ## @.str
## .asciz emits the characters followed by a terminating NUL byte.
.asciz "Hello World\n"


.subsections_via_symbols

4. Object code:


Object code is the final machine (binary) code, but only for one specific module; in other words, it has not yet been linked with other modules and runtime libraries to form the complete binary.

The following is the object code for the program above (dumped raw with `cat`, so most of it shows up as unprintable bytes):

$cat test.o
R__compact_unwind__LD8 `�__eh_frame__TEXTX@�
                                            h2
��
  PUH��H��H�=�E���1ɉE��H��]�Hello World
*zRx
-  �$��������*A�C
 _main_printf%

5. Binary Code:

After linking, the object code becomes a complete binary that can be executed.

$cat test
�__unwind_info__TEXT�H��__DATA__nl_symbol_ptr__DATA__la_symbol_ptr__DATH__LINKEDIT  �"�   0 0h � 8
                                                                                                  P�
                                                                                                      /usr/lib/dyld�<D�fd<��a�d��2


�*(�`
1ɉE��H��]��%�L�qAS�%a�h�����Hello WorldUH��H��H�=;�E���
`44�4
     �"Q@dyld_stub_binderQr�r@_printf�__mh_execute_header!main%��`$@ __mh_execute_header_main_printfdyld_stub_binder%

On the JVM side, GraalVM can produce native binaries, and GCJ, which used to be part of GCC, could generate native binaries as well.

No comments: