https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69871
Bug ID: 69871
Summary: Type punned structs returned by value optimized poorly
Product: gcc
Version: 5.3.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: rf at rufflewind dot com
Target Milestone: ---

The following code, which unpacks a 32-bit integer into a struct of four bytes, does not optimize as well as it should. While "unpack" itself optimizes just fine, trivial wrappers of the function are not optimized nearly as well:

- Two of the wrappers ("wrapper", "wrapper2") are completely identical, yet they do not result in the same assembly code. One ("wrapper2") is optimized well, the other ("wrapper") is not.
- Adding another layer of indirection ("wrapperwrapper") also prevents the optimization from occurring.

The problem occurs not only for union-based type-punning, but also for similar tricks that involve:

- memcpy, where all three wrappers would optimize poorly, or
- bitshift operators, where even "unpack" would optimize poorly.

See also: https://gcc.gnu.org/ml/gcc/2016-02/msg00244.html

The code was compiled with "gcc -fverbose-asm -Wall -S -O3 foo.c" on Linux 4.4.1 x86-64. The GCC binaries are part of Arch Linux's gcc-multilib 5.3.0-4 binary package.
--- struct alpha { char a, b, c, d; }; struct alpha unpack(unsigned x) { union { struct alpha r; unsigned i; } u; u.i = x; return u.r; } struct alpha wrapper(unsigned y) { return unpack(y); } struct alpha wrapper2(unsigned y) { return unpack(y); } struct alpha wrapperwrapper(unsigned y) { return wrapper(y); } --- .file "foo.c" # GNU C11 (GCC) version 5.3.0 (x86_64-unknown-linux-gnu) # compiled by GNU C version 5.3.0, GMP version 6.1.0, MPFR version 3.1.3-p5, MPC version 1.0.3 # GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 # options passed: foo.c -mtune=generic -march=x86-64 -O3 -Wall # -fverbose-asm # options enabled: -faggressive-loop-optimizations -falign-labels # -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg # -fcaller-saves -fchkp-check-incomplete-type -fchkp-check-read # -fchkp-check-write -fchkp-instrument-calls -fchkp-narrow-bounds # -fchkp-optimize -fchkp-store-bounds -fchkp-use-static-bounds # -fchkp-use-static-const-bounds -fchkp-use-wrappers # -fcombine-stack-adjustments -fcommon -fcompare-elim -fcprop-registers # -fcrossjumping -fcse-follow-jumps -fdefer-pop # -fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively # -fdwarf2-cfi-asm -fearly-inlining -feliminate-unused-debug-types # -fexpensive-optimizations -fforward-propagate -ffunction-cse -fgcse # -fgcse-after-reload -fgcse-lm -fgnu-runtime -fgnu-unique # -fguess-branch-probability -fhoist-adjacent-loads -fident -fif-conversion # -fif-conversion2 -findirect-inlining -finline -finline-atomics # -finline-functions -finline-functions-called-once # -finline-small-functions -fipa-cp -fipa-cp-alignment -fipa-cp-clone # -fipa-icf -fipa-icf-functions -fipa-icf-variables -fipa-profile # -fipa-pure-const -fipa-ra -fipa-reference -fipa-sra -fira-hoist-pressure # -fira-share-save-slots -fira-share-spill-slots # -fisolate-erroneous-paths-dereference -fivopts -fkeep-static-consts # -fleading-underscore -flifetime-dse -flra-remat 
-flto-odr-type-merging # -fmath-errno -fmerge-constants -fmerge-debug-strings # -fmove-loop-invariants -fomit-frame-pointer -foptimize-sibling-calls # -foptimize-strlen -fpartial-inlining -fpeephole -fpeephole2 # -fpredictive-commoning -fprefetch-loop-arrays -free -freg-struct-return # -freorder-blocks -freorder-blocks-and-partition -freorder-functions # -frerun-cse-after-loop -fsched-critical-path-heuristic # -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock # -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec # -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-fusion # -fschedule-insns2 -fsemantic-interposition -fshow-column -fshrink-wrap # -fsigned-zeros -fsplit-ivs-in-unroller -fsplit-wide-types -fssa-phiopt # -fstdarg-opt -fstrict-aliasing -fstrict-overflow # -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps # -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce # -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop # -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts # -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns # -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon # -ftree-loop-optimize -ftree-loop-vectorize -ftree-parallelize-loops= # -ftree-partial-pre -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc # -ftree-scev-cprop -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra # -ftree-switch-conversion -ftree-tail-merge -ftree-ter -ftree-vrp # -funit-at-a-time -funswitch-loops -funwind-tables -fverbose-asm # -fzero-initialized-in-bss -m128bit-long-double -m64 -m80387 # -malign-stringops -mavx256-split-unaligned-load # -mavx256-split-unaligned-store -mfancy-math-387 -mfp-ret-in-387 -mfxsr # -mglibc -mieee-fp -mlong-double-80 -mmmx -mno-sse4 -mpush-args -mred-zone # -msse -msse2 -mtls-direct-seg-refs -mvzeroupper .section .text.unlikely,"ax",@progbits .LCOLDB0: .text .LHOTB0: .p2align 4,,15 .globl unpack .type unpack, @function 
unpack: .LFB0: .cfi_startproc movl %edi, %eax # x, x ret .cfi_endproc .LFE0: .size unpack, .-unpack .section .text.unlikely .LCOLDE0: .text .LHOTE0: .section .text.unlikely .LCOLDB1: .text .LHOTB1: .p2align 4,,15 .globl wrapper .type wrapper, @function wrapper: .LFB5: .cfi_startproc movl %edi, %eax # y, y xorl %edx, %edx # retval.9 movsbl %ah, %eax # y, SR.14 movb %dil, %dl # y, retval.9 movb %al, %dh # SR.14, retval.9 movl %edi, %eax # y, tmp101 andl $-16777216, %edi #, tmp105 andl $16711680, %eax #, tmp101 movzwl %dx, %edx # retval.9, tmp103 orl %eax, %edx # tmp101, tmp106 movl %edx, %eax # tmp106, tmp107 orl %edi, %eax # tmp105, tmp107 ret .cfi_endproc .LFE5: .size wrapper, .-wrapper .section .text.unlikely .LCOLDE1: .text .LHOTE1: .section .text.unlikely .LCOLDB2: .text .LHOTB2: .p2align 4,,15 .globl wrapper2 .type wrapper2, @function wrapper2: .LFB2: .cfi_startproc movl %edi, %eax # y, y ret .cfi_endproc .LFE2: .size wrapper2, .-wrapper2 .section .text.unlikely .LCOLDE2: .text .LHOTE2: .section .text.unlikely .LCOLDB3: .text .LHOTB3: .p2align 4,,15 .globl wrapperwrapper .type wrapperwrapper, @function wrapperwrapper: .LFB3: .cfi_startproc movl %edi, %eax # y, y xorl %edx, %edx # D.1859 movsbl %ah, %eax # y, SR.5 movb %dil, %dl # y, D.1859 movb %al, %dh # SR.5, D.1859 movl %edi, %eax # y, tmp101 andl $-16777216, %edi #, tmp105 andl $16711680, %eax #, tmp101 movzwl %dx, %edx # D.1859, tmp103 orl %eax, %edx # tmp101, tmp106 movl %edx, %eax # tmp106, tmp107 orl %edi, %eax # tmp105, tmp107 ret .cfi_endproc .LFE3: .size wrapperwrapper, .-wrapperwrapper .section .text.unlikely .LCOLDE3: .text .LHOTE3: .ident "GCC: (GNU) 5.3.0" .section .note.GNU-stack,"",@progbits