On 15/11/17 15:38, Alan Bateman wrote:
> Moving the nativeOrder out of the loop makes sense but I'm curious about
> the context for improving this implementation.

I wonder about lifting ByteOrder.nativeOrder(). Maybe it fails to inline because the method is too large: if that happens, we really lose. I'm not seeing that, though: it seems to be inlined just fine, and lifting it has no effect. In any case, this patch doesn't help anything on my test hardware.

Before:

CRC.crc  avgt  5  1050.374 ± 1.533  ns/op

After:

CRC.crc  avgt  5  1088.962 ± 0.212  ns/op

And here, for the sake of anyone interested, is the AArch64 code, before:

;; B10: # B10 B11 <- B9 B10 Loop: B10-B10 inner main of N120 Freq: 127.001
0x000003ffa4bb5900: ldr x2, [x11,w3,sxtw] ;*invokevirtual getLong {reexecute=0 rethrow=0 return_oop=0}
                                          ; - java.util.zip.CRC32C::updateBytes@147 (line 237)
0x000003ffa4bb5904: eor w4, w2, w1 ;*ixor {reexecute=0 rethrow=0 return_oop=0}
                                   ; - java.util.zip.CRC32C::updateBytes@193 (line 246)
0x000003ffa4bb5908: lsr x2, x2, #32
0x000003ffa4bb590c: ubfiz x5, x4, #2, #8
0x000003ffa4bb5910: mov w19, w2 ;*l2i {reexecute=0 rethrow=0 return_oop=0}
                                ; - java.util.zip.CRC32C::updateBytes@171 (line 240)
0x000003ffa4bb5914: ubfx w2, w4, #16, #8
0x000003ffa4bb5918: ubfx w1, w4, #8, #8
0x000003ffa4bb591c: ldr w6, [x15,x5]
0x000003ffa4bb5920: lsr w5, w4, #24
0x000003ffa4bb5924: ldr w4, [x16,w2,sxtw #2]
0x000003ffa4bb5928: ldr w1, [x14,w1,sxtw #2]
0x000003ffa4bb592c: ldr w2, [x17,w5,sxtw #2]
0x000003ffa4bb5930: eor w5, w1, w6
0x000003ffa4bb5934: ubfiz x23, x19, #2, #8
0x000003ffa4bb5938: ubfx w6, w19, #8, #8
0x000003ffa4bb593c: eor w4, w5, w4
0x000003ffa4bb5940: ubfx w5, w19, #16, #8
0x000003ffa4bb5944: ldr w1, [x18,x23]
0x000003ffa4bb5948: eor w2, w4, w2
0x000003ffa4bb594c: ldr w6, [x0,w6,sxtw #2]
0x000003ffa4bb5950: lsr w4, w19, #24
0x000003ffa4bb5954: ldr w19, [x13,w5,sxtw #2]
0x000003ffa4bb5958: eor w2, w2, w1
0x000003ffa4bb595c: add w5, w3, #0x8
0x000003ffa4bb5960: ldr w1, [x12,w4,sxtw #2]
0x000003ffa4bb5964: eor w29, w2, w6
0x000003ffa4bb5968: eor w4, w29, w19

and after:

;; B10: # B10 B11 <- B9 B10 Loop: B10-B10 inner main of N121 Freq: 127.001
0x000003ff8cb98200: ldr x10, [x21,w3,sxtw] ;*invokevirtual getLong {reexecute=0 rethrow=0 return_oop=0}
                                           ; - java.util.zip.CRC32C::updateBytes@359 (line 264)
0x000003ff8cb98204: eor w1, w10, w1 ;*ixor {reexecute=0 rethrow=0 return_oop=0}
                                    ; - java.util.zip.CRC32C::updateBytes@380 (line 268)
0x000003ff8cb98208: ubfx w2, w1, #8, #8
0x000003ff8cb9820c: lsr x10, x10, #32
0x000003ff8cb98210: ubfx w5, w1, #16, #8
0x000003ff8cb98214: mov w10, w10 ;*l2i {reexecute=0 rethrow=0 return_oop=0}
                                 ; - java.util.zip.CRC32C::updateBytes@374 (line 266)
0x000003ff8cb98218: ldr w0, [x15,w2,sxtw #2]
0x000003ff8cb9821c: lsr w6, w1, #24
0x000003ff8cb98220: ubfiz x4, x1, #2, #8
0x000003ff8cb98224: ubfx w1, w10, #8, #8
0x000003ff8cb98228: ldr w2, [x17,w5,sxtw #2]
0x000003ff8cb9822c: ubfiz x20, x10, #2, #8
0x000003ff8cb98230: lsr w5, w10, #24
0x000003ff8cb98234: ubfx w19, w10, #16, #8
0x000003ff8cb98238: ldr w10, [x13,w1,sxtw #2]
0x000003ff8cb9823c: ldr w6, [x18,w6,sxtw #2]
0x000003ff8cb98240: ldr w4, [x16,x4]
0x000003ff8cb98244: ldr w1, [x12,w19,sxtw #2]
0x000003ff8cb98248: ldr w7, [x14,x20]
0x000003ff8cb9824c: eor w6, w2, w6
0x000003ff8cb98250: eor w0, w0, w4
0x000003ff8cb98254: ldr w19, [x11,w5,sxtw #2]
0x000003ff8cb98258: add w20, w3, #0x8
0x000003ff8cb9825c: eor w29, w0, w6
0x000003ff8cb98260: eor w10, w10, w7
0x000003ff8cb98264: eor w0, w1, w19
0x000003ff8cb98268: eor w1, w10, w29

-- 
Andrew Haley
Java Platform Lead Engineer
Red Hat UK Ltd. <https://www.redhat.com>
EAC8 43EB D3EF DB98 CC77 2FAD A5CD 6035 332F A671