guiyanakuang commented on a change in pull request #958: URL: https://github.com/apache/orc/pull/958#discussion_r742507137
########## File path: c++/src/LzoDecompressor.cc ########## @@ -312,13 +312,11 @@ namespace orc { output += SIZE_OF_INT; matchAddress += increment32; - *reinterpret_cast<int32_t*>(output) = - *reinterpret_cast<int32_t*>(matchAddress); + memcpy(output, matchAddress, SIZE_OF_INT); Review comment: @wgtmac you are right, the compiler does optimize memcpy, the performance of both ways is similar in different compilers, in older versions of the compiler expand the assignment will be faster. I agree to wrap a bit_cast function for binary copy between different types. @dongjoon-hyun, So I don't think there's any performance loss here compared to the original. ![WX20211104-104627](https://user-images.githubusercontent.com/4069905/140250827-6282739b-c060-43fa-b348-87ede15129fc.png) ![WX20211104-105010](https://user-images.githubusercontent.com/4069905/140250854-cf6da388-18d8-42f0-8cd6-18468633acc3.png) ![WX20211104-105348](https://user-images.githubusercontent.com/4069905/140250863-6c99cfcb-0b72-4ee0-a6b0-ac31344ac771.png) ```c++ #include <string.h> static void use_memcpy(benchmark::State& state) { auto size = state.range(0); char buf[size]; for (int i = 0; i < 8; ++i) { buf[i] = 'a'; } for (auto _ : state) { char *output = buf + 8; char *matchAddress = buf; char *matchOutputLimit = buf + size; while (output < matchOutputLimit) { memcpy(output, matchAddress, 8); matchAddress += 8; output += 8; } } } static void use_expanded_assignment(benchmark::State& state) { auto size = state.range(0); char buf[size]; for (int i = 0; i < 8; ++i) { buf[i] = 'a'; } for (auto _ : state) { char *output = buf + 8; char *matchAddress = buf; char *matchOutputLimit = buf + size; while (output < matchOutputLimit) { output[0] = *matchOutputLimit; output[1] = *(matchOutputLimit + 1); output[2] = *(matchOutputLimit + 2); output[3] = *(matchOutputLimit + 3); output[4] = *(matchOutputLimit + 4); output[5] = *(matchOutputLimit + 5); output[6] = *(matchOutputLimit + 6); output[7] = *(matchOutputLimit + 7); matchAddress += 8; output += 8; } } } static void use_reinterpret_assignment(benchmark::State& state) { auto size = state.range(0); char buf[size]; for (int i = 0; i < 8; ++i) { buf[i] = 'a'; } for (auto _ : state) { char *output = buf + 8; char *matchAddress = buf; char *matchOutputLimit = buf + size; while (output < matchOutputLimit) { *reinterpret_cast<int64_t*>(output) = *reinterpret_cast<int64_t*>(matchAddress); matchAddress += 8; output += 8; } } } BENCHMARK(use_memcpy)->Arg(100000); BENCHMARK(use_expanded_assignment)->Arg(100000); BENCHMARK(use_reinterpret_assignment)->Arg(100000); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@orc.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org