pitrou commented on a change in pull request #7434: URL: https://github.com/apache/arrow/pull/7434#discussion_r440095501
########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -64,77 +64,30 @@ void StringDataTransform(KernelContext* ctx, const ExecBatch& batch, } } -// Generated with -// -// print("static constexpr uint8_t kAsciiUpperTable[] = {") -// for i in range(256): -// if i > 0: print(', ', end='') -// if i >= ord('a') and i <= ord('z'): -// print(i - 32, end='') -// else: -// print(i, end='') -// print("};") - -static constexpr uint8_t kAsciiUpperTable[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, - 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, - 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}; - void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) { for (int64_t i = 0; i < length; ++i) { - *output++ = kAsciiUpperTable[*input++]; + const uint8_t utf8_code_unit = *input++; + // Code units in the range [a-z] can only be an encoding of an ascii + // character/codepoint, not the 2nd, 3rd or 4th code unit 
(byte) of an different + // codepoint. This guaranteed by non-overal design of the unicode standard. (see

Review comment: "non-overlap"

----------------------------------------------------------------
This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org