[GitHub] [arrow] pitrou commented on a change in pull request #7434: ARROW-9131: [C++] Faster ascii_lower and ascii_upper.

GitBox Mon, 15 Jun 2020 04:03:16 -0700


pitrou commented on a change in pull request #7434:
URL: https://github.com/apache/arrow/pull/7434#discussion_r440095501




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -64,77 +64,30 @@ void StringDataTransform(KernelContext* ctx, const 
ExecBatch& batch,
   }
 }
 
-// Generated with
-//
-// print("static constexpr uint8_t kAsciiUpperTable[] = {")
-// for i in range(256):
-//     if i > 0: print(', ', end='')
-//     if i >= ord('a') and i <= ord('z'):
-//         print(i - 32, end='')
-//     else:
-//         print(i, end='')
-// print("};")
-
-static constexpr uint8_t kAsciiUpperTable[] = {
-    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  
15,
-    16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  
31,
-    32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  
47,
-    48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  
63,
-    64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  
79,
-    80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  
95,
-    96,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  
79,
-    80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  123, 124, 125, 126, 
127,
-    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 
143,
-    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 
159,
-    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 
175,
-    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 
191,
-    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 
207,
-    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 
223,
-    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 
239,
-    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 
255};
-
 void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* 
output) {
   for (int64_t i = 0; i < length; ++i) {
-    *output++ = kAsciiUpperTable[*input++];
+    const uint8_t utf8_code_unit = *input++;
+    // Code units in the range [a-z] can only be an encoding of an ascii
+    // character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an 
different
+    // codepoint. This guaranteed by non-overal design of the unicode 
standard. (see

Review comment:
       Typo: "non-overal" should be "non-overlap".




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #7434: ARROW-9131: [C++] Faster ascii_lower and ascii_upper.

Reply via email to