pitrou commented on code in PR #43389:
URL: https://github.com/apache/arrow/pull/43389#discussion_r1716860364


##########
cpp/src/arrow/acero/swiss_join_avx2.cc:
##########
@@ -45,48 +45,75 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, 
int column_id, int nu
   if (!is_fixed_length_column) {
     int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id);
     const uint8_t* row_ptr_base = rows.data(2);
-    const uint32_t* row_offsets = rows.offsets();
+    const RowTableImpl::offset_type* row_offsets = rows.offsets();
 
     if (varbinary_column_id == 0) {
       // Case 1: This is the first varbinary column
       //
       __m256i field_offset_within_row = 
_mm256_set1_epi32(rows.metadata().fixed_length);
       __m256i varbinary_end_array_offset =
-          _mm256_set1_epi32(rows.metadata().varbinary_end_array_offset);
+          _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset);
       for (int i = 0; i < num_rows / unroll; ++i) {
+        // Load 8 32-bit row ids.
         __m256i row_id =
             _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
-        __m256i row_offset = _mm256_i32gather_epi32(
-            reinterpret_cast<const int*>(row_offsets), row_id, 
sizeof(uint32_t));
+        // Gather the lower/higher 4 64-bit row offsets based on the 
lower/higher 4 32-bit
+        // row ids.
+        __m256i row_offset_lo =
+            _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id),
+                                   sizeof(RowTableImpl::offset_type));
+        __m256i row_offset_hi =
+            _mm256_i32gather_epi64(row_offsets, 
_mm256_extracti128_si256(row_id, 1),
+                                   sizeof(RowTableImpl::offset_type));
+        // Gather the lower/higher 4 32-bit field lengths based on the 
lower/higher 4
+        // 64-bit row offsets.
+        __m128i field_length_lo = _mm256_i64gather_epi32(
+            reinterpret_cast<const int*>(row_ptr_base),
+            _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset), 1);
+        __m128i field_length_hi = _mm256_i64gather_epi32(
+            reinterpret_cast<const int*>(row_ptr_base),
+            _mm256_add_epi64(row_offset_hi, varbinary_end_array_offset), 1);
+        // The final 8 32-bit field lengths, subtracting the field offset 
within row.
         __m256i field_length = _mm256_sub_epi32(
-            _mm256_i32gather_epi32(
-                reinterpret_cast<const int*>(row_ptr_base),
-                _mm256_add_epi32(row_offset, varbinary_end_array_offset), 1),
-            field_offset_within_row);
+            _mm256_set_m128i(field_length_hi, field_length_lo), 
field_offset_within_row);
         process_8_values_fn(i * unroll, row_ptr_base,
-                            _mm256_add_epi32(row_offset, 
field_offset_within_row),
+                            _mm256_add_epi64(row_offset_lo, 
field_offset_within_row),
+                            _mm256_add_epi64(row_offset_hi, 
field_offset_within_row),
                             field_length);
       }
     } else {
       // Case 2: This is second or later varbinary column
       //
       __m256i varbinary_end_array_offset =
-          _mm256_set1_epi32(rows.metadata().varbinary_end_array_offset +
-                            sizeof(uint32_t) * (varbinary_column_id - 1));
+          _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset +
+                             sizeof(uint32_t) * (varbinary_column_id - 1));
       auto row_ptr_base_i64 =
           reinterpret_cast<const 
arrow::util::int64_for_gather_t*>(row_ptr_base);
       for (int i = 0; i < num_rows / unroll; ++i) {
+        // Load 8 32-bit row ids.
         __m256i row_id =
             _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i);
-        __m256i row_offset = _mm256_i32gather_epi32(
-            reinterpret_cast<const int*>(row_offsets), row_id, 
sizeof(uint32_t));
-        __m256i end_array_offset =
-            _mm256_add_epi32(row_offset, varbinary_end_array_offset);
-
-        __m256i field_offset_within_row_A = _mm256_i32gather_epi64(
-            row_ptr_base_i64, _mm256_castsi256_si128(end_array_offset), 1);
-        __m256i field_offset_within_row_B = _mm256_i32gather_epi64(
-            row_ptr_base_i64, _mm256_extracti128_si256(end_array_offset, 1), 
1);
+        // Gather the lower/higher 4 64-bit row offsets based on the 
lower/higher 4 32-bit
+        // row ids.
+        __m256i row_offset_lo =
+            _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id),
+                                   sizeof(RowTableImpl::offset_type));
+        // Gather the lower/higher 4 32-bit field lengths based on the 
lower/higher 4
+        // 64-bit row offsets.
+        __m256i row_offset_hi =
+            _mm256_i32gather_epi64(row_offsets, 
_mm256_extracti128_si256(row_id, 1),
+                                   sizeof(RowTableImpl::offset_type));
+        // Prepare the lower/higher 4 64-bit end array offsets based on the 
lower/higher 4
+        // 64-bit row offsets.
+        __m256i end_array_offset_lo =
+            _mm256_add_epi64(row_offset_lo, varbinary_end_array_offset);
+        __m256i end_array_offset_hi =
+            _mm256_add_epi64(row_offset_hi, varbinary_end_array_offset);
+
+        __m256i field_offset_within_row_A =
+            _mm256_i64gather_epi64(row_ptr_base_i64, end_array_offset_lo, 1);
+        __m256i field_offset_within_row_B =
+            _mm256_i64gather_epi64(row_ptr_base_i64, end_array_offset_hi, 1);

Review Comment:
   Also add a comment about them not being wired :-)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to