pitrou commented on a change in pull request #12399: URL: https://github.com/apache/arrow/pull/12399#discussion_r806609112
########## File path: cpp/src/arrow/csv/writer.cc ########## @@ -194,34 +204,48 @@ class UnquotedColumnPopulator : public ColumnPopulator { return Status::OK(); }; - if (reject_values_with_quotes_) { - // When using this UnquotedColumnPopulator on values that, after casting, could - // produce quotes, we need to return an error in accord with RFC4180. We need to - // precede valid_func with a check. - return VisitArrayDataInline<StringType>( - *casted_array_->data(), - [&](arrow::util::string_view s) { - RETURN_NOT_OK(CheckStringHasNoStructuralChars(s)); - return valid_function(s); - }, - null_function); - } else { - // Populate without checking and rejecting values with quotes. - return VisitArrayDataInline<StringType>(*casted_array_->data(), valid_function, - null_function); - } + return VisitArrayDataInline<StringType>(*casted_array_->data(), valid_function, + null_function); } private: - // Returns an error status if s has any structural characters. - static Status CheckStringHasNoStructuralChars(const util::string_view& s) { - if (std::any_of(s.begin(), s.end(), [](const char& c) { - return c == '\n' || c == '\r' || c == ',' || c == '"'; - })) { - return Status::Invalid( - "CSV values may not contain structural characters if quoting style is " - "\"None\". See RFC4180. Invalid value: ", - s); + // Returns an error status if string array has any structural characters. + static Status CheckStringArrayHasNoStructuralChars(const StringArray& array) { + // scan the underlying string array buffer as a single big string + const uint8_t* const data = array.raw_data() + array.offset(); Review comment: ```suggestion const uint8_t* const data = array.raw_data() + array.value_offset(0); ``` ########## File path: cpp/src/arrow/csv/writer.cc ########## @@ -194,34 +204,48 @@ class UnquotedColumnPopulator : public ColumnPopulator { return Status::OK(); }; - if (reject_values_with_quotes_) { - // When using this UnquotedColumnPopulator on values that, after casting, could - // produce quotes, we need to return an error in accord with RFC4180. We need to - // precede valid_func with a check. - return VisitArrayDataInline<StringType>( - *casted_array_->data(), - [&](arrow::util::string_view s) { - RETURN_NOT_OK(CheckStringHasNoStructuralChars(s)); - return valid_function(s); - }, - null_function); - } else { - // Populate without checking and rejecting values with quotes. - return VisitArrayDataInline<StringType>(*casted_array_->data(), valid_function, - null_function); - } + return VisitArrayDataInline<StringType>(*casted_array_->data(), valid_function, + null_function); } private: - // Returns an error status if s has any structural characters. - static Status CheckStringHasNoStructuralChars(const util::string_view& s) { - if (std::any_of(s.begin(), s.end(), [](const char& c) { - return c == '\n' || c == '\r' || c == ',' || c == '"'; - })) { - return Status::Invalid( - "CSV values may not contain structural characters if quoting style is " - "\"None\". See RFC4180. Invalid value: ", - s); + // Returns an error status if string array has any structural characters. + static Status CheckStringArrayHasNoStructuralChars(const StringArray& array) { + // scan the underlying string array buffer as a single big string + const uint8_t* const data = array.raw_data() + array.offset(); + const int64_t buffer_size = array.total_values_length(); + int64_t offset = 0; +#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) +#if defined(ARROW_HAVE_SSE4_2) + // _mm_cmpistrc gives slightly better performance than the naive approach, + // probably doesn't deserve the effort + using simd_batch = xsimd::batch<uint8_t, xsimd::sse4_2>; +#else + using simd_batch = xsimd::batch<uint8_t, xsimd::neon64>; +#endif + while ((offset + 16) <= buffer_size) { + const auto v = simd_batch::load_unaligned(data + offset); + if (xsimd::any((v == '\n') | (v == '\r') | (v == ',') | (v == '"'))) { + break; + } + offset += 16; + } +#endif + while (offset < buffer_size) { + // error happened or remaining bytes to check + const char c = static_cast<char>(data[offset]); + if (c == '\n' || c == '\r' || c == ',' || c == '"') { + // extract the offending string from array per offset + const auto* offsets = array.raw_value_offsets(); + const auto index = + std::upper_bound(offsets, offsets + array.length(), offset) - offsets; Review comment: Need to account for the fact that the stored offsets do not necessarily start at 0 (unlike the `offset` variable): ```suggestion const auto index = std::upper_bound(offsets, offsets + array.length(), offset + offsets[0]) - offsets; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org