pitrou commented on code in PR #47524:
URL: https://github.com/apache/arrow/pull/47524#discussion_r2349281358
##########
cpp/src/arrow/csv/writer.cc:
##########
@@ -176,6 +177,32 @@ char* Escape(std::string_view s, char* out) {
return out;
}
+int64_t StopAtStructuralChar(const uint8_t* data, const int64_t buffer_size,
Review Comment:
```suggestion
// Return the index of the first structural char in the input. A structural
char
// is a character that needs quoting and/or escaping.
int64_t StopAtStructuralChar(const uint8_t* data, const int64_t buffer_size,
```
##########
cpp/src/arrow/csv/options.h:
##########
@@ -209,6 +209,9 @@ struct ARROW_EXPORT WriteOptions {
/// \brief Quoting style
QuotingStyle quoting_style = QuotingStyle::Needed;
+ /// \brief Quoting style of header
Review Comment:
```suggestion
/// \brief Quoting style of header
///
/// Note that `QuotingStyle::Needed` and `QuotingStyle::AllValid` have the
same
/// effect of quoting all column names.
```
##########
cpp/src/arrow/csv/writer_test.cc:
##########
@@ -91,6 +93,16 @@ std::vector<WriterTestParams> GenerateTestCases() {
auto dummy_schema = schema({field("a", uint8())});
std::string dummy_batch_data = R"([{"a": null}])";
+ auto header_without_structral_charaters =
Review Comment:
Typo: "structral" :-)
##########
cpp/src/arrow/csv/writer.cc:
##########
@@ -105,7 +105,8 @@ int64_t CountQuotes(std::string_view s) {
// Matching quote pair character length.
constexpr int64_t kQuoteCount = 2;
-constexpr int64_t kQuoteDelimiterCount = kQuoteCount + /*end_char*/ 1;
+// Matching delimiter character length.
Review Comment:
Nit
```suggestion
// Delimiter character length.
```
##########
cpp/src/arrow/csv/writer_test.cc:
##########
@@ -91,6 +93,16 @@ std::vector<WriterTestParams> GenerateTestCases() {
auto dummy_schema = schema({field("a", uint8())});
std::string dummy_batch_data = R"([{"a": null}])";
+ auto header_without_structral_charaters =
+ schema({field("a ", uint64()), field("b", int32())});
+ std::string expected_header_without_structral_charaters = std::string(R"(a
,b)") + "\n";
Review Comment:
Same here.
##########
cpp/src/arrow/csv/writer.cc:
##########
@@ -578,26 +588,62 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
return Status::OK();
}
- int64_t CalculateHeaderSize() const {
+ int64_t CalculateHeaderSize(QuotingStyle quoting_style) const {
int64_t header_length = 0;
for (int col = 0; col < schema_->num_fields(); col++) {
const std::string& col_name = schema_->field(col)->name();
header_length += col_name.size();
- header_length += CountQuotes(col_name);
+ switch (quoting_style) {
+ case QuotingStyle::None:
+ break;
+ case QuotingStyle::Needed:
+ case QuotingStyle::AllValid:
+ header_length += CountQuotes(col_name);
+ break;
+ }
+ }
+ header_length += kDelimiterCount * (schema_->num_fields() - 1) +
options_.eol.size();
+ switch (quoting_style) {
+ case QuotingStyle::None:
+ break;
+ case QuotingStyle::Needed:
+ case QuotingStyle::AllValid:
+ header_length += kQuoteCount * schema_->num_fields();
+ break;
}
- // header_length + ([quotes + ','] * schema_->num_fields()) + (eol - ',')
- return header_length + (kQuoteDelimiterCount * schema_->num_fields()) +
- (options_.eol.size() - 1);
+ return header_length;
}
Status WriteHeader() {
// Only called once, as part of initialization
- RETURN_NOT_OK(data_buffer_->Resize(CalculateHeaderSize(),
/*shrink_to_fit=*/false));
+
RETURN_NOT_OK(data_buffer_->Resize(CalculateHeaderSize(options_.quoting_header),
+ /*shrink_to_fit=*/false));
char* next = reinterpret_cast<char*>(data_buffer_->mutable_data());
for (int col = 0; col < schema_->num_fields(); ++col) {
- *next++ = '"';
- next = Escape(schema_->field(col)->name(), next);
- *next++ = '"';
+ const std::string& col_name = schema_->field(col)->name();
+ switch (options_.quoting_header) {
+ case QuotingStyle::None:
+ if (StopAtStructuralChar(reinterpret_cast<const
uint8_t*>(col_name.c_str()),
+ col_name.length(), options_.delimiter) !=
+ static_cast<int64_t>(col_name.length())) {
+ return Status::Invalid(
+ "CSV header may not contain structural characters if quoting
style is "
+ "\"None\". See RFC4180. Invalid value: ",
+ col_name);
+ }
+ memcpy(next, col_name.data(), col_name.size());
+ next += col_name.size();
+ break;
+ // The behavior of the Needed quoting_style in CSV data depends on the
data type.
+ // And it is always quoted when the data type is binary. To avoid
semantic
+ // differences, the behavior of Need and AllValid should be consistent.
Review Comment:
```suggestion
// QuotingStyle::Needed is defined as always quoting string/binary
data,
// regardless of whether it contains structural chars.
// We use consistent semantics for header names, which are strings.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]