ARROW-1061: [C++] Harden decimal parsing against invalid strings Author: Phillip Cloud <cpcl...@gmail.com>
Closes #708 from cpcloud/ARROW-1061 and squashes the following commits: c93d1db8 [Phillip Cloud] ARROW-1061: [C++] Harden decimal parsing against invalid strings Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/1cb18d52 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/1cb18d52 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/1cb18d52 Branch: refs/heads/master Commit: 1cb18d528bbb61943273eb8ca8d9dd1a8c631485 Parents: 33117d9 Author: Phillip Cloud <cpcl...@gmail.com> Authored: Tue May 23 20:29:44 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Wed May 31 13:45:48 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/util/decimal-test.cc | 80 +++++++++++++++++++++++++++++++++ cpp/src/arrow/util/decimal.cc | 75 ++++++++++++++++++++++++------- 2 files changed, 139 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/1cb18d52/cpp/src/arrow/util/decimal-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index 5d95c2c..72107a2 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -54,6 +54,29 @@ TYPED_TEST(DecimalTest, TestFromString) { ASSERT_EQ(scale, 5); } +TEST(DecimalTest, TestStringStartingWithPlus) { + std::string plus_value("+234.234"); + Decimal32 out; + int scale; + int precision; + ASSERT_OK(FromString(plus_value, &out, &precision, &scale)); + ASSERT_EQ(234234, out.value); + ASSERT_EQ(6, precision); + ASSERT_EQ(3, scale); +} + +TEST(DecimalTest, TestStringStartingWithPlus128) { + std::string plus_value("+2342394230592.232349023094"); + decimal::int128_t expected_value("2342394230592232349023094"); + Decimal128 out; + int scale; + int precision; + ASSERT_OK(FromString(plus_value, &out, &precision, &scale)); + ASSERT_EQ(expected_value, out.value); + ASSERT_EQ(25, precision); + ASSERT_EQ(12, scale); +} + TEST(DecimalTest, TestStringToInt32) { int32_t value = 0; StringToInteger("123", "456", 1, &value); @@ -160,6 +183,63 @@ TEST(DecimalTest, TestDecimal128StringAndBytesRoundTrip) { ASSERT_EQ(expected.value, result.value); } +TEST(DecimalTest, TestInvalidInputMinus) { + std::string invalid_value("-"); + Decimal32 out; + Status status = decimal::FromString(invalid_value, &out); + ASSERT_RAISES(Invalid, status); +} + +TEST(DecimalTest, TestInvalidInputDot) { + std::string invalid_value("0.0.0"); + Decimal32 out; + Status status = decimal::FromString(invalid_value, &out); + ASSERT_RAISES(Invalid, status); +} + +TEST(DecimalTest, TestInvalidInputEmbeddedMinus) { + std::string invalid_value("0-13-32"); + Decimal32 out; + Status status = decimal::FromString(invalid_value, &out); + ASSERT_RAISES(Invalid, status); +} + +TEST(DecimalTest, TestInvalidInputSingleChar) { + std::string invalid_value("a"); + Decimal32 out; + Status status = decimal::FromString(invalid_value, &out); + ASSERT_RAISES(Invalid, status); +} + +TEST(DecimalTest, TestInvalidInputWithValidSubstring) { + std::string invalid_value("-23092.235-"); + Decimal32 out; + Status status = decimal::FromString(invalid_value, &out); + auto msg = status.message(); + ASSERT_RAISES(Invalid, status); +} + +TEST(DecimalTest, TestInvalidInputWithMinusPlus) { + std::string invalid_value("-+23092.235"); + Decimal32 out; + Status status = decimal::FromString(invalid_value, &out); + ASSERT_RAISES(Invalid, status); +} + +TEST(DecimalTest, TestInvalidInputWithPlusMinus) { + std::string invalid_value("+-23092.235"); + Decimal32 out; + Status status = decimal::FromString(invalid_value, &out); + ASSERT_RAISES(Invalid, status); +} + +TEST(DecimalTest, TestInvalidInputWithLeadingZeros) { + std::string invalid_value("00a"); + Decimal32 out; + Status status = decimal::FromString(invalid_value, &out); + ASSERT_RAISES(Invalid, status); +} + template <typename T> class DecimalZerosTest : public ::testing::Test {}; TYPED_TEST_CASE(DecimalZerosTest, DecimalTypes); http://git-wip-us.apache.org/repos/asf/arrow/blob/1cb18d52/cpp/src/arrow/util/decimal.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 734df13..72ede35 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -29,18 +29,27 @@ ARROW_EXPORT Status FromString( } int8_t sign = 1; - auto charp = s.c_str(); - auto end = charp + s.length(); + std::string::const_iterator charp = s.cbegin(); + std::string::const_iterator end = s.cend(); - if (*charp == '+' || *charp == '-') { - if (*charp == '-') { sign = -1; } + char first_char = *charp; + if (first_char == '+' || first_char == '-') { + if (first_char == '-') { sign = -1; } ++charp; } - auto numeric_string_start = charp; + if (charp == end) { + std::stringstream ss; + ss << "Single character: '" << first_char << "' is not a valid decimal value"; + return Status::Invalid(ss.str()); + } + + std::string::const_iterator numeric_string_start = charp; + + DCHECK_LT(charp, end); // skip leading zeros - while (*charp == '0') { + while (charp != end && *charp == '0') { ++charp; } @@ -59,25 +68,59 @@ ARROW_EXPORT Status FromString( return Status::OK(); } - auto whole_part_start = charp; - while (isdigit(*charp)) { + std::string::const_iterator whole_part_start = charp; + + while (charp != end && isdigit(*charp)) { ++charp; } - auto whole_part_end = charp; + + std::string::const_iterator whole_part_end = charp; std::string whole_part(whole_part_start, whole_part_end); - if (*charp == '.') { + if (charp != end && *charp == '.') { ++charp; + + if (charp == end) { + return Status::Invalid( + "Decimal point must be followed by at least one base ten digit. Reached the " + "end of the string."); + } + + if (!isdigit(*charp)) { + std::stringstream ss; + ss << "Decimal point must be followed by a base ten digit. Found '" << *charp + << "'"; + return Status::Invalid(ss.str()); + } } else { - // no decimal point - DCHECK_EQ(charp, end); + if (charp != end) { + std::stringstream ss; + ss << "Expected base ten digit or decimal point but found '" << *charp + << "' instead."; + return Status::Invalid(ss.str()); + } } - auto fractional_part_start = charp; - while (isdigit(*charp)) { - ++charp; + std::string::const_iterator fractional_part_start = charp; + + // The rest must be digits, because if we have a decimal point it must be followed by + // digits + if (charp != end) { + while (charp != end && isdigit(*charp)) { + ++charp; + } + + // The while loop has ended before the end of the string which means we've hit a + // character that isn't a base ten digit + if (charp != end) { + std::stringstream ss; + ss << "Found non base ten digit character '" << *charp + << "' before the end of the string"; + return Status::Invalid(ss.str()); + } } - auto fractional_part_end = charp; + + std::string::const_iterator fractional_part_end = charp; std::string fractional_part(fractional_part_start, fractional_part_end); if (precision != nullptr) {