[ 
https://issues.apache.org/jira/browse/ARROW-2224?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16434100#comment-16434100
 ] 

ASF GitHub Bot commented on ARROW-2224:
---------------------------------------

pitrou closed pull request #1880: ARROW-2224: [C++] Remove boost-regex 
dependency
URL: https://github.com/apache/arrow/pull/1880
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/README.md b/cpp/README.md
index 8018efd9e..daeeade72 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -35,7 +35,6 @@ On Ubuntu/Debian you can install the requirements with:
 ```shell
 sudo apt-get install cmake \
      libboost-dev \
-     libboost-regex-dev \
      libboost-filesystem-dev \
      libboost-system-dev
 ```
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 129174c8d..020e0ed44 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -157,11 +157,8 @@ if (ARROW_BOOST_VENDORED)
     
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_system${CMAKE_STATIC_LIBRARY_SUFFIX}")
   set(BOOST_STATIC_FILESYSTEM_LIBRARY
     
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_filesystem${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  set(BOOST_STATIC_REGEX_LIBRARY
-    
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_regex${CMAKE_STATIC_LIBRARY_SUFFIX}")
   set(BOOST_SYSTEM_LIBRARY "${BOOST_STATIC_SYSTEM_LIBRARY}")
   set(BOOST_FILESYSTEM_LIBRARY "${BOOST_STATIC_FILESYSTEM_LIBRARY}")
-  set(BOOST_REGEX_LIBRARY "${BOOST_STATIC_REGEX_LIBRARY}")
   if (ARROW_BOOST_HEADER_ONLY)
     set(BOOST_BUILD_PRODUCTS)
     set(BOOST_CONFIGURE_COMMAND "")
@@ -169,12 +166,11 @@ if (ARROW_BOOST_VENDORED)
   else()
     set(BOOST_BUILD_PRODUCTS
       ${BOOST_SYSTEM_LIBRARY}
-      ${BOOST_FILESYSTEM_LIBRARY}
-      ${BOOST_REGEX_LIBRARY})
+      ${BOOST_FILESYSTEM_LIBRARY})
     set(BOOST_CONFIGURE_COMMAND
       "./bootstrap.sh"
       "--prefix=${BOOST_PREFIX}"
-      "--with-libraries=filesystem,system,regex")
+      "--with-libraries=filesystem,system")
     if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
       set(BOOST_BUILD_VARIANT "debug")
     else()
@@ -214,19 +210,16 @@ else()
     if (ARROW_BOOST_HEADER_ONLY)
       find_package(Boost REQUIRED)
     else()
-      find_package(Boost COMPONENTS system filesystem regex REQUIRED)
+      find_package(Boost COMPONENTS system filesystem REQUIRED)
       if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
         set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG})
         set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG})
-        set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
       else()
         set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE})
         set(BOOST_SHARED_FILESYSTEM_LIBRARY 
${Boost_FILESYSTEM_LIBRARY_RELEASE})
-        set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
       endif()
       set(BOOST_SYSTEM_LIBRARY boost_system_shared)
       set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared)
-      set(BOOST_REGEX_LIBRARY boost_regex_shared)
     endif()
   else()
     # Find static boost headers and libs
@@ -235,19 +228,16 @@ else()
     if (ARROW_BOOST_HEADER_ONLY)
       find_package(Boost REQUIRED)
     else()
-      find_package(Boost COMPONENTS system filesystem regex REQUIRED)
+      find_package(Boost COMPONENTS system filesystem REQUIRED)
       if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
         set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG})
         set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG})
-        set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
       else()
         set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE})
         set(BOOST_STATIC_FILESYSTEM_LIBRARY 
${Boost_FILESYSTEM_LIBRARY_RELEASE})
-        set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
       endif()
       set(BOOST_SYSTEM_LIBRARY boost_system_static)
       set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static)
-      set(BOOST_REGEX_LIBRARY boost_regex_static)
     endif()
   endif()
 endif()
@@ -264,11 +254,7 @@ if (NOT ARROW_BOOST_HEADER_ONLY)
       STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}"
       SHARED_LIB "${BOOST_SHARED_FILESYSTEM_LIBRARY}")
 
-  ADD_THIRDPARTY_LIB(boost_regex
-      STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}"
-      SHARED_LIB "${BOOST_SHARED_REGEX_LIBRARY}")
-
-  SET(ARROW_BOOST_LIBS boost_system boost_filesystem boost_regex)
+  SET(ARROW_BOOST_LIBS boost_system boost_filesystem)
 endif()
 
 include_directories(SYSTEM ${Boost_INCLUDE_DIR})
diff --git a/cpp/src/arrow/util/CMakeLists.txt 
b/cpp/src/arrow/util/CMakeLists.txt
index 41c27a58f..896952705 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -60,5 +60,6 @@ ADD_ARROW_TEST(rle-encoding-test)
 ADD_ARROW_TEST(stl-util-test)
 
 ADD_ARROW_BENCHMARK(bit-util-benchmark)
+ADD_ARROW_BENCHMARK(decimal-benchmark)
 
 add_subdirectory(variant)
diff --git a/cpp/src/arrow/util/decimal-benchmark.cc 
b/cpp/src/arrow/util/decimal-benchmark.cc
new file mode 100644
index 000000000..3129536cf
--- /dev/null
+++ b/cpp/src/arrow/util/decimal-benchmark.cc
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/util/decimal.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace Decimal {
+
+static void BM_FromString(benchmark::State& state) {  // NOLINT non-const 
reference
+  std::vector<std::string> values = {"0", "1.23", "12.345e6", "-12.345e-6"};
+
+  while (state.KeepRunning()) {
+    for (const auto& value : values) {
+      Decimal128 dec;
+      int32_t scale, precision;
+      ARROW_UNUSED(Decimal128::FromString(value, &dec, &scale, &precision));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * values.size());
+}
+
+BENCHMARK(BM_FromString)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+
+}  // namespace Decimal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/decimal-test.cc 
b/cpp/src/arrow/util/decimal-test.cc
index 6db46d485..2829e4ae6 100644
--- a/cpp/src/arrow/util/decimal-test.cc
+++ b/cpp/src/arrow/util/decimal-test.cc
@@ -205,9 +205,7 @@ TEST(DecimalZerosTest, LeadingZerosDecimalPoint) {
   int32_t precision;
   int32_t scale;
   ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale));
-  // We explicitly do not support this for now, otherwise this would be 
ASSERT_EQ
   ASSERT_EQ(4, precision);
-
   ASSERT_EQ(4, scale);
   ASSERT_EQ(0, d);
 }
diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc
index 48380a9c9..9e5e3ddb3 100644
--- a/cpp/src/arrow/util/decimal.cc
+++ b/cpp/src/arrow/util/decimal.cc
@@ -23,8 +23,6 @@
 #include <limits>
 #include <sstream>
 
-#include <boost/regex.hpp>
-
 #include "arrow/util/bit-util.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
@@ -253,112 +251,126 @@ static void StringToInteger(const std::string& str, 
Decimal128* out) {
   }
 }
 
-static const boost::regex DECIMAL_REGEX(
-    // sign of the number
-    "(?<SIGN>[-+]?)"
-
-    // digits around the decimal point
-    
"(((?<LEFT_DIGITS>\\d+)\\.(?<FIRST_RIGHT_DIGITS>\\d*)|\\.(?<SECOND_RIGHT_DIGITS>\\d+)"
-    ")"
+namespace {
 
-    // optional exponent
-    "([eE](?<FIRST_EXP_VALUE>[-+]?\\d+))?"
+struct DecimalComponents {
+  std::string sign;
+  std::string whole_digits;
+  std::string fractional_digits;
+  std::string exponent_sign;
+  std::string exponent_digits;
+};
 
-    // otherwise
-    "|"
+inline bool IsSign(char c) { return c == '-' || c == '+'; }
 
-    // we're just an integer
-    "(?<INTEGER>\\d+)"
+inline bool IsDot(char c) { return c == '.'; }
 
-    // or an integer with an exponent
-    "(?:[eE](?<SECOND_EXP_VALUE>[-+]?\\d+))?)");
+inline bool IsDigit(char c) { return c >= '0' && c <= '9'; }
 
-static inline bool is_zero_character(char c) { return c == '0'; }
+inline bool StartsExponent(char c) { return c == 'e' || c == 'E'; }
 
-Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* 
precision,
-                              int32_t* scale) {
-  if (s.empty()) {
-    return Status::Invalid("Empty string cannot be converted to decimal");
+inline size_t ParseDigitsRun(const char* s, size_t start, size_t size, 
std::string* out) {
+  size_t pos;
+  for (pos = start; pos < size; ++pos) {
+    if (!IsDigit(s[pos])) {
+      break;
+    }
   }
+  *out = std::string(s + start, pos - start);
+  return pos;
+}
 
-  // case of all zeros
-  if (std::all_of(s.cbegin(), s.cend(), is_zero_character)) {
-    if (precision != nullptr) {
-      *precision = 0;
-    }
+bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* 
out) {
+  size_t pos = 0;
 
-    if (scale != nullptr) {
-      *scale = 0;
+  if (size == 0) {
+    return false;
+  }
+  // Sign of the number
+  if (IsSign(s[pos])) {
+    out->sign = std::string(s + pos, 1);
+    ++pos;
+  }
+  // First run of digits
+  pos = ParseDigitsRun(s, pos, size, &out->whole_digits);
+  if (pos == size) {
+    return !out->whole_digits.empty();
+  }
+  // Optional dot (if given in fractional form)
+  bool has_dot = IsDot(s[pos]);
+  if (has_dot) {
+    // Second run of digits
+    ++pos;
+    pos = ParseDigitsRun(s, pos, size, &out->fractional_digits);
+  }
+  if (out->whole_digits.empty() && out->fractional_digits.empty()) {
+    // Need at least some digits (whole or fractional)
+    return false;
+  }
+  if (pos == size) {
+    return true;
+  }
+  // Optional exponent
+  if (StartsExponent(s[pos])) {
+    ++pos;
+    if (pos == size) {
+      return false;
+    }
+    // Optional exponent sign
+    if (IsSign(s[pos])) {
+      out->exponent_sign = std::string(s + pos, 1);
+      ++pos;
+    }
+    pos = ParseDigitsRun(s, pos, size, &out->exponent_digits);
+    if (out->exponent_digits.empty()) {
+      // Need some exponent digits
+      return false;
     }
-
-    *out = 0;
-    return Status::OK();
   }
+  return pos == size;
+}
 
-  boost::smatch results;
-  const bool matches = boost::regex_match(s, results, DECIMAL_REGEX);
+}  // namespace
 
-  if (!matches) {
-    std::stringstream ss;
-    ss << "The string " << s << " is not a valid decimal number";
-    return Status::Invalid(ss.str());
+Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* 
precision,
+                              int32_t* scale) {
+  if (s.empty()) {
+    return Status::Invalid("Empty string cannot be converted to decimal");
   }
 
-  const std::string sign = results["SIGN"];
-  const std::string integer = results["INTEGER"];
-
-  const std::string left_digits = results["LEFT_DIGITS"];
-  const std::string first_right_digits = results["FIRST_RIGHT_DIGITS"];
-
-  const std::string second_right_digits = results["SECOND_RIGHT_DIGITS"];
-
-  const std::string first_exp_value = results["FIRST_EXP_VALUE"];
-  const std::string second_exp_value = results["SECOND_EXP_VALUE"];
-
-  std::string whole_part;
-  std::string fractional_part;
-  std::string exponent_value;
-
-  if (!integer.empty()) {
-    whole_part = integer;
-  } else if (!left_digits.empty()) {
-    DCHECK(second_right_digits.empty()) << s << " " << second_right_digits;
-    whole_part = left_digits;
-    fractional_part = first_right_digits;
-  } else {
-    DCHECK(first_right_digits.empty()) << s << " " << first_right_digits;
-    fractional_part = second_right_digits;
+  DecimalComponents dec;
+  if (!ParseDecimalComponents(s.data(), s.size(), &dec)) {
+    std::stringstream ss;
+    ss << "The string '" << s << "' is not a valid decimal number";
+    return Status::Invalid(ss.str());
   }
+  std::string exponent_value = dec.exponent_sign + dec.exponent_digits;
 
-  // skip leading zeros before the decimal point
-  std::string::const_iterator without_leading_zeros =
-      std::find_if_not(whole_part.cbegin(), whole_part.cend(), 
is_zero_character);
-  whole_part = std::string(without_leading_zeros, whole_part.cend());
-
-  if (!first_exp_value.empty()) {
-    exponent_value = first_exp_value;
-  } else {
-    exponent_value = second_exp_value;
+  // Count number of significant digits (without leading zeros)
+  size_t first_non_zero = dec.whole_digits.find_first_not_of('0');
+  size_t significant_digits = dec.fractional_digits.size();
+  if (first_non_zero != std::string::npos) {
+    significant_digits += dec.whole_digits.size() - first_non_zero;
   }
 
   if (precision != nullptr) {
-    *precision = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
+    *precision = static_cast<int32_t>(significant_digits);
   }
 
   if (scale != nullptr) {
     if (!exponent_value.empty()) {
       auto adjusted_exponent = static_cast<int32_t>(std::stol(exponent_value));
-      auto len = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
+      auto len = static_cast<int32_t>(significant_digits);
       *scale = -adjusted_exponent + len - 1;
     } else {
-      *scale = static_cast<int32_t>(fractional_part.size());
+      *scale = static_cast<int32_t>(dec.fractional_digits.size());
     }
   }
 
   if (out != nullptr) {
     *out = 0;
-    StringToInteger(whole_part + fractional_part, out);
-    if (sign == "-") {
+    StringToInteger(dec.whole_digits + dec.fractional_digits, out);
+    if (dec.sign == "-") {
       out->Negate();
     }
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Get rid of boost regex usage
> ----------------------------------
>
>                 Key: ARROW-2224
>                 URL: https://issues.apache.org/jira/browse/ARROW-2224
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: C++
>            Reporter: Phillip Cloud
>            Assignee: Phillip Cloud
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.10.0
>
>
> We're using {{boost::regex}} to parse decimal strings for {{decimal128}} 
> types. We should use {{libre2}} instead.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to