[jira] [Commented] (ARROW-2145) [Python] Decimal conversion not working for NaN values

ASF GitHub Bot (JIRA) Mon, 26 Feb 2018 13:51:02 -0800

    [ 
https://issues.apache.org/jira/browse/ARROW-2145?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377650#comment-16377650
 ]


ASF GitHub Bot commented on ARROW-2145:
---------------------------------------

wesm commented on a change in pull request #1651: 
ARROW-2145/ARROW-2153/ARROW-2157/ARROW-2160/ARROW-2177: [Python] Decimal 
conversion not working for NaN values
URL: https://github.com/apache/arrow/pull/1651#discussion_r170745843
 
 

 ##########
 File path: cpp/src/arrow/util/decimal.cc
 ##########
 @@ -212,160 +253,124 @@ static void StringToInteger(const std::string& str, 
Decimal128* out) {
   }
 }
 
-Status Decimal128::FromString(const std::string& s, Decimal128* out, int* 
precision,
-                              int* scale) {
-  // Implements this regex: 
"(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?((E|e)(\\+|-)?\\d+)?";
-  if (s.empty()) {
-    return Status::Invalid("Empty string cannot be converted to decimal");
-  }
+static const boost::regex DECIMAL_REGEX(
+    // sign of the number
+    "(?<SIGN>[-+]?)"
 
-  std::string::const_iterator charp = s.cbegin();
-  std::string::const_iterator end = s.cend();
+    // digits around the decimal point
+    
"(((?<LEFT_DIGITS>\\d+)\\.(?<FIRST_RIGHT_DIGITS>\\d*)|\\.(?<SECOND_RIGHT_DIGITS>\\d+)"
+    ")"
 
-  char first_char = *charp;
-  bool is_negative = false;
-  if (first_char == '+' || first_char == '-') {
-    is_negative = first_char == '-';
-    ++charp;
-  }
+    // optional exponent
+    "([eE](?<FIRST_EXP_VALUE>[-+]?\\d+))?"
 
-  if (charp == end) {
-    std::stringstream ss;
-    ss << "Single character: '" << first_char << "' is not a valid decimal 
value";
-    return Status::Invalid(ss.str());
-  }
+    // otherwise
+    "|"
 
-  std::string::const_iterator numeric_string_start = charp;
+    // we're just an integer
+    "(?<INTEGER>\\d+)"
 
-  DCHECK_LT(charp, end);
+    // or an integer with an exponent
+    "(?:[eE](?<SECOND_EXP_VALUE>[-+]?\\d+))?)");
 
-  // skip leading zeros
-  charp = std::find_if_not(charp, end, [](char value) { return value == '0'; 
});
+static inline bool is_zero_character(char c) { return c == '0'; }
 
-  // all zeros and no decimal point
-  if (charp == end) {
-    if (out != NULLPTR) {
-      *out = 0;
-    }
+Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* 
precision,
+                              int32_t* scale) {
+  if (s.empty()) {
+    return Status::Invalid("Empty string cannot be converted to decimal");
+  }
 
-    // Not sure what other libraries assign precision to for this case (this 
case of
-    // a string consisting only of one or more zeros)
+  // case of all zeros
+  if (std::all_of(s.cbegin(), s.cend(), is_zero_character)) {
     if (precision != NULLPTR) {
-      *precision = static_cast<int>(charp - numeric_string_start);
+      *precision = 0;
     }
 
     if (scale != NULLPTR) {
       *scale = 0;
     }
 
+    *out = 0;
     return Status::OK();
   }
 
-  std::string::const_iterator whole_part_start = charp;
+  boost::smatch results;
+  const bool matches = boost::regex_match(s, results, DECIMAL_REGEX);
+
+  if (!matches) {
+    std::stringstream ss;
+    ss << s << " does not match";
+    return Status::Invalid(ss.str());
+  }
+
+  const std::string sign = results["SIGN"];
+  const std::string integer = results["INTEGER"];
 
-  charp = std::find_if_not(charp, end, isdigit);
+  const std::string left_digits = results["LEFT_DIGITS"];
+  const std::string first_right_digits = results["FIRST_RIGHT_DIGITS"];
 
-  std::string::const_iterator whole_part_end = charp;
-  std::string whole_part(whole_part_start, whole_part_end);
+  const std::string second_right_digits = results["SECOND_RIGHT_DIGITS"];
 
-  if (charp != end && *charp == '.') {
-    ++charp;
+  const std::string first_exp_value = results["FIRST_EXP_VALUE"];
+  const std::string second_exp_value = results["SECOND_EXP_VALUE"];
 
-    if (charp == end) {
-      return Status::Invalid(
-          "Decimal point must be followed by at least one base ten digit. 
Reached the "
-          "end of the string.");
-    }
+  std::string whole_part;
+  std::string fractional_part;
+  std::string exponent_value;
 
-    if (std::isdigit(*charp) == 0) {
-      std::stringstream ss;
-      ss << "Decimal point must be followed by a base ten digit. Found '" << 
*charp
-         << "'";
-      return Status::Invalid(ss.str());
-    }
+  if (!integer.empty()) {
+    whole_part = integer;
+  } else if (!left_digits.empty()) {
+    DCHECK(second_right_digits.empty()) << s << " " << second_right_digits;
+    whole_part = left_digits;
+    fractional_part = first_right_digits;
   } else {
-    if (charp != end) {
-      std::stringstream ss;
-      ss << "Expected base ten digit or decimal point but found '" << *charp
-         << "' instead.";
-      return Status::Invalid(ss.str());
-    }
+    DCHECK(first_right_digits.empty()) << s << " " << first_right_digits;
+    fractional_part = second_right_digits;
   }
 
-  std::string::const_iterator fractional_part_start = charp;
-
-  // The rest must be digits or an exponent
-  if (charp != end) {
-    charp = std::find_if_not(charp, end, isdigit);
+  // skip leading zeros before the decimal point
+  std::string::const_iterator without_leading_zeros =
+      std::find_if_not(whole_part.cbegin(), whole_part.cend(), 
is_zero_character);
+  whole_part = std::string(without_leading_zeros, whole_part.cend());
 
-    // The while loop has ended before the end of the string which means we've 
hit a
-    // character that isn't a base ten digit or "E" for exponent
-    if (charp != end && *charp != 'E' && *charp != 'e') {
-      std::stringstream ss;
-      ss << "Found non base ten digit character '" << *charp
-         << "' before the end of the string";
-      return Status::Invalid(ss.str());
-    }
+  if (!first_exp_value.empty()) {
+    exponent_value = first_exp_value;
+  } else {
+    exponent_value = second_exp_value;
   }
 
-  std::string::const_iterator fractional_part_end = charp;
-  std::string fractional_part(fractional_part_start, fractional_part_end);
-
   if (precision != NULLPTR) {
-    *precision = static_cast<int>(whole_part.size() + fractional_part.size());
+    *precision = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
   }
 
-  if (charp != end) {
-    // we must have an exponent, if this aborts then we have somehow not 
caught this and
-    // raised a proper error
-    DCHECK(*charp == 'E' || *charp == 'e');
-
-    ++charp;
-
-    const char value = *charp;
-    const bool starts_with_plus_or_minus = value == '+' || value == '-';
-
-    // we use this to construct the adjusted exponent integer later
-    std::string::const_iterator digit_start = charp;
-
-    // skip plus or minus
-    charp += starts_with_plus_or_minus;
-
-    // confirm that the rest of the characters are digits
-    charp = std::find_if_not(charp, end, isdigit);
-
-    if (charp != end) {
-      // we have something other than digits here
-      std::stringstream ss;
-      ss << "Found non decimal digit exponent value '" << *charp << "'";
-      return Status::Invalid(ss.str());
-    }
-
-    if (scale != NULLPTR) {
-      // compute the scale from the adjusted exponent
-      std::string adjusted_exponent_string(digit_start, end);
-      DCHECK(std::all_of(adjusted_exponent_string.cbegin() + 
starts_with_plus_or_minus,
-                         adjusted_exponent_string.cend(), isdigit))
-          << "Non decimal digit character found in " << 
adjusted_exponent_string;
-      const auto adjusted_exponent =
-          static_cast<int32_t>(std::stol(adjusted_exponent_string));
-      const auto len = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
-
+  if (scale != NULLPTR) {
+    if (!exponent_value.empty()) {
+      auto adjusted_exponent = static_cast<int32_t>(std::stol(exponent_value));
+      auto len = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
       *scale = -adjusted_exponent + len - 1;
-    }
-  } else {
-    if (scale != NULLPTR) {
-      *scale = static_cast<int>(fractional_part.size());
+    } else {
+      *scale = static_cast<int32_t>(fractional_part.size());
     }
   }
 
   if (out != NULLPTR) {
-    // zero out in case we've passed in a previously used value
     *out = 0;
     StringToInteger(whole_part + fractional_part, out);
-    if (is_negative) {
+    if (sign == "-") {
       out->Negate();
     }
+
+    if (scale != NULLPTR && *scale < 0) {
+      const int32_t abs_scale = std::abs(*scale);
+      *out *= ScaleMultipliers[abs_scale];
+
+      if (precision != NULLPTR) {
 
 Review comment:
   FWIW it's not necessary to use this `NULLPTR` macro outside headers I don't 
believe 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [Python] Decimal conversion not working for NaN values
> ------------------------------------------------------
>
>                 Key: ARROW-2145
>                 URL: https://issues.apache.org/jira/browse/ARROW-2145
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++, Python
>    Affects Versions: 0.8.0
>            Reporter: Antony Mayi
>            Assignee: Phillip Cloud
>            Priority: Major
>              Labels: pull-request-available
>
> {code:python}
> import pyarrow as pa
> import pandas as pd
> import decimal
> pa.Table.from_pandas(pd.DataFrame({'a': [decimal.Decimal('1.1'), 
> decimal.Decimal('NaN')]}))
> {code}
> throws following exception:
> {code}
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
>   File "pyarrow/table.pxi", line 875, in pyarrow.lib.Table.from_pandas 
> (/arrow/python/build/temp.linux-x86_64-3.6/lib.cxx:44927)
>   File "/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 350, in 
> dataframe_to_arrays
>     convert_types)]
>   File "/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 349, in 
> <listcomp>
>     for c, t in zip(columns_to_convert,
>   File "/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 345, in 
> convert_column
>     return pa.array(col, from_pandas=True, type=ty)
>   File "pyarrow/array.pxi", line 170, in pyarrow.lib.array 
> (/arrow/python/build/temp.linux-x86_64-3.6/lib.cxx:29224)
>   File "pyarrow/array.pxi", line 70, in pyarrow.lib._ndarray_to_array 
> (/arrow/python/build/temp.linux-x86_64-3.6/lib.cxx:28465)
>   File "pyarrow/error.pxi", line 98, in pyarrow.lib.check_status 
> (/arrow/python/build/temp.linux-x86_64-3.6/lib.cxx:9068)
> pyarrow.lib.ArrowException: Unknown error: an integer is required (got type 
> str)
> {code}
> Same problem with other special decimal values like {{infinity}}.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (ARROW-2145) [Python] Decimal conversion not working for NaN values

Reply via email to