This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git


The following commit(s) were added to refs/heads/main by this push:
     new 409f8398 feat(c/driver/postgresql): Handle NUMERIC type by converting 
to string (#883)
409f8398 is described below

commit 409f83986a9c756cf75ad3153c46c7aded5a2463
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Jul 11 15:08:50 2023 -0300

    feat(c/driver/postgresql): Handle NUMERIC type by converting to string 
(#883)
    
    Closes #767.
    
    This PR implements NUMERIC Postgres columns by converting them to
    strings. This is not ideal but there is also no Arrow type that can be
    easily computed before an entire column of Postgres numerics has been
    resolved. A future extension could attempt decimal conversion in the
    driver but for now a string value is better than is what currently gets
    returned (a blob of the COPY data).
    
    Example support via the R bindings:
    
    ``` r
    # pak::pak("apache/arrow-adbc/r/adbcpostgresql#883")
    # docker compose up postgres_test
    library(adbcdrivermanager)
    
    uri <- 
"postgresql://localhost:5432/postgres?user=postgres&password=password"
    db <- adbc_database_init(adbcpostgresql::adbcpostgresql(), uri = uri)
    
    db |>
      read_adbc(
        "SELECT CAST(col AS NUMERIC) AS col FROM
          (VALUES (-123.456), (123.456), (0.4567890), (1200000), ('nan'), 
('-inf'), ('inf'), (NULL))
          AS drvd(col)"
      ) |>
      as.data.frame()
    #>         col
    #> 1  -123.456
    #> 2   123.456
    #> 3 0.4567890
    #> 4   1200000
    #> 5       nan
    #> 6      -inf
    #> 7       inf
    #> 8      <NA>
    ```
    
    <sup>Created on 2023-07-08 with [reprex
    v2.0.2](https://reprex.tidyverse.org)</sup>
---
 c/driver/postgresql/postgres_copy_reader.h       | 170 +++++++++++++++++++++++
 c/driver/postgresql/postgres_copy_reader_test.cc |  82 +++++++++++
 c/driver/postgresql/postgres_type.h              |   5 +
 c/driver/postgresql/postgresql_test.cc           |  12 ++
 4 files changed, 269 insertions(+)

diff --git a/c/driver/postgresql/postgres_copy_reader.h 
b/c/driver/postgresql/postgres_copy_reader.h
index 18d1fbd4..7d3844f8 100644
--- a/c/driver/postgresql/postgres_copy_reader.h
+++ b/c/driver/postgresql/postgres_copy_reader.h
@@ -17,6 +17,7 @@
 
 #pragma once
 
+#include <algorithm>
 #include <cerrno>
 #include <cstdint>
 #include <memory>
@@ -211,6 +212,172 @@ class PostgresCopyNetworkEndianFieldReader : public 
PostgresCopyFieldReader {
   }
 };
 
+// Converts COPY resulting from the Postgres NUMERIC type into a string.
+// Rewritten based on the Postgres implementation of NUMERIC cast to string in
+// src/backend/utils/adt/numeric.c : get_str_from_var() (Note that in the 
initial source,
+// DEC_DIGITS is always 4 and DBASE is always 10000).
+//
+// Briefly, the Postgres representation of "numeric" is an array of int16_t 
("digits")
+// from most significant to least significant. Each "digit" is a value between 
0000 and
+// 9999. There are weight + 1 digits before the decimal point and dscale 
digits after the
+// decimal point. Both of those values can be zero or negative. A "sign" 
component
+// encodes the positive or negativeness of the value and is also used to 
encode special
+// values (inf, -inf, and nan).
+class PostgresCopyNumericFieldReader : public PostgresCopyFieldReader {
+ public:
+  ArrowErrorCode Read(ArrowBufferView* data, int32_t field_size_bytes, 
ArrowArray* array,
+                      ArrowError* error) override {
+    // -1 for NULL
+    if (field_size_bytes < 0) {
+      return ArrowArrayAppendNull(array, 1);
+    }
+
+    // Read the input
+    if (data->size_bytes < static_cast<int64_t>(4 * sizeof(int16_t))) {
+      ArrowErrorSet(error,
+                    "Expected at least %d bytes of field data for numeric copy 
data but "
+                    "only %d bytes of input remain",
+                    static_cast<int>(4 * sizeof(int16_t)),
+                    static_cast<int>(data->size_bytes));  // 
NOLINT(runtime/int)
+      return EINVAL;
+    }
+
+    int16_t ndigits = ReadUnsafe<int16_t>(data);
+    int16_t weight = ReadUnsafe<int16_t>(data);
+    uint16_t sign = ReadUnsafe<uint16_t>(data);
+    uint16_t dscale = ReadUnsafe<uint16_t>(data);
+
+    if (data->size_bytes < static_cast<int64_t>(ndigits * sizeof(int16_t))) {
+      ArrowErrorSet(error,
+                    "Expected at least %d bytes of field data for numeric 
digits copy "
+                    "data but only %d bytes of input remain",
+                    static_cast<int>(ndigits * sizeof(int16_t)),
+                    static_cast<int>(data->size_bytes));  // 
NOLINT(runtime/int)
+      return EINVAL;
+    }
+
+    digits_.clear();
+    for (int16_t i = 0; i < ndigits; i++) {
+      digits_.push_back(ReadUnsafe<int16_t>(data));
+    }
+
+    // Handle special values
+    std::string special_value;
+    switch (sign) {
+      case kNumericNAN:
+        special_value = std::string("nan");
+        break;
+      case kNumericPinf:
+        special_value = std::string("inf");
+        break;
+      case kNumericNinf:
+        special_value = std::string("-inf");
+        break;
+      case kNumericPos:
+      case kNumericNeg:
+        special_value = std::string("");
+        break;
+      default:
+        ArrowErrorSet(error,
+                      "Unexpected value for sign read from Postgres numeric 
field: %d",
+                      static_cast<int>(sign));
+        return EINVAL;
+    }
+
+    if (!special_value.empty()) {
+      NANOARROW_RETURN_NOT_OK(
+          ArrowBufferAppend(data_, special_value.data(), 
special_value.size()));
+      NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(offsets_, 
data_->size_bytes));
+      return AppendValid(array);
+    }
+
+    // Calculate string space requirement
+    int64_t max_chars_required = std::max<int64_t>(1, (weight + 1) * 
kDecDigits);
+    max_chars_required += dscale + kDecDigits + 2;
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(data_, max_chars_required));
+    char* out0 = reinterpret_cast<char*>(data_->data + data_->size_bytes);
+    char* out = out0;
+
+    // Build output string in-place, starting with the negative sign
+    if (sign == kNumericNeg) {
+      *out++ = '-';
+    }
+
+    // ...then digits before the decimal point
+    int d;
+    int d1;
+    int16_t dig;
+
+    if (weight < 0) {
+      d = weight + 1;
+      *out++ = '0';
+    } else {
+      for (d = 0; d <= weight; d++) {
+        if (d < ndigits) {
+          dig = digits_[d];
+        } else {
+          dig = 0;
+        }
+
+        // To strip leading zeroes
+        int append = (d > 0);
+
+        for (const auto pow10 : {1000, 100, 10, 1}) {
+          d1 = dig / pow10;
+          dig -= d1 * pow10;
+          append |= (d1 > 0);
+          if (append) {
+            *out++ = d1 + '0';
+          }
+        }
+      }
+    }
+
+    // ...then the decimal point + digits after it. This may write more digits
+    // than specified by dscale so we need to keep track of how many we want to
+    // keep here.
+    int64_t actual_chars_required = out - out0;
+
+    if (dscale > 0) {
+      *out++ = '.';
+      actual_chars_required += dscale + 1;
+
+      for (int i = 0; i < dscale; i++, d++, i += kDecDigits) {
+        if (d >= 0 && d < ndigits) {
+          dig = digits_[d];
+        } else {
+          dig = 0;
+        }
+
+        for (const auto pow10 : {1000, 100, 10, 1}) {
+          d1 = dig / pow10;
+          dig -= d1 * pow10;
+          *out++ = d1 + '0';
+        }
+      }
+    }
+
+    // Update data buffer size and add offsets
+    data_->size_bytes += actual_chars_required;
+    NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(offsets_, 
data_->size_bytes));
+    return AppendValid(array);
+  }
+
+ private:
+  std::vector<int16_t> digits_;
+
+  // Number of decimal digits per Postgres digit
+  static const int kDecDigits = 4;
+  // The "base" of the Postgres representation (i.e., each "digit" is 0 to 
9999)
+  static const int kNBase = 10000;
+  // Valid values for the sign component
+  static const uint16_t kNumericPos = 0x0000;
+  static const uint16_t kNumericNeg = 0x4000;
+  static const uint16_t kNumericNAN = 0xC000;
+  static const uint16_t kNumericPinf = 0xD000;
+  static const uint16_t kNumericNinf = 0xF000;
+};
+
 // Reader for Pg->Arrow conversions whose Arrow representation is simply the
 // bytes of the field representation. This can be used with binary and string
 // Arrow types and any Postgres type.
@@ -574,6 +741,9 @@ static inline ArrowErrorCode MakeCopyFieldReader(const 
PostgresType& pg_type,
         case PostgresTypeId::kName:
           *out = new PostgresCopyBinaryFieldReader();
           return NANOARROW_OK;
+        case PostgresTypeId::kNumeric:
+          *out = new PostgresCopyNumericFieldReader();
+          return NANOARROW_OK;
         default:
           return ErrorCantConvert(error, pg_type, schema_view);
       }
diff --git a/c/driver/postgresql/postgres_copy_reader_test.cc 
b/c/driver/postgresql/postgres_copy_reader_test.cc
index 26a1ab39..44ad0601 100644
--- a/c/driver/postgresql/postgres_copy_reader_test.cc
+++ b/c/driver/postgresql/postgres_copy_reader_test.cc
@@ -334,6 +334,88 @@ TEST(PostgresCopyUtilsTest, 
PostgresCopyReadDoublePrecision) {
   ASSERT_EQ(data_buffer[4], 0);
 }
 
+// For full coverage, ensure that this contains NUMERIC examples that:
+// - Have >= four zeroes to the left of the decimal point
+// - Have >= four zeroes to the right of the decimal point
+// - Include special values (nan, -inf, inf, NULL)
+// - Have >= four trailing zeroes to the right of the decimal point
+// - Have >= four leading zeroes before the first digit to the right of the 
decimal point
+// - Is < 0 (negative)
+// COPY (SELECT CAST(col AS NUMERIC) AS col FROM (  VALUES (1000000), 
('0.00001234'),
+// ('1.0000'), (-123.456), (123.456), ('nan'), ('-inf'), ('inf'), (NULL)) AS 
drvd(col)) TO
+// STDOUT WITH (FORMAT binary);
+static uint8_t kTestPgCopyNumeric[] = {
+    0x50, 0x47, 0x43, 0x4f, 0x50, 0x59, 0x0a, 0xff, 0x0d, 0x0a, 0x00, 0x00, 
0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 
0x01, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x01, 0x00, 0x00, 0x00, 
0x0a, 0x00,
+    0x01, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x08, 0x04, 0xd2, 0x00, 0x01, 0x00, 
0x00, 0x00,
+    0x0a, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x01, 0x00, 
0x01, 0x00,
+    0x00, 0x00, 0x0c, 0x00, 0x02, 0x00, 0x00, 0x40, 0x00, 0x00, 0x03, 0x00, 
0x7b, 0x11,
+    0xd0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x02, 0x00, 0x00, 0x00, 
0x00, 0x00,
+    0x03, 0x00, 0x7b, 0x11, 0xd0, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 
0x00, 0x00,
+    0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 
0x00, 0x00,
+    0x00, 0xf0, 0x00, 0x00, 0x20, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 
0x00, 0x00,
+    0x00, 0xd0, 0x00, 0x00, 0x20, 0x00, 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff};
+
+TEST(PostgresCopyUtilsTest, PostgresCopyReadNumeric) {
+  ArrowBufferView data;
+  data.data.as_uint8 = kTestPgCopyNumeric;
+  data.size_bytes = sizeof(kTestPgCopyNumeric);
+
+  auto col_type = PostgresType(PostgresTypeId::kNumeric);
+  PostgresType input_type(PostgresTypeId::kRecord);
+  input_type.AppendChild("col", col_type);
+
+  PostgresCopyStreamTester tester;
+  ASSERT_EQ(tester.Init(input_type), NANOARROW_OK);
+  ASSERT_EQ(tester.ReadAll(&data), ENODATA);
+  ASSERT_EQ(data.data.as_uint8 - kTestPgCopyNumeric, 
sizeof(kTestPgCopyNumeric));
+  ASSERT_EQ(data.size_bytes, 0);
+
+  nanoarrow::UniqueArray array;
+  ASSERT_EQ(tester.GetArray(array.get()), NANOARROW_OK);
+  ASSERT_EQ(array->length, 9);
+  ASSERT_EQ(array->n_children, 1);
+
+  nanoarrow::UniqueSchema schema;
+  tester.GetSchema(schema.get());
+
+  nanoarrow::UniqueArrayView array_view;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), 
nullptr),
+            NANOARROW_OK);
+  ASSERT_EQ(array_view->children[0]->storage_type, NANOARROW_TYPE_STRING);
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr), 
NANOARROW_OK);
+
+  auto validity = array_view->children[0]->buffer_views[0].data.as_uint8;
+  ASSERT_TRUE(ArrowBitGet(validity, 0));
+  ASSERT_TRUE(ArrowBitGet(validity, 1));
+  ASSERT_TRUE(ArrowBitGet(validity, 2));
+  ASSERT_TRUE(ArrowBitGet(validity, 3));
+  ASSERT_TRUE(ArrowBitGet(validity, 4));
+  ASSERT_TRUE(ArrowBitGet(validity, 5));
+  ASSERT_TRUE(ArrowBitGet(validity, 6));
+  ASSERT_TRUE(ArrowBitGet(validity, 7));
+  ASSERT_FALSE(ArrowBitGet(validity, 8));
+
+  struct ArrowStringView item;
+  item = ArrowArrayViewGetStringUnsafe(array_view->children[0], 0);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "1000000");
+  item = ArrowArrayViewGetStringUnsafe(array_view->children[0], 1);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "0.00001234");
+  item = ArrowArrayViewGetStringUnsafe(array_view->children[0], 2);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "1.0000");
+  item = ArrowArrayViewGetStringUnsafe(array_view->children[0], 3);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "-123.456");
+  item = ArrowArrayViewGetStringUnsafe(array_view->children[0], 4);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "123.456");
+  item = ArrowArrayViewGetStringUnsafe(array_view->children[0], 5);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "nan");
+  item = ArrowArrayViewGetStringUnsafe(array_view->children[0], 6);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "-inf");
+  item = ArrowArrayViewGetStringUnsafe(array_view->children[0], 7);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "inf");
+}
+
 // COPY (SELECT CAST("col" AS TEXT) AS "col" FROM (  VALUES ('abc'), ('1234'),
 // (NULL::text)) AS drvd("col")) TO STDOUT WITH (FORMAT binary);
 static uint8_t kTestPgCopyText[] = {
diff --git a/c/driver/postgresql/postgres_type.h 
b/c/driver/postgresql/postgres_type.h
index e234e36a..7b4197bf 100644
--- a/c/driver/postgresql/postgres_type.h
+++ b/c/driver/postgresql/postgres_type.h
@@ -214,6 +214,11 @@ class PostgresType {
         NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, 
NANOARROW_TYPE_DOUBLE));
         break;
 
+      // ---- Numeric/Decimal-------------------
+      case PostgresTypeId::kNumeric:
+        NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, 
NANOARROW_TYPE_STRING));
+        break;
+
       // ---- Binary/string --------------------
       case PostgresTypeId::kChar:
       case PostgresTypeId::kBpchar:
diff --git a/c/driver/postgresql/postgresql_test.cc 
b/c/driver/postgresql/postgresql_test.cc
index 153d8eb2..1a30fd58 100644
--- a/c/driver/postgresql/postgresql_test.cc
+++ b/c/driver/postgresql/postgresql_test.cc
@@ -836,6 +836,16 @@ static std::initializer_list<TypeTestCase> kIntTypeCases = 
{
     {"BIGSERIAL", "BIGSERIAL", 
std::to_string(std::numeric_limits<int64_t>::max()),
      NANOARROW_TYPE_INT64, std::numeric_limits<int64_t>::max()},
 };
+static std::initializer_list<TypeTestCase> kNumericTypeCases = {
+    {"NUMERIC_TRAILING0", "NUMERIC", "1000000", NANOARROW_TYPE_STRING, 
"1000000"},
+    {"NUMERIC_LEADING0", "NUMERIC", "0.00001234", NANOARROW_TYPE_STRING, 
"0.00001234"},
+    {"NUMERIC_TRAILING02", "NUMERIC", "'1.0000'", NANOARROW_TYPE_STRING, 
"1.0000"},
+    {"NUMERIC_NEGATIVE", "NUMERIC", "-123.456", NANOARROW_TYPE_STRING, 
"-123.456"},
+    {"NUMERIC_POSITIVE", "NUMERIC", "123.456", NANOARROW_TYPE_STRING, 
"123.456"},
+    {"NUMERIC_NAN", "NUMERIC", "'nan'", NANOARROW_TYPE_STRING, "nan"},
+    {"NUMERIC_NINF", "NUMERIC", "'-inf'", NANOARROW_TYPE_STRING, "-inf"},
+    {"NUMERIC_PINF", "NUMERIC", "'inf'", NANOARROW_TYPE_STRING, "inf"},
+};
 static std::initializer_list<TypeTestCase> kDateTypeCases = {
     {"DATE0", "DATE", "'1970-01-01'", NANOARROW_TYPE_DATE32, int64_t(0)},
     {"DATE1", "DATE", "'2000-01-01'", NANOARROW_TYPE_DATE32, int64_t(10957)},
@@ -966,6 +976,8 @@ INSTANTIATE_TEST_SUITE_P(FloatTypes, PostgresTypeTest, 
testing::ValuesIn(kFloatT
                          TypeTestCase::FormatName);
 INSTANTIATE_TEST_SUITE_P(IntTypes, PostgresTypeTest, 
testing::ValuesIn(kIntTypeCases),
                          TypeTestCase::FormatName);
+INSTANTIATE_TEST_SUITE_P(NumericType, PostgresTypeTest,
+                         testing::ValuesIn(kNumericTypeCases), 
TypeTestCase::FormatName);
 INSTANTIATE_TEST_SUITE_P(DateTypes, PostgresTypeTest, 
testing::ValuesIn(kDateTypeCases),
                          TypeTestCase::FormatName);
 INSTANTIATE_TEST_SUITE_P(TimeTypes, PostgresTypeTest, 
testing::ValuesIn(kTimeTypeCases),

Reply via email to