Github user wgtmac commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r158342083
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,476 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <getopt.h>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+static char gDelimiter = ',';  // field separator searched for by extractColumn(); initialised to a comma
+
+// Return the raw text of field number colIndex (0-based) of the line s,
+// where fields are separated by gDelimiter. The delimiter is matched
+// literally; there is no handling of quoting or escaping. When the line
+// holds fewer than colIndex + 1 fields an empty string is returned.
+std::string extractColumn(std::string s, uint64_t colIndex) {
+  size_t fieldBegin = 0;
+  size_t fieldEnd = s.find(gDelimiter);
+  uint64_t skipped = 0;
+  // Step over one delimiter per iteration until the wanted field is reached
+  // or the line runs out of delimiters.
+  for (; skipped < colIndex && fieldEnd != std::string::npos; ++skipped) {
+    fieldBegin = fieldEnd + 1;
+    fieldEnd = s.find(gDelimiter, fieldBegin);
+  }
+  if (skipped != colIndex) {
+    return std::string();
+  }
+  // For the last field fieldEnd is npos; substr() then takes the rest of
+  // the line.
+  return s.substr(fieldBegin, fieldEnd - fieldBegin);
+}
+
+// Format the current local time as "[YYYY-mm-dd HH:MM:SS]".
+// The text is written into a function-local static buffer, so every call
+// overwrites the string returned by the previous one.
+static const char* GetDate(void) {
+  static char stamp[200];
+  time_t now = time(NULL);
+  struct tm* local = localtime(&now);
+  strftime(stamp, sizeof(stamp), "[%Y-%m-%d %H:%M:%S]", local);
+  return stamp;
+}
+
+// Fill an integer column from CSV text.
+//   data      - input lines; data[0 .. numValues-1] are read without a
+//               bounds check
+//   batch     - destination; used as an orc::LongVectorBatch (the result of
+//               the dynamic_cast is not checked)
+//   numValues - number of rows to fill
+//   colIndex  - 0-based field index handed to extractColumn()
+// An empty field becomes a null; any other field is converted with atoll().
+void fillLongValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* target = dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    target->data[row] = atoll(field.c_str());
+  }
+  target->hasNulls = sawNull;
+  target->numElements = numValues;
+}
+
+// Fill a string column from CSV text.
+//   data      - input lines; data[0 .. numValues-1] are read without a
+//               bounds check
+//   batch     - destination; used as an orc::StringVectorBatch (the result
+//               of the dynamic_cast is not checked)
+//   numValues - number of rows to fill
+//   colIndex  - 0-based field index handed to extractColumn()
+//   buffer    - backing storage the field bytes are copied into; it is
+//               enlarged with resize() when a value does not fit
+//   offset    - in/out: first unused byte of buffer; advanced by the number
+//               of bytes copied
+// An empty field becomes a null. For every other field the bytes are copied
+// into buffer and the batch entry points at the copy.
+//
+// Growing the buffer may move its storage, so the pointers for this column
+// are only assigned once all values have been copied.
+// NOTE(review): pointers that earlier calls stored into the same buffer
+// (e.g. other string columns of the same row batch) are not rebased here and
+// become stale if this call has to grow the buffer - confirm against the
+// caller, or size the buffer generously up front.
+void fillStringValues(const std::vector<std::string>& data,
+                      orc::ColumnVectorBatch* batch,
+                      uint64_t numValues,
+                      uint64_t colIndex,
+                      orc::DataBuffer<char>& buffer,
+                      uint64_t& offset) {
+  orc::StringVectorBatch* stringBatch =
+    dynamic_cast<orc::StringVectorBatch*>(batch);
+  // Position of each non-null value inside buffer (unused for null rows).
+  std::vector<uint64_t> valueOffsets(static_cast<size_t>(numValues));
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      const uint64_t required = offset + col.size();
+      if (buffer.size() < required) {
+        // resize() (unlike reserve()) also updates size(), so the next
+        // capacity check sees the growth. Keep doubling until the value
+        // fits; start from 1 so an empty buffer cannot loop forever.
+        uint64_t newSize = buffer.size() > 0 ? buffer.size() : 1;
+        while (newSize < required) {
+          newSize *= 2;
+        }
+        buffer.resize(newSize);
+      }
+      memcpy(buffer.data() + offset,
+             col.c_str(),
+             col.size());
+      valueOffsets[static_cast<size_t>(i)] = offset;
+      stringBatch->length[i] = static_cast<int64_t>(col.size());
+      offset += col.size();
+    }
+  }
+  // The buffer can no longer move: turn the recorded offsets into pointers.
+  char* base = buffer.data();
+  for (uint64_t i = 0; i < numValues; ++i) {
+    if (batch->notNull[i]) {
+      stringBatch->data[i] = base + valueOffsets[static_cast<size_t>(i)];
+    }
+  }
+  stringBatch->hasNulls = hasNull;
+  stringBatch->numElements = numValues;
+}
+
+// Fill a floating point column from CSV text.
+//   data      - input lines; data[0 .. numValues-1] are read without a
+//               bounds check
+//   batch     - destination; used as an orc::DoubleVectorBatch (the result
+//               of the dynamic_cast is not checked)
+//   numValues - number of rows to fill
+//   colIndex  - 0-based field index handed to extractColumn()
+// An empty field becomes a null; any other field is converted with atof().
+void fillDoubleValues(const std::vector<std::string>& data,
+                      orc::ColumnVectorBatch* batch,
+                      uint64_t numValues,
+                      uint64_t colIndex) {
+  orc::DoubleVectorBatch* target =
+    dynamic_cast<orc::DoubleVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    target->data[row] = atof(field.c_str());
+  }
+  target->hasNulls = sawNull;
+  target->numElements = numValues;
+}
+
+// Fill a decimal column from CSV text holding fixed point numbers.
+//   data      - input lines; data[0 .. numValues-1] are read without a
+//               bounds check
+//   batch     - destination; used as an orc::Decimal64VectorBatch when
+//               precision <= 18 and as an orc::Decimal128VectorBatch
+//               otherwise (the result of the dynamic_cast is not checked)
+//   numValues - number of rows to fill
+//   colIndex  - 0-based field index handed to extractColumn()
+//   scale     - number of fractional digits of the column; written to the
+//               batch and used to rescale every value
+//   precision - only used to choose between the 64 and 128 bit batch
+// An empty field becomes a null. Otherwise the decimal point is removed and
+// the digits are rescaled so the stored integer equals value * 10^scale.
+// A field with more fractional digits than scale has the excess digits
+// dropped (truncation toward zero).
+// NOTE(review): truncation is a choice made here to guarantee termination;
+// confirm whether rounding is preferred.
+void fillDecimalValues(const std::vector<std::string>& data,
+                       orc::ColumnVectorBatch* batch,
+                       uint64_t numValues,
+                       uint64_t colIndex,
+                       size_t scale,
+                       size_t precision) {
+  orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+  orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+  if (precision <= 18) {
+    d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+    d64Batch->scale = static_cast<int32_t>(scale);
+  } else {
+    d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+    d128Batch->scale = static_cast<int32_t>(scale);
+  }
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      size_t ptPos = col.find('.');
+      size_t curScale = 0;
+      std::string num = col;
+      if (ptPos != std::string::npos) {
+        // digits after the point define the scale of the text as written
+        curScale = col.length() - ptPos - 1;
+        num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+      }
+      if (curScale > scale) {
+        // Too many fractional digits: cut the excess off the end. num ends
+        // with exactly curScale fractional digits, so this cannot underflow.
+        num.erase(num.length() - (curScale - scale));
+        curScale = scale;
+        if (num.empty() || num == "-") {
+          num = "0";
+        }
+      }
+      orc::Int128 decimal(num);
+      // pad with zeros up to the column scale
+      for (; curScale < scale; ++curScale) {
+        decimal *= 10;
+      }
+      if (precision <= 18) {
+        d64Batch->values[i] = decimal.toLong();
+      } else {
+        d128Batch->values[i] = decimal;
+      }
+    }
+  }
+  batch->hasNulls = hasNull;
+  batch->numElements = numValues;
+}
+
+// Fill a boolean column from CSV text.
+//   data      - input lines; data[0 .. numValues-1] are read without a
+//               bounds check
+//   batch     - destination; used as an orc::LongVectorBatch (the result of
+//               the dynamic_cast is not checked)
+//   numValues - number of rows to fill
+//   colIndex  - 0-based field index handed to extractColumn()
+// An empty field becomes a null. Otherwise the text is lower-cased and the
+// value is true for "true" or "t" and false for anything else.
+void fillBoolValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* flags = dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    std::string text = extractColumn(data[row], colIndex);
+    if (text.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    std::transform(text.begin(), text.end(), text.begin(), ::tolower);
+    flags->data[row] = (text == "true" || text == "t");
+  }
+  flags->hasNulls = sawNull;
+  flags->numElements = numValues;
+}
+
+// Number of days from 1970-01-01 to the given proleptic Gregorian calendar
+// date (negative for earlier dates). Pure integer arithmetic, so the result
+// does not depend on the time zone of the machine. A day value past the end
+// of the month simply counts on into the following month.
+static int64_t daysFromCivil(int64_t year, int64_t month, int64_t day) {
+  // treat January and February as months 11 and 12 of the previous year so
+  // the leap day falls at the end of the counted year
+  if (month <= 2) {
+    year -= 1;
+  }
+  const int64_t era = (year >= 0 ? year : year - 399) / 400;
+  const int64_t yearOfEra = year - era * 400;
+  const int64_t dayOfYear =
+    (153 * (month + (month > 2 ? -3 : 9)) + 2) / 5 + day - 1;
+  const int64_t dayOfEra =
+    yearOfEra * 365 + yearOfEra / 4 - yearOfEra / 100 + dayOfYear;
+  // 719468 = days from 0000-03-01 to 1970-01-01
+  return era * 146097 + dayOfEra - 719468;
+}
+
+// Fill a date column from CSV text in the format YYYY-mm-dd.
+//   data      - input lines; data[0 .. numValues-1] are read without a
+//               bounds check
+//   batch     - destination; used as an orc::LongVectorBatch (the result of
+//               the dynamic_cast is not checked)
+//   numValues - number of rows to fill
+//   colIndex  - 0-based field index handed to extractColumn()
+// The stored value is the number of days since 1970-01-01, computed from
+// the calendar date itself rather than through mktime(), which would apply
+// the local time zone and could shift the result by a day.
+// An empty field, or one strptime() cannot parse, becomes a null.
+void fillDateValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* longBatch =
+    dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    struct tm tm;
+    memset(&tm, 0, sizeof(struct tm));
+    if (col.empty() ||
+        strptime(col.c_str(), "%Y-%m-%d", &tm) == NULL) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      // struct tm counts years from 1900 and months from 0
+      longBatch->data[i] = daysFromCivil(tm.tm_year + 1900,
+                                         tm.tm_mon + 1,
+                                         tm.tm_mday);
+    }
+  }
+  longBatch->hasNulls = hasNull;
+  longBatch->numElements = numValues;
+}
+
+// Fill a timestamp column from CSV text holding timestamps in seconds.
+//   data      - input lines; data[0 .. numValues-1] are read without a
+//               bounds check
+//   batch     - destination; used as an orc::TimestampVectorBatch (the
+//               result of the dynamic_cast is not checked)
+//   numValues - number of rows to fill
+//   colIndex  - 0-based field index handed to extractColumn()
+// An empty field becomes a null. Otherwise the text is converted with
+// atoll() and stored in data[], and nanoseconds[] is set to zero.
+void fillTimestampValues(const std::vector<std::string>& data,
+                         orc::ColumnVectorBatch* batch,
+                         uint64_t numValues,
+                         uint64_t colIndex) {
+  orc::TimestampVectorBatch* stamps =
+    dynamic_cast<orc::TimestampVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    stamps->data[row] = atoll(field.c_str());
+    stamps->nanoseconds[row] = 0;
+  }
+  stamps->hasNulls = sawNull;
+  stamps->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import --input <input file> --output <output> "
--- End diff --
Done. Thanks!
---