Github user xndai commented on a diff in the pull request: https://github.com/apache/orc/pull/199#discussion_r155593186 --- Diff: tools/src/CSVFileImport.cc --- @@ -0,0 +1,411 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Exceptions.hh" +#include "orc/OrcFile.hh" + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <memory> +#include <string> +#include <sys/time.h> +#include <time.h> + +#define DELIMITER ',' + +std::string extractColumn(std::string s, uint64_t colIndex) { + uint64_t col = 0; + size_t start = 0; + size_t end = s.find(DELIMITER); + while (col < colIndex && end != std::string::npos) { + start = end + 1; + end = s.find(DELIMITER, start); + ++col; + } + return s.substr(start, end - start); +} + +static const char* GetDate(void) +{ + static char buf[200]; + time_t t = time(NULL); + struct tm* p = localtime(&t); + strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p); + return buf; +} + +void fillLongValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* longBatch = + dynamic_cast<orc::LongVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < 
numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + longBatch->notNull[i] = 0; + hasNull = true; + } else { + longBatch->data[i] = atoll(col.c_str()); + } + } + longBatch->hasNulls = hasNull; + longBatch->numElements = numValues; +} + +void fillStringValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex, + orc::DataBuffer<char>& buffer, + uint64_t& offset) { + orc::StringVectorBatch* stringBatch = + dynamic_cast<orc::StringVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + stringBatch->notNull[i] = 0; + hasNull = true; + } else { + memcpy(buffer.data() + offset, + col.c_str(), + col.size()); + stringBatch->data[i] = buffer.data() + offset; + stringBatch->length[i] = static_cast<int64_t>(col.size()); + offset += col.size(); + } + } + stringBatch->hasNulls = hasNull; + stringBatch->numElements = numValues; +} + +void fillDoubleValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::DoubleVectorBatch* dblBatch = + dynamic_cast<orc::DoubleVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + dblBatch->notNull[i] = 0; + hasNull = true; + } else { + dblBatch->data[i] = atof(col.c_str()); + } + } + dblBatch->hasNulls = hasNull; + dblBatch->numElements = numValues; +} + +// parse fixed point decimal numbers +void fillDecimalValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex, + size_t scale, + size_t precision) { + + + orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR; + orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR; + if (precision <= 18) { + d64Batch = 
dynamic_cast<orc::Decimal64VectorBatch*>(batch); + d64Batch->scale = static_cast<int32_t>(scale); + } else { + d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch); + d128Batch->scale = static_cast<int32_t>(scale); + } + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + batch->notNull[i] = 0; + hasNull = true; + } else { + size_t ptPos = col.find('.'); + size_t curScale = 0; + std::string num = col; + if (ptPos != std::string::npos) { + curScale = col.length() - ptPos - 1; + num = col.substr(0, ptPos) + col.substr(ptPos + 1); + } + orc::Int128 decimal(num); + while (curScale != scale) { + curScale++; + decimal *= 10; + } + if (precision <= 18) { + d64Batch->values[i] = decimal.toLong(); + } else { + d128Batch->values[i] = decimal; + } + } + } + batch->hasNulls = hasNull; + batch->numElements = numValues; +} + +void fillBoolValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* boolBatch = + dynamic_cast<orc::LongVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + boolBatch->notNull[i] = 0; + hasNull = true; + } else { + std::transform(col.begin(), col.end(), col.begin(), ::tolower); + if (col == "true" || col == "t") { + boolBatch->data[i] = true; + } else { + boolBatch->data[i] = false; + } + } + } + boolBatch->hasNulls = hasNull; + boolBatch->numElements = numValues; +} + +// parse date string from format YYYY-MM-dd +void fillDateValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::LongVectorBatch* longBatch = + dynamic_cast<orc::LongVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if 
(col.empty()) { + longBatch->notNull[i] = 0; + hasNull = true; + } else { + struct tm tm; + memset(&tm, 0, sizeof(struct tm)); + strptime(col.c_str(), "%Y-%m-%d", &tm); + time_t t = mktime(&tm); + time_t t1970 = 0; + double seconds = difftime(t, t1970); + int64_t days = static_cast<int64_t>(seconds / (60*60*24)); + longBatch->data[i] = days; + } + } + longBatch->hasNulls = hasNull; + longBatch->numElements = numValues; +} + +// parse timestamp values in seconds +void fillTimestampValues(const std::vector<std::string>& data, + orc::ColumnVectorBatch* batch, + uint64_t numValues, + uint64_t colIndex) { + orc::TimestampVectorBatch* tsBatch = + dynamic_cast<orc::TimestampVectorBatch*>(batch); + bool hasNull = false; + for (uint64_t i = 0; i < numValues; ++i) { + std::string col = extractColumn(data[i], colIndex); + if (col.empty()) { + tsBatch->notNull[i] = 0; + hasNull = true; + } else { + tsBatch->data[i] = atoll(col.c_str()); + tsBatch->nanoseconds[i] = 0; + } + } + tsBatch->hasNulls = hasNull; + tsBatch->numElements = numValues; +} + +void usage() { + std::cout << "Usage: csv-import <input> <output> --schema=<file schema>" + << "Import CSV file into an Orc file using the specified schema.\n"; +} + +int main(int argc, char* argv[]) { + if (argc != 4) { + std::cout << "Invalid number of arguments." << std::endl; + usage(); + return 1; + } + + std::string input = argv[1]; + std::string output = argv[2]; + std::string schema = argv[3]; + + const std::string SCHEMA_PREFIX = "--schema="; + ORC_UNIQUE_PTR<orc::Type> fileType = ORC_NULLPTR; + if (schema.find(SCHEMA_PREFIX) != 0) { + std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." << std::endl; + usage(); + return 1; + } else { + fileType = orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size())); + } + + std::cout << GetDate() << "Start importing Orc file..." 
<< std::endl; + + double totalElapsedTime = 0.0; + double totalCPUTime = 0.0; + + orc::DataBuffer<char> buffer(*orc::getDefaultPool()); + buffer.resize(4 * 1024 * 1024); + + // set ORC writer options here + uint64_t stripeSize = (36 << 20); // 36M --- End diff -- 36M as the default stripe size seems too small. We should probably use 128M, 196M or 256M instead.
---