This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 113023fb86 (Enhancement)[load-json] support simdjson in new json
reader (#16903)
113023fb86 is described below
commit 113023fb8632454e33d73e67c6bcbd5074e0dd81
Author: lihangyu <[email protected]>
AuthorDate: Tue Feb 21 11:31:00 2023 +0800
(Enhancement)[load-json] support simdjson in new json reader (#16903)
be config:
enable_simdjson_reader=true
related PR #11665
---
be/src/common/configbase.cpp | 1 +
be/src/exprs/json_functions.cpp | 66 +++
be/src/exprs/json_functions.h | 28 +-
be/src/vec/exec/format/json/new_json_reader.cpp | 615 +++++++++++++++++++++
be/src/vec/exec/format/json/new_json_reader.h | 46 ++
docs/en/docs/admin-manual/config/be-config.md | 5 +
docs/zh-CN/docs/admin-manual/config/be-config.md | 5 +
.../data/load_p0/stream_load/invalid_json.json | 37 ++
.../load_p0/stream_load/invalid_json_array.json | 8 +-
...id_json_array.json => invalid_json_array1.json} | 10 +-
.../load_p0/stream_load/invalid_json_array2.json | 121 ++++
.../load_p0/stream_load/invalid_json_array3.json | 123 +++++
...id_json_array.json => invalid_json_array4.json} | 10 +-
.../load_p0/stream_load/invalid_nest_json1.json | 5 +
.../load_p0/stream_load/invalid_nest_json2.json | 16 +
.../load_p0/stream_load/invalid_nest_json3.json | 5 +
.../stream_load/invalid_nest_json_array.json | 26 +
.../stream_load/invalid_nest_json_array1.json | 26 +
.../stream_load/invalid_nest_json_array2.json | 26 +
.../stream_load/invalid_nest_json_array3.json | 25 +
.../data/load_p0/stream_load/nest_json_array.json | 43 +-
.../data/load_p0/stream_load/simple_json2.json | 2 +
.../data/load_p0/stream_load/test_json_load.out | 17 +
.../load_p0/stream_load/test_json_load.groovy | 96 +++-
24 files changed, 1306 insertions(+), 56 deletions(-)
diff --git a/be/src/common/configbase.cpp b/be/src/common/configbase.cpp
index 8f9ca0e6ac..af20b37260 100644
--- a/be/src/common/configbase.cpp
+++ b/be/src/common/configbase.cpp
@@ -441,6 +441,7 @@ void set_fuzzy_configs() {
// random value true or false
set_fuzzy_config("disable_storage_page_cache", ((rand() % 2) == 0) ?
"true" : "false");
set_fuzzy_config("enable_system_metrics", ((rand() % 2) == 0) ? "true" :
"false");
+ set_fuzzy_config("enable_simdjson_reader", ((rand() % 2) == 0) ? "true" :
"false");
// random value from 8 to 48
// s = set_fuzzy_config("doris_scanner_thread_pool_thread_num",
std::to_string((rand() % 41) + 8));
// LOG(INFO) << s.to_string();
diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp
index 3016463482..5a3cbcc9e0 100644
--- a/be/src/exprs/json_functions.cpp
+++ b/be/src/exprs/json_functions.cpp
@@ -248,4 +248,70 @@ void JsonFunctions::get_parsed_paths(const
std::vector<std::string>& path_exprs,
}
}
+Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj,
+ const std::vector<JsonPath>&
jsonpath,
+ simdjson::ondemand::value* value)
noexcept {
+// Return DataQualityError when it's a malformed json.
+// Otherwise the path was not found, due to an array index out of bounds or a non-existent field
+#define HANDLE_SIMDJSON_ERROR(err, msg)
\
+ do {
\
+ const simdjson::error_code& _err = err;
\
+ const std::string& _msg = msg;
\
+ if (UNLIKELY(_err)) {
\
+ if (_err == simdjson::NO_SUCH_FIELD || _err ==
simdjson::INDEX_OUT_OF_BOUNDS) { \
+ return Status::NotFound(
\
+ fmt::format("err: {}, msg: {}",
simdjson::error_message(_err), _msg)); \
+ }
\
+ return Status::DataQualityError(
\
+ fmt::format("err: {}, msg: {}",
simdjson::error_message(_err), _msg)); \
+ }
\
+ } while (false);
+
+ if (jsonpath.size() <= 1) {
+ // The first elem of json path should be '$'.
+ // A valid json path's size is >= 2.
+ return Status::DataQualityError("empty json path");
+ }
+
+ simdjson::ondemand::value tvalue;
+
+ // Skip the first $.
+ for (int i = 1; i < jsonpath.size(); i++) {
+ if (UNLIKELY(!jsonpath[i].is_valid)) {
+ return Status::DataQualityError(fmt::format("invalid json path:
{}", jsonpath[i].key));
+ }
+
+ const std::string& col = jsonpath[i].key;
+ int index = jsonpath[i].idx;
+
+ // Since the simdjson::ondemand::object cannot be converted to
simdjson::ondemand::value,
+ // we have to do some special treatment for the second elem of json
path.
+ // If the key is not found in json object, simdjson::NO_SUCH_FIELD
would be returned.
+ if (i == 1) {
+ HANDLE_SIMDJSON_ERROR(obj.find_field_unordered(col).get(tvalue),
+ fmt::format("unable to find field: {}",
col));
+ } else {
+ HANDLE_SIMDJSON_ERROR(tvalue.find_field_unordered(col).get(tvalue),
+ fmt::format("unable to find field: {}",
col));
+ }
+
+        // TODO support [*] where index == -2
+ if (index != -1) {
+ // try to access tvalue as array.
+ // If the index is beyond the length of array,
simdjson::INDEX_OUT_OF_BOUNDS would be returned.
+ simdjson::ondemand::array arr;
+ HANDLE_SIMDJSON_ERROR(tvalue.get_array().get(arr),
+ fmt::format("failed to access field as
array, field: {}", col));
+
+ HANDLE_SIMDJSON_ERROR(
+ arr.at(index).get(tvalue),
+ fmt::format("failed to access array field: {}, index: {}",
col, index));
+ }
+ }
+
+ std::swap(*value, tvalue);
+
+ return Status::OK();
+}
+
} // namespace doris
diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h
index 39fbda875b..a070b136b5 100644
--- a/be/src/exprs/json_functions.h
+++ b/be/src/exprs/json_functions.h
@@ -19,9 +19,11 @@
#include <fmt/core.h>
#include <rapidjson/document.h>
+#include <simdjson.h>
#include <sstream>
+#include "common/status.h"
#include "udf/udf.h"
namespace doris {
@@ -63,26 +65,6 @@ struct JsonPath {
return ss.str();
}
- std::string to_simdjson_pointer(bool* valid) const {
- std::stringstream ss;
- if (!is_valid) {
- *valid = false;
- return "";
- }
- ss << "/";
- if (!key.empty()) {
- ss << key;
- }
- if (idx == -2) {
- // not support [*]
- *valid = false;
- return "";
- } else if (idx > -1) {
- ss << "/" << idx;
- }
- return ss.str();
- }
-
std::string debug_string() const {
return fmt::format("key:{}, idx:{}, valid:{}", key, idx, is_valid);
}
@@ -113,6 +95,12 @@ public:
static void parse_json_paths(const std::string& path_strings,
std::vector<JsonPath>* parsed_paths);
+ // extract_from_object extracts value from object according to the json
path.
+ // Now, we do not support complete functions of json path.
+ // Eg. city[*].id is not supported in this function
+ static Status extract_from_object(simdjson::ondemand::object& obj,
+ const std::vector<JsonPath>& jsonpath,
+ simdjson::ondemand::value* value)
noexcept;
private:
static rapidjson::Value* match_value(const std::vector<JsonPath>&
parsed_paths,
diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp
b/be/src/vec/exec/format/json/new_json_reader.cpp
index f321a43d9c..a2d1d89d4b 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -88,6 +88,10 @@ NewJsonReader::NewJsonReader(RuntimeProfile* profile, const
TFileScanRangeParams
_io_ctx(io_ctx) {}
Status NewJsonReader::init_reader() {
+ if (config::enable_simdjson_reader) {
+ RETURN_IF_ERROR(_simdjson_init_reader());
+ return Status::OK();
+ }
RETURN_IF_ERROR(_get_range_params());
RETURN_IF_ERROR(_open_file_reader());
@@ -1009,4 +1013,615 @@ Status
NewJsonReader::_read_one_message(std::unique_ptr<uint8_t[]>* file_buf, si
}
return Status::OK();
}
+// ---------SIMDJSON----------
+// simdjson: replace the non-simdjson functions once this path is ready
+Status NewJsonReader::_simdjson_init_reader() {
+ RETURN_IF_ERROR(_get_range_params());
+
+ RETURN_IF_ERROR(_open_file_reader());
+ if (_read_json_by_line) {
+ RETURN_IF_ERROR(_open_line_reader());
+ }
+
+ // generate _parsed_jsonpaths and _parsed_json_root
+ RETURN_IF_ERROR(_parse_jsonpath_and_json_root());
+
+ //improve performance
+ if (_parsed_jsonpaths.empty() || _is_dynamic_schema) { // input is a
simple json-string
+ _vhandle_json_callback = _is_dynamic_schema ?
&NewJsonReader::_vhandle_dynamic_json
+ :
&NewJsonReader::_simdjson_handle_simple_json;
+ } else { // input is a complex json-string and a json-path
+ if (_strip_outer_array) {
+ _vhandle_json_callback =
&NewJsonReader::_simdjson_handle_flat_array_complex_json;
+ } else {
+ _vhandle_json_callback =
&NewJsonReader::_simdjson_handle_nested_complex_json;
+ }
+ }
+ if (_is_dynamic_schema) {
+ _json_parser =
std::make_unique<vectorized::JSONDataParser<vectorized::SimdJSONParser>>();
+ }
+ _ondemand_json_parser = std::make_unique<simdjson::ondemand::parser>();
+ for (int i = 0; i < _file_slot_descs.size(); ++i) {
+ _slot_desc_index.emplace(_file_slot_descs[i]->col_name(), i);
+ }
+ _simdjson_ondemand_padding_buffer.resize(_padded_size);
+ return Status::OK();
+}
+
+Status
NewJsonReader::_simdjson_handle_simple_json(std::vector<MutableColumnPtr>&
columns,
+ const
std::vector<SlotDescriptor*>& slot_descs,
+ bool* is_empty_row, bool*
eof) {
+ // simple json
+ simdjson::ondemand::object objectValue;
+ size_t num_rows = columns[0]->size();
+ do {
+ bool valid = false;
+ try {
+ if (_next_row >= _total_rows) { // parse json and generic document
+ Status st = _simdjson_parse_json(is_empty_row, eof);
+ if (st.is<DATA_QUALITY_ERROR>()) {
+ continue; // continue to read next
+ }
+ RETURN_IF_ERROR(st);
+ if (*is_empty_row == true) {
+ return Status::OK();
+ }
+ if (_json_value.type() ==
simdjson::ondemand::json_type::array) {
+ _array = _json_value.get_array();
+ _array_iter = _array.begin();
+
+ _total_rows = _array.count_elements();
+ if (_total_rows == 0) {
+ // may be passing an empty json, such as "[]"
+ RETURN_IF_ERROR(_append_error_msg(nullptr, "Empty json
line", "", nullptr));
+ if (*_scanner_eof) {
+ *is_empty_row = true;
+ return Status::OK();
+ }
+ continue;
+ }
+ } else {
+ _total_rows = 1; // only one row
+ objectValue = _json_value;
+ }
+ _next_row = 0;
+ }
+
+ if (_json_value.type() == simdjson::ondemand::json_type::array) {
// handle case 1
+ objectValue = *_array_iter;
+ RETURN_IF_ERROR(
+ _simdjson_set_column_value(&objectValue, columns,
slot_descs, &valid));
+ if (_array_iter == _array.end()) {
+ // Hint to read next json doc
+ _next_row = _total_rows + 1;
+ break;
+ }
+ ++_array_iter;
+ } else { // handle case 2
+ // objectValue = _json_value.get_object();
+ RETURN_IF_ERROR(
+ _simdjson_set_column_value(&objectValue, columns,
slot_descs, &valid));
+ }
+ _next_row++;
+ if (!valid) {
+ if (*_scanner_eof) {
+ // When _scanner_eof is true and valid is false, it means
that we have encountered
+ // unqualified data and decided to stop the scan.
+ *is_empty_row = true;
+ return Status::OK();
+ }
+ continue;
+ }
+ *is_empty_row = false;
+ break; // get a valid row, then break
+ } catch (simdjson::simdjson_error& e) {
+ // prevent from endless loop
+ _next_row = _total_rows + 1;
+ fmt::memory_buffer error_msg;
+ fmt::format_to(error_msg, "Parse json data for array failed. code:
{}, error info: {}",
+ e.error(), e.what());
+ RETURN_IF_ERROR(_state->append_error_msg_to_file(
+ [&]() -> std::string { return ""; },
+ [&]() -> std::string { return fmt::to_string(error_msg);
}, eof));
+ _counter->num_rows_filtered++;
+ // Before continuing to process other rows, we need to first clean
the fail parsed row.
+ for (int i = 0; i < columns.size(); ++i) {
+ if (columns[i]->size() > num_rows) {
+ columns[i]->pop_back(columns[i]->size() - num_rows);
+ }
+ }
+ if (!valid) {
+ if (*_scanner_eof) {
+ // When _scanner_eof is true and valid is false, it means
that we have encountered
+ // unqualified data and decided to stop the scan.
+ *is_empty_row = true;
+ return Status::OK();
+ }
+ continue;
+ }
+ continue;
+ }
+ } while (_next_row <= _total_rows);
+ return Status::OK();
+}
+
+Status NewJsonReader::_simdjson_handle_flat_array_complex_json(
+ std::vector<MutableColumnPtr>& columns, const
std::vector<SlotDescriptor*>& slot_descs,
+ bool* is_empty_row, bool* eof) {
+// Advance one row in array list, if it is the endpoint, stop advance and
break the loop
+#define ADVANCE_ROW() \
+ if (_array_iter == _array.end()) { \
+ _next_row = _total_rows + 1; \
+ break; \
+ } \
+ ++_array_iter; \
+ ++_next_row;
+
+ // array complex json
+ size_t num_rows = columns[0]->size();
+ simdjson::ondemand::object cur;
+ do {
+ try {
+ if (_next_row >= _total_rows) {
+ Status st = _simdjson_parse_json(is_empty_row, eof);
+ if (st.is<DATA_QUALITY_ERROR>()) {
+ continue; // continue to read next
+ }
+ RETURN_IF_ERROR(st);
+ if (*is_empty_row == true) {
+ if (st == Status::OK()) {
+ return Status::OK();
+ }
+ if (_total_rows == 0) {
+ continue;
+ }
+ }
+ _array = _json_value.get_array();
+ _array_iter = _array.begin();
+ }
+
+ bool valid = true;
+ cur = (*_array_iter).get_object();
+ // extract root
+ if (_parsed_json_root.size() != 0) {
+ simdjson::ondemand::value val;
+ Status st = JsonFunctions::extract_from_object(cur,
_parsed_json_root, &val);
+ if (UNLIKELY(!st.ok())) {
+ if (st.is_not_found()) {
+ RETURN_IF_ERROR(
+ _append_error_msg(nullptr, "JsonPath not
found", "", nullptr));
+ ADVANCE_ROW();
+ continue;
+ }
+ return st;
+ }
+ if (val.type() != simdjson::ondemand::json_type::object) {
+ RETURN_IF_ERROR(_append_error_msg(nullptr, "Not object
item", "", nullptr));
+ ADVANCE_ROW();
+ continue;
+ }
+ cur = val.get_object();
+ }
+ RETURN_IF_ERROR(_simdjson_write_columns_by_jsonpath(&cur,
slot_descs, columns, &valid));
+ ADVANCE_ROW();
+ if (!valid) {
+ continue; // process next line
+ }
+ *is_empty_row = false;
+ break; // get a valid row, then break
+ } catch (simdjson::simdjson_error& e) {
+ // prevent from endless loop
+ _next_row = _total_rows + 1;
+ fmt::memory_buffer error_msg;
+ fmt::format_to(error_msg, "Parse json data failed. code: {}, error
info: {}", e.error(),
+ e.what());
+ RETURN_IF_ERROR(_state->append_error_msg_to_file(
+ [&]() -> std::string { return ""; },
+ [&]() -> std::string { return fmt::to_string(error_msg);
}, eof));
+ _counter->num_rows_filtered++;
+ // Before continuing to process other rows, we need to first clean
the fail parsed row.
+ for (int i = 0; i < columns.size(); ++i) {
+ if (columns[i]->size() > num_rows) {
+ columns[i]->pop_back(columns[i]->size() - num_rows);
+ }
+ }
+ if (*_scanner_eof) {
+ // When _scanner_eof is true and valid is false, it means that
we have encountered
+ // unqualified data and decided to stop the scan.
+ *is_empty_row = true;
+ return Status::OK();
+ }
+ continue;
+ }
+ } while (_next_row <= _total_rows);
+ return Status::OK();
+}
+
+Status NewJsonReader::_simdjson_handle_nested_complex_json(
+ std::vector<MutableColumnPtr>& columns, const
std::vector<SlotDescriptor*>& slot_descs,
+ bool* is_empty_row, bool* eof) {
+ // nested complex json
+ while (true) {
+ size_t num_rows = columns[0]->size();
+ simdjson::ondemand::object cur;
+ try {
+ Status st = _simdjson_parse_json(is_empty_row, eof);
+ if (st.is<DATA_QUALITY_ERROR>()) {
+ continue; // continue to read next
+ }
+ RETURN_IF_ERROR(st);
+ if (*is_empty_row == true) {
+ return Status::OK();
+ }
+ *is_empty_row = false;
+ bool valid = true;
+ if (_json_value.type() != simdjson::ondemand::json_type::object) {
+ RETURN_IF_ERROR(_append_error_msg(nullptr, "Not object item",
"", nullptr));
+ continue;
+ }
+ cur = _json_value.get_object();
+ st = _simdjson_write_columns_by_jsonpath(&cur, slot_descs,
columns, &valid);
+ if (!st.ok()) {
+ RETURN_IF_ERROR(_append_error_msg(nullptr, st.to_string(), "",
nullptr));
+ // Before continuing to process other rows, we need to first
clean the fail parsed row.
+ for (int i = 0; i < columns.size(); ++i) {
+ if (columns[i]->size() > num_rows) {
+ columns[i]->pop_back(columns[i]->size() - num_rows);
+ }
+ }
+ continue;
+ }
+ if (!valid) {
+ // there is only one line in this case, so if it return false,
just set is_empty_row true
+ // so that the caller will continue reading next line.
+ *is_empty_row = true;
+ }
+ break; // read a valid row
+ } catch (simdjson::simdjson_error& e) {
+ fmt::memory_buffer error_msg;
+ fmt::format_to(error_msg, "Parse json data failed. code: {}, error
info: {}", e.error(),
+ e.what());
+ RETURN_IF_ERROR(_state->append_error_msg_to_file(
+ [&]() -> std::string { return ""; },
+ [&]() -> std::string { return fmt::to_string(error_msg);
}, eof));
+ _counter->num_rows_filtered++;
+ // Before continuing to process other rows, we need to first clean
the fail parsed row.
+ for (int i = 0; i < columns.size(); ++i) {
+ if (columns[i]->size() > num_rows) {
+ columns[i]->pop_back(columns[i]->size() - num_rows);
+ }
+ }
+ if (*_scanner_eof) {
+ // When _scanner_eof is true and valid is false, it means that
we have encountered
+ // unqualified data and decided to stop the scan.
+ *is_empty_row = true;
+ return Status::OK();
+ }
+ continue;
+ }
+ }
+ return Status::OK();
+}
+
+Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object*
value,
+
std::vector<MutableColumnPtr>& columns,
+ const
std::vector<SlotDescriptor*>& slot_descs,
+ bool* valid) {
+ // set
+ size_t cur_row_count = columns[0]->size();
+ bool has_valid_value = false;
+    // iterate through the object; simdjson::ondemand will parse on the fly
+ for (auto field : *value) {
+ std::string_view key = field.unescaped_key();
+ auto iter = _slot_desc_index.find(std::string(key));
+ if (iter == _slot_desc_index.end()) {
+ // This key is not exist in slot desc, just ignore
+ continue;
+ }
+ simdjson::ondemand::value val = field.value();
+ RETURN_IF_ERROR(_simdjson_write_data_to_column(val,
slot_descs[iter->second],
+
columns[iter->second].get(), valid));
+ if (!(*valid)) {
+ return Status::OK();
+ }
+ has_valid_value = true;
+ }
+ if (!has_valid_value) {
+ RETURN_IF_ERROR(
+ _append_error_msg(value, "All fields is null, this is a
invalid row.", "", valid));
+ return Status::OK();
+ }
+
+ // fill missing slot
+ int nullcount = 0;
+ int ctx_idx = 0;
+ for (auto slot_desc : slot_descs) {
+ if (!slot_desc->is_materialized()) {
+ continue;
+ }
+ int dest_index = ctx_idx++;
+ auto* column_ptr = columns[dest_index].get();
+ if (column_ptr->size() < cur_row_count + 1) {
+ DCHECK(column_ptr->size() == cur_row_count);
+ column_ptr->assume_mutable()->insert_default();
+ ++nullcount;
+ }
+ DCHECK(column_ptr->size() == cur_row_count + 1);
+ }
+ // There is at least one valid value here
+ DCHECK(nullcount < columns.size());
+ *valid = true;
+ return Status::OK();
+}
+
+Status
NewJsonReader::_simdjson_write_data_to_column(simdjson::ondemand::value& value,
+ SlotDescriptor* slot_desc,
+ vectorized::IColumn*
column, bool* valid) {
+ // write
+ vectorized::ColumnNullable* nullable_column = nullptr;
+ vectorized::IColumn* column_ptr = nullptr;
+ if (slot_desc->is_nullable()) {
+ nullable_column = assert_cast<vectorized::ColumnNullable*>(column);
+ column_ptr = &nullable_column->get_nested_column();
+ }
+ // TODO: if the vexpr can support another 'slot_desc type' than
'TYPE_VARCHAR',
+ // we need use a function to support these types to insert data in columns.
+ ColumnString* column_string = assert_cast<ColumnString*>(column_ptr);
+ switch (value.type()) {
+ case simdjson::ondemand::json_type::null: {
+ if (column->is_nullable()) {
+ // insert_default already push 1 to null_map
+ nullable_column->insert_default();
+ } else {
+ RETURN_IF_ERROR(_append_error_msg(
+ nullptr, "Json value is null, but the column `{}` is not
nullable.",
+ slot_desc->col_name(), valid));
+ return Status::OK();
+ }
+ break;
+ }
+ case simdjson::ondemand::json_type::boolean: {
+ nullable_column->get_null_map_data().push_back(0);
+ if (value.get_bool()) {
+ column_string->insert_data("1", 1);
+ } else {
+ column_string->insert_data("0", 1);
+ }
+ break;
+ }
+ case simdjson::ondemand::json_type::object:
+ case simdjson::ondemand::json_type::array: {
+ auto str_view = simdjson::to_json_string(value).value();
+ std::string value_str(str_view.data(), str_view.size());
+ // compact json value
+ value_str.erase(std::remove_if(value_str.begin(), value_str.end(),
+ [](const char& c) {
+ // white space
+ return c == ' ' || c == '\t' || c
== '\n' || c == '\r' ||
+ c == '\f' || c == '\v';
+ }),
+ value_str.end());
+ nullable_column->get_null_map_data().push_back(0);
+ column_string->insert_data(value_str.data(), value_str.length());
+ break;
+ }
+ default: {
+ auto str_view = simdjson::to_json_string(value).value();
+ if (value.type() == simdjson::ondemand::json_type::string) {
+ // trim
+ str_view = str_view.substr(1, str_view.length() - 2);
+ }
+ nullable_column->get_null_map_data().push_back(0);
+ column_string->insert_data(str_view.data(), str_view.length());
+ }
+ }
+ *valid = true;
+ return Status::OK();
+}
+
+Status NewJsonReader::_append_error_msg(simdjson::ondemand::object* obj,
std::string error_msg,
+ std::string col_name, bool* valid) {
+ std::string err_msg;
+ if (!col_name.empty()) {
+ fmt::memory_buffer error_buf;
+ fmt::format_to(error_buf, error_msg, col_name);
+ err_msg = fmt::to_string(error_buf);
+ } else {
+ err_msg = error_msg;
+ }
+
+ RETURN_IF_ERROR(_state->append_error_msg_to_file(
+ [&]() -> std::string {
+ if (!obj) {
+ return "";
+ }
+ std::string_view str_view;
+ (void)!obj->raw_json().get(str_view);
+ return std::string(str_view.data(), str_view.size());
+ },
+ [&]() -> std::string { return err_msg; }, _scanner_eof));
+
+ _counter->num_rows_filtered++;
+ if (valid != nullptr) {
+ // current row is invalid
+ *valid = false;
+ }
+ return Status::OK();
+}
+
+Status NewJsonReader::_simdjson_parse_json(bool* is_empty_row, bool* eof) {
+ size_t size = 0;
+ RETURN_IF_ERROR(_simdjson_parse_json_doc(&size, eof));
+
+ // read all data, then return
+ if (size == 0 || *eof) {
+ *is_empty_row = true;
+ return Status::OK();
+ }
+
+ if (!_parsed_jsonpaths.empty() && _strip_outer_array) {
+ _total_rows = _json_value.count_elements().value();
+ _next_row = 0;
+
+ if (_total_rows == 0) {
+ // meet an empty json array.
+ *is_empty_row = true;
+ }
+ }
+ return Status::OK();
+}
+Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
+ // read a whole message
+ SCOPED_TIMER(_file_read_timer);
+ const uint8_t* json_str = nullptr;
+ std::unique_ptr<uint8_t[]> json_str_ptr;
+ if (_line_reader != nullptr) {
+ RETURN_IF_ERROR(_line_reader->read_line(&json_str, size, eof));
+ } else {
+ size_t length = 0;
+ RETURN_IF_ERROR(_read_one_message(&json_str_ptr, &length));
+ json_str = json_str_ptr.get();
+ *size = length;
+ if (length == 0) {
+ *eof = true;
+ }
+ }
+
+ _bytes_read_counter += *size;
+ if (*eof) {
+ return Status::OK();
+ }
+ if (*size + simdjson::SIMDJSON_PADDING > _padded_size) {
+ // For efficiency reasons, simdjson requires a string with a few bytes
(simdjson::SIMDJSON_PADDING) at the end.
+ // Hence, a re-allocation is needed if the space is not enough.
+ _simdjson_ondemand_padding_buffer.resize(*size +
simdjson::SIMDJSON_PADDING);
+ _padded_size = *size + simdjson::SIMDJSON_PADDING;
+ }
+ memcpy(&_simdjson_ondemand_padding_buffer.front(), json_str, *size);
+ auto error =
+ _ondemand_json_parser
+
->iterate(std::string_view(_simdjson_ondemand_padding_buffer.data(), *size),
+ _padded_size)
+ .get(_original_json_doc);
+ auto return_quality_error = [&](fmt::memory_buffer& error_msg,
+ const std::string& doc_info) -> Status {
+ RETURN_IF_ERROR(_state->append_error_msg_to_file(
+ [&]() -> std::string { return doc_info; },
+ [&]() -> std::string { return fmt::to_string(error_msg); },
_scanner_eof));
+ _counter->num_rows_filtered++;
+ if (*_scanner_eof) {
+ // Case A: if _scanner_eof is set to true in
"append_error_msg_to_file", which means
+ // we meet enough invalid rows and the scanner should be stopped.
+ // So we set eof to true and return OK, the caller will stop the
process as we meet the end of file.
+ *eof = true;
+ return Status::OK();
+ }
+ return Status::DataQualityError(fmt::to_string(error_msg));
+ };
+ if (error != simdjson::error_code::SUCCESS) {
+ fmt::memory_buffer error_msg;
+ fmt::format_to(error_msg, "Parse json data for JsonDoc failed. code:
{}, error info: {}",
+ error, simdjson::error_message(error));
+ return return_quality_error(error_msg, std::string((char*)json_str,
*size));
+ }
+ try {
+ // set json root
+ // if it is an array at top level, then we should iterate the entire
array in
+ // ::_simdjson_handle_flat_array_complex_json
+ if (_parsed_json_root.size() != 0 &&
+ _original_json_doc.type() ==
simdjson::ondemand::json_type::object) {
+ simdjson::ondemand::object object = _original_json_doc;
+ Status st = JsonFunctions::extract_from_object(object,
_parsed_json_root, &_json_value);
+ if (!st.ok()) {
+ fmt::memory_buffer error_msg;
+ fmt::format_to(error_msg, "{}", st.to_string());
+ return return_quality_error(error_msg,
std::string((char*)json_str, *size));
+ }
+ } else {
+ _json_value = _original_json_doc;
+ }
+ } catch (simdjson::simdjson_error& e) {
+ fmt::memory_buffer error_msg;
+ fmt::format_to(error_msg, "Encounter error while extract_from_object,
error: {}", e.what());
+ return return_quality_error(error_msg, std::string((char*)json_str,
*size));
+ }
+
+ if (_json_value.type() == simdjson::ondemand::json_type::array &&
!_strip_outer_array) {
+ fmt::memory_buffer error_msg;
+ fmt::format_to(error_msg, "{}",
+ "JSON data is array-object, `strip_outer_array` must be
TRUE.");
+ return return_quality_error(error_msg, std::string((char*)json_str,
*size));
+ }
+
+ if (_json_value.type() != simdjson::ondemand::json_type::array &&
_strip_outer_array) {
+ fmt::memory_buffer error_msg;
+ fmt::format_to(error_msg, "{}",
+ "JSON data is not an array-object, `strip_outer_array`
must be FALSE.");
+ return return_quality_error(error_msg, std::string((char*)json_str,
*size));
+ }
+ return Status::OK();
+}
+
+Status NewJsonReader::_simdjson_write_columns_by_jsonpath(
+ simdjson::ondemand::object* value, const std::vector<SlotDescriptor*>&
slot_descs,
+ std::vector<MutableColumnPtr>& columns, bool* valid) {
+ // write by jsonpath
+ size_t column_num = slot_descs.size();
+ bool has_valid_value = false;
+ size_t cur_row_count = columns[0]->size();
+ for (size_t i = 0; i < column_num; i++) {
+ auto* column_ptr = columns[i].get();
+ simdjson::ondemand::value json_value;
+ Status st;
+ if (i < _parsed_jsonpaths.size()) {
+ st = JsonFunctions::extract_from_object(*value,
_parsed_jsonpaths[i], &json_value);
+ if (!st.ok() && !st.is<NOT_FOUND>()) {
+ return st;
+ }
+ }
+ if (i >= _parsed_jsonpaths.size() || st.is<NOT_FOUND>()) {
+ // not match in jsondata.
+ if (!slot_descs[i]->is_nullable()) {
+ RETURN_IF_ERROR(_append_error_msg(
+ value, "The column `{}` is not nullable, but it's not
found in jsondata.",
+ slot_descs[i]->col_name(), valid));
+ return Status::OK();
+ }
+ } else {
+ RETURN_IF_ERROR(
+ _simdjson_write_data_to_column(json_value, slot_descs[i],
column_ptr, valid));
+ if (!(*valid)) {
+ return Status::OK();
+ }
+ has_valid_value = true;
+ }
+ }
+ if (!has_valid_value) {
+ RETURN_IF_ERROR(
+ _append_error_msg(value, "All fields is null, this is a
invalid row.", "", valid));
+ return Status::OK();
+ }
+
+ // fill missing slot
+ int ctx_idx = 0;
+ int nullcount = 0;
+ for (auto slot_desc : slot_descs) {
+ if (!slot_desc->is_materialized()) {
+ continue;
+ }
+ int dest_index = ctx_idx++;
+ auto* column_ptr = columns[dest_index].get();
+ if (column_ptr->size() < cur_row_count + 1) {
+ DCHECK(column_ptr->size() == cur_row_count);
+ column_ptr->assume_mutable()->insert_default();
+ ++nullcount;
+ }
+ DCHECK(column_ptr->size() == cur_row_count + 1);
+ }
+ // There is at least one valid value here
+ DCHECK(nullcount < columns.size());
+ *valid = true;
+ return Status::OK();
+}
+
} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/json/new_json_reader.h
b/be/src/vec/exec/format/json/new_json_reader.h
index 0c14342204..42737f7094 100644
--- a/be/src/vec/exec/format/json/new_json_reader.h
+++ b/be/src/vec/exec/format/json/new_json_reader.h
@@ -107,6 +107,37 @@ private:
Status _read_one_message(std::unique_ptr<uint8_t[]>* file_buf, size_t*
read_size);
+    // simdjson: replace the non-simdjson functions once this path is ready
+ Status _simdjson_init_reader();
+ Status _simdjson_parse_json(bool* is_empty_row, bool* eof);
+ Status _simdjson_parse_json_doc(size_t* size, bool* eof);
+
+ Status _simdjson_handle_simple_json(std::vector<MutableColumnPtr>& columns,
+ const std::vector<SlotDescriptor*>&
slot_descs,
+ bool* is_empty_row, bool* eof);
+
+ Status
_simdjson_handle_flat_array_complex_json(std::vector<MutableColumnPtr>& columns,
+ const
std::vector<SlotDescriptor*>& slot_descs,
+ bool* is_empty_row, bool*
eof);
+
+ Status _simdjson_handle_nested_complex_json(std::vector<MutableColumnPtr>&
columns,
+ const
std::vector<SlotDescriptor*>& slot_descs,
+ bool* is_empty_row, bool* eof);
+
+ Status _simdjson_set_column_value(simdjson::ondemand::object* value,
+ std::vector<MutableColumnPtr>& columns,
+ const std::vector<SlotDescriptor*>&
slot_descs, bool* valid);
+
+ Status _simdjson_write_data_to_column(simdjson::ondemand::value& value,
+ SlotDescriptor* slot_desc,
+ vectorized::IColumn* column_ptr,
bool* valid);
+
+ Status _simdjson_write_columns_by_jsonpath(simdjson::ondemand::object*
value,
+ const
std::vector<SlotDescriptor*>& slot_descs,
+ std::vector<MutableColumnPtr>&
columns, bool* valid);
+ Status _append_error_msg(simdjson::ondemand::object* obj, std::string
error_msg,
+ std::string col_name, bool* valid);
+
Status (NewJsonReader::*_vhandle_json_callback)(
std::vector<vectorized::MutableColumnPtr>& columns,
const std::vector<SlotDescriptor*>& slot_descs, bool*
is_empty_row, bool* eof);
@@ -163,7 +194,22 @@ private:
RuntimeProfile::Counter* _file_read_timer;
bool _is_dynamic_schema = false;
+ // name mapping
+ phmap::flat_hash_map<String, size_t> _slot_desc_index;
+ // simdjson
+ static constexpr size_t _init_buffer_size = 1024 * 1024 * 8;
+ size_t _padded_size = _init_buffer_size + simdjson::SIMDJSON_PADDING;
+ std::string _simdjson_ondemand_padding_buffer;
+ // char _simdjson_ondemand_padding_buffer[_padded_size];
+ simdjson::ondemand::document _original_json_doc;
+ simdjson::ondemand::value _json_value;
+ // for strip outer array
+ // array_iter pointed to _array
+ simdjson::ondemand::array_iterator _array_iter;
+ simdjson::ondemand::array _array;
std::unique_ptr<JSONDataParser<SimdJSONParser>> _json_parser;
+ std::unique_ptr<simdjson::ondemand::parser> _ondemand_json_parser =
nullptr;
};
+
} // namespace vectorized
} // namespace doris
diff --git a/docs/en/docs/admin-manual/config/be-config.md
b/docs/en/docs/admin-manual/config/be-config.md
index 180e36f5b7..e224649bd9 100644
--- a/docs/en/docs/admin-manual/config/be-config.md
+++ b/docs/en/docs/admin-manual/config/be-config.md
@@ -1415,4 +1415,9 @@ Indicates how many tablets failed to load in the data
directory. At the same tim
* Description: Whether parse multidimensional array, if false encountering
will return ERROR
* Default value: true
+#### `enable_simdjson_reader`
+
+* Description: Whether to enable simdjson to parse JSON data during stream load
+* Default value: false
+
</version>
diff --git a/docs/zh-CN/docs/admin-manual/config/be-config.md
b/docs/zh-CN/docs/admin-manual/config/be-config.md
index 734e264b40..65531b8824 100644
--- a/docs/zh-CN/docs/admin-manual/config/be-config.md
+++ b/docs/zh-CN/docs/admin-manual/config/be-config.md
@@ -1433,4 +1433,9 @@ load tablets from header failed, failed tablets size:
xxx, path=xxx
* 描述: 在动态表中是否解析多维数组,如果是false遇到多维数组则会报错。
* 默认值: true
+#### `enable_simdjson_reader`
+
+* 描述: 是否在导入json数据时用simdjson来解析。
+* 默认值: false
+
</version>
diff --git a/regression-test/data/load_p0/stream_load/invalid_json.json
b/regression-test/data/load_p0/stream_load/invalid_json.json
index 4ee59e809e..f6f945893a 100644
--- a/regression-test/data/load_p0/stream_load/invalid_json.json
+++ b/regression-test/data/load_p0/stream_load/invalid_json.json
@@ -1,3 +1,40 @@
{"no": 1, "item: {"id": 1, "city": "beijing", "code": 2345671}}
{"no": 2, "item": {}}
{"no": 2, "item": {"id":"123}}
+{"no": 2, item": {"id":"123}}
+{"no": 2, "item": ["id":"123'}
+{hno": 2, "item": ["id":"123'}
+hno": 2, [],"item": ["id":"123'}
+hno": , [],"item": ["id":"123'}
+{"no": 2, "item": {"id", }}
+{"no": 2, "item": {"id", ""}}
+{"no": 2, "item": {"id" : "123", ""}}
+{"no": 2, "item": {"id" : "123", "}}
+{"no": 2, "item": [{"id" : "123"}]}
+{"no": 2, "item": [{"id" : "123"}]}
+{
+}
+{"ni",}
+}{
+-
++
+/
+"{}"
+{1}
+{[12]}
+{1:"1["}
+{"no" : 1, "item" : {"id"}}}
+{"no" : 1, "item" : "id"}}}
+{"no" : 1, "item" : ["id"}}}
+"
+["1"]
+[
+]
+{}
+null
+{null}
+{"no" : 1, "item" : {"id" : 1921}}"}
+{"no" : 1, "item" : {"id" : 1921}}
+{"no" : "xxx", "item" : {"x":"123", "id" : 1281111}}
+{"no" : 1.22, "texx": "111", "item" : {"x":"123", "id" : 17117171}}
+{"no" : 10011.0, "texx": "111", "item" : {"id" : null}, "item" : 191218}
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/invalid_json_array.json
b/regression-test/data/load_p0/stream_load/invalid_json_array.json
index 7f1e5f1884..e98771f034 100644
--- a/regression-test/data/load_p0/stream_load/invalid_json_array.json
+++ b/regression-test/data/load_p0/stream_load/invalid_json_array.json
@@ -6,6 +6,10 @@
{"v6": "6514.405051", "k4": "6000", "k3": "600", "k2": "60", "k1": "6",
"v3": "obdrei", "v2": "m", "v1": "2010-01-06", "k5": "2016-01-01 00:00:00",
"v5": "882.708491", "v4": "921.867848"},
{"v6": "8604.198677", "k4": "7000", "k3": "700", "k2": "70", "k1": "7",
"v3": "cuobdhvrgkugknj", "v2": "a", "v1": "2010-01-07", "k5": "2017-01-01
00:00:00", "v5": "209.420112", "v4": "141.656421"},
{"v6": "7784.859446", "k4": "8000", "k3": "800", "k2": "80", "k1": "8",
"v3": "phcxztwgjllhmj", "v2": "z", "v1": "2010-01-08", "k5": "2018-01-01
00:00:00", "v5": "285.664871", "v4": "762.813376"},
- {"v6": "4846.735593", "k4": "9000", "k3": "900", "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk", "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
- {"v6": "7996.434686", "k4": "10000", "k3": "1000", "k2": "100", "k1":
"10", "v3": "zucprgdnlgzzfl", "v2": "s", "v1": "2010-01-10", "k5": "2020-01-01
00:00:00", "v5": "155.861217", "v4": "26.874738"},]
+ {"v6": "4846.735593", "k4": "9000", "k3": "900", "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000", "k3": "900", "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000", "k3": "900, "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000", "k3": "900, "k2":} "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000",}"k3": "900, "k2":} "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": "7996.434686", "k4": "10000", "k3": "1000", "k2": "100", "k1":
"10", "v3": "zucprgdnlgzzfl", "v2": "s", "v1": "2010-01-10", "k5": "2020-01-01
00:00:00", "v5": "155.861217", "v4": "26.874738"}]
diff --git a/regression-test/data/load_p0/stream_load/invalid_json_array.json
b/regression-test/data/load_p0/stream_load/invalid_json_array1.json
similarity index 57%
copy from regression-test/data/load_p0/stream_load/invalid_json_array.json
copy to regression-test/data/load_p0/stream_load/invalid_json_array1.json
index 7f1e5f1884..293831d8b6 100644
--- a/regression-test/data/load_p0/stream_load/invalid_json_array.json
+++ b/regression-test/data/load_p0/stream_load/invalid_json_array1.json
@@ -1,4 +1,4 @@
- [ {"v6": "7395.231067", "k4": "1000", "k3": "100", "k2": "10", "k1": "1",
"v3": "ynqnzeowymt", "v2": "t", "v1": "2010-01-01", "k5": "2011-01-01
00:00:00", "v5": "180.998031", "v4": "38.638843"},
+ { "item": [{"v6": "7395.231067", "k4": "1000", "k3": "100", "k2": "10",
"k1": "1", "v3": "ynqnzeowymt", "v2": "t", "v1": "2010-01-01", "k5":
"2011-01-01 00:00:00", "v5": "180.998031", "v4": "38.638843"},
{"v6": "2080.504502", "k4": "2000", "k3": "200", "k2": "20", "k1": "2",
"v3": "hfkfwlr", "v2": "f", "v1": "2010-01-02", "k5": "2012-01-01 00:00:00",
"v5": "539.922834", "v4": "506.044046"},
{"v6": "4605.253205", "k4": "3000", "k3": "300", "k2": "30", "k1": "3",
"v3": "uoclasp", "v2": "t", "v1": "2010-01-03", "k5": "2013-01-01 00:00:00",
"v5": "577.044148", "v4": "377.793209"},
{"v6": "7291.703724", "k4": "4000", "k3": "400", "k2": "40", "k1": "4",
"v3": "iswngzeodfhptjzgswsddt", "v2": "n", "v1": "2010-01-04", "k5":
"2014-01-01 00:00:00", "v5": "919.067864", "v4": "871.354536"},
@@ -6,6 +6,10 @@
{"v6": "6514.405051", "k4": "6000", "k3": "600", "k2": "60", "k1": "6",
"v3": "obdrei", "v2": "m", "v1": "2010-01-06", "k5": "2016-01-01 00:00:00",
"v5": "882.708491", "v4": "921.867848"},
{"v6": "8604.198677", "k4": "7000", "k3": "700", "k2": "70", "k1": "7",
"v3": "cuobdhvrgkugknj", "v2": "a", "v1": "2010-01-07", "k5": "2017-01-01
00:00:00", "v5": "209.420112", "v4": "141.656421"},
{"v6": "7784.859446", "k4": "8000", "k3": "800", "k2": "80", "k1": "8",
"v3": "phcxztwgjllhmj", "v2": "z", "v1": "2010-01-08", "k5": "2018-01-01
00:00:00", "v5": "285.664871", "v4": "762.813376"},
- {"v6": "4846.735593", "k4": "9000", "k3": "900", "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk", "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
- {"v6": "7996.434686", "k4": "10000", "k3": "1000", "k2": "100", "k1":
"10", "v3": "zucprgdnlgzzfl", "v2": "s", "v1": "2010-01-10", "k5": "2020-01-01
00:00:00", "v5": "155.861217", "v4": "26.874738"},]
+ "v6": "4846.735593", "k4": "9000", "k3": "900", "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000", "k3": "900", "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000", "k3": "900, "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000", "k3": "900, "k2":} "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000",}"k3": "900, "k2":} "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": "7996.434686", "k4": "10000", "k3": "1000", "k2": "100", "k1":
"10", "v3": "zucprgdnlgzzfl", "v2": "s", "v1": "2010-01-10", "k5": "2020-01-01
00:00:00", "v5": "155.861217", "v4": "26.874738"}]}
diff --git a/regression-test/data/load_p0/stream_load/invalid_json_array2.json
b/regression-test/data/load_p0/stream_load/invalid_json_array2.json
new file mode 100644
index 0000000000..a8022b3dbe
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/invalid_json_array2.json
@@ -0,0 +1,121 @@
+{"item" : {
+ "v6": "7395.231067",
+ "k4": "1000",
+ "k3": "100",
+ "k2": "10",
+ "k1": "1",
+ "v3": "ynqnzeowymt",
+ "v2": "t",
+ "v1": "2010-01-01",
+ "k5": "2011-01-01 00:00:00",
+ "v5": "180.998031",
+ "v4": "38.638843"
+},
+{
+ "v6": "2080.504502",
+ "k4": "2000",
+ "k3": "200",
+ "k2": "20",
+ "k1": "2",
+ "v3": "hfkfwlr",
+ "v2": "f",
+ "v1": "2010-01-02",
+ "k5": "2012-01-01 00:00:00",
+ "v5": "539.922834",
+ "v4": "506.044046"
+},
+{
+ "v6": "4605.253205",
+ "k4": "3000",
+ "k3": "300",
+ "k2": "30",
+ "k1": "3",
+ "v3": "uoclasp",
+ "v2": "t",
+ "v1": "2010-01-03",
+ "k5": "2013-01-01 00:00:00",
+ "v5": "577.044148",
+ "v4": "377.793209"
+},
+{
+ "v6": "7291.703724",
+ "k4": "4000",
+ "k3": "400",
+ "k2": "40",
+ "k1": "4",
+ "v3": "iswngzeodfhptjzgswsddt",
+ "v2": "n",
+ "v1": "2010-01-04",
+ "k5": "2014-01-01 00:00:00",
+ "v5": "919.067864",
+ "v4": "871.354536"
+},
+{
+ "fake" : null
+},
+{
+ "v6": "6514.405051",
+ "k4": "6000",
+ "k3": "600",
+ "k2": "60",
+ "k1": "6",
+ "v3": "obdrei",
+ "v2": "m",
+ "v1": "2010-01-06",
+ "k5": "2016-01-01 00:00:00",
+ "v5": "882.708491",
+ "v4": "921.867848"
+},
+{
+ "v6": "8604.198677",
+ "k4": "7000",
+ "k3": "700",
+ "k2": "70",
+ "k1": "7",
+ "v3": "cuobdhvrgkugknj",
+ "v2": "a",
+ "v1": "2010-01-07",
+ "k5": "2017-01-01 00:00:00",
+ "v5": "209.420112",
+ "v4": "141.656421"
+},
+{
+ "v6": "7784.859446",
+ "k4": "8000",
+ "k3": "800",
+ "k2": "80",
+ "k1": "8",
+ "v3": "phcxztwgjllhmj",
+ "v2": "z",
+ "v1": "2010-01-08",
+ "k5": "2018-01-01 00:00:00",
+ "v5": "285.664871",
+ "v4": "762.813376"
+},
+
+ "v6": "4846.735593",
+ "k4": "9000",
+ "k3": "900",
+ "k2": "90",
+ "k1": "9",
+ "v3": "nbarqjwilbkelk",
+ "v2": "b",
+ "v1": "2010-01-09",
+ "k5": "2019-01-01 00:00:00",
+ "v5": "535.285510",
+ "v4": "92.702403"
+},
+{
+ "v6": "7996.434686",
+ "k4": "10000",
+ "k3": "1000",
+ "k2": "100",
+ "k1": "10",
+ "v3": "zucprgdnlgzzfl",
+ "v2": "s",
+ "v1": "2010-01-10",
+ "k5": "2020-01-01 00:00:00",
+ "v5": "155.861217",
+ "v4": "26.874738"
+}
+] }
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/invalid_json_array3.json
b/regression-test/data/load_p0/stream_load/invalid_json_array3.json
new file mode 100644
index 0000000000..a53e6e727a
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/invalid_json_array3.json
@@ -0,0 +1,123 @@
+{
+ "item": [{
+ "v6": "7395.231067",
+ "k4": "1000",
+ "k3": "100",
+ "k2": "10",
+ "k1": "1",
+ "v3": "ynqnzeowymt",
+ "v2": "t",
+ "v1": "2010-01-01",
+ "k5": "2011-01-01 00:00:00",
+ "v5": "180.998031",
+ "v4": "38.638843"
+ ,
+ {
+ "v6": "2080.504502",
+ "k4": "2000",
+ "k3": "200",
+ "k2": "20",
+ "k1": "2",
+ "v3": "hfkfwlr",
+ "v2": "f",
+ "v1": "2010-01-02",
+ "k5": "2012-01-01 00:00:00",
+ "v5": "539.922834",
+ "v4": "506.044046"
+ },
+ {
+ "v6": "4605.253205",
+ "k4": "3000",
+ "k3": "300",
+ "k2": "30",
+ "k1": "3",
+ "v3": "uoclasp",
+ "v2": "t",
+ "v1": "2010-01-03",
+ "k5": "2013-01-01 00:00:00",
+ "v5": "577.044148",
+ "v4": "377.793209"
+ },
+ {
+ "v6": "7291.703724",
+ "k4": "4000",
+ "k3": "400",
+ "k2": "40",
+ "k1": "4",
+ "v3": "iswngzeodfhptjzgswsddt",
+ "v2": "n",
+ "v1": "2010-01-04",
+ "k5": "2014-01-01 00:00:00",
+ "v5": "919.067864",
+ "v4": "871.354536"
+ },
+ {
+ "fake": null
+ },
+ {
+ "v6": "6514.405051",
+ "k4": "6000",
+ "k3": "600",
+ "k2": "60",
+ "k1": "6",
+ "v3": "obdrei",
+ "v2": "m",
+ "v1": "2010-01-06",
+ "k5": "2016-01-01 00:00:00",
+ "v5": "882.708491",
+ "v4": "921.867848"
+ },
+ {
+ "v6": "8604.198677",
+ "k4": "7000",
+ "k3": "700",
+ "k2": "70",
+ "k1": "7",
+ "v3": "cuobdhvrgkugknj",
+ "v2": "a",
+ "v1": "2010-01-07",
+ "k5": "2017-01-01 00:00:00",
+ "v5": "209.420112",
+ "v4": "141.656421"
+ },
+ {
+ "v6": "7784.859446",
+ "k4": "8000",
+ "k3": "800",
+ "k2": "80",
+ "k1": "8",
+ "v3": "phcxztwgjllhmj",
+ "v2": "z",
+ "v1": "2010-01-08",
+ "k5": "2018-01-01 00:00:00",
+ "v5": "285.664871",
+ "v4": "762.813376"
+ },
+ {
+ "v6": "4846.735593",
+ "k4": "9000",
+ "k3": "900",
+ "k2": "90",
+ "k1": "9",
+ "v3": "nbarqjwilbkelk",
+ "v2": "b",
+ "v1": "2010-01-09",
+ "k5": "2019-01-01 00:00:00",
+ "v5": "535.285510",
+ "v4": "92.702403"
+ },
+ {
+ "v6": "7996.434686",
+ "k4": "10000",
+ "k3": "1000",
+ "k2": "100",
+ "k1": "10",
+ "v3": "zucprgdnlgzzfl",
+ "v2": "s",
+ "v1": "2010-01-10",
+ "k5": "2020-01-01 00:00:00",
+ "v5": "155.861217",
+ "v4": "26.874738"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/invalid_json_array.json
b/regression-test/data/load_p0/stream_load/invalid_json_array4.json
similarity index 57%
copy from regression-test/data/load_p0/stream_load/invalid_json_array.json
copy to regression-test/data/load_p0/stream_load/invalid_json_array4.json
index 7f1e5f1884..293831d8b6 100644
--- a/regression-test/data/load_p0/stream_load/invalid_json_array.json
+++ b/regression-test/data/load_p0/stream_load/invalid_json_array4.json
@@ -1,4 +1,4 @@
- [ {"v6": "7395.231067", "k4": "1000", "k3": "100", "k2": "10", "k1": "1",
"v3": "ynqnzeowymt", "v2": "t", "v1": "2010-01-01", "k5": "2011-01-01
00:00:00", "v5": "180.998031", "v4": "38.638843"},
+ { "item": [{"v6": "7395.231067", "k4": "1000", "k3": "100", "k2": "10",
"k1": "1", "v3": "ynqnzeowymt", "v2": "t", "v1": "2010-01-01", "k5":
"2011-01-01 00:00:00", "v5": "180.998031", "v4": "38.638843"},
{"v6": "2080.504502", "k4": "2000", "k3": "200", "k2": "20", "k1": "2",
"v3": "hfkfwlr", "v2": "f", "v1": "2010-01-02", "k5": "2012-01-01 00:00:00",
"v5": "539.922834", "v4": "506.044046"},
{"v6": "4605.253205", "k4": "3000", "k3": "300", "k2": "30", "k1": "3",
"v3": "uoclasp", "v2": "t", "v1": "2010-01-03", "k5": "2013-01-01 00:00:00",
"v5": "577.044148", "v4": "377.793209"},
{"v6": "7291.703724", "k4": "4000", "k3": "400", "k2": "40", "k1": "4",
"v3": "iswngzeodfhptjzgswsddt", "v2": "n", "v1": "2010-01-04", "k5":
"2014-01-01 00:00:00", "v5": "919.067864", "v4": "871.354536"},
@@ -6,6 +6,10 @@
{"v6": "6514.405051", "k4": "6000", "k3": "600", "k2": "60", "k1": "6",
"v3": "obdrei", "v2": "m", "v1": "2010-01-06", "k5": "2016-01-01 00:00:00",
"v5": "882.708491", "v4": "921.867848"},
{"v6": "8604.198677", "k4": "7000", "k3": "700", "k2": "70", "k1": "7",
"v3": "cuobdhvrgkugknj", "v2": "a", "v1": "2010-01-07", "k5": "2017-01-01
00:00:00", "v5": "209.420112", "v4": "141.656421"},
{"v6": "7784.859446", "k4": "8000", "k3": "800", "k2": "80", "k1": "8",
"v3": "phcxztwgjllhmj", "v2": "z", "v1": "2010-01-08", "k5": "2018-01-01
00:00:00", "v5": "285.664871", "v4": "762.813376"},
- {"v6": "4846.735593", "k4": "9000", "k3": "900", "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk", "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
- {"v6": "7996.434686", "k4": "10000", "k3": "1000", "k2": "100", "k1":
"10", "v3": "zucprgdnlgzzfl", "v2": "s", "v1": "2010-01-10", "k5": "2020-01-01
00:00:00", "v5": "155.861217", "v4": "26.874738"},]
+ "v6": "4846.735593", "k4": "9000", "k3": "900", "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000", "k3": "900", "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000", "k3": "900, "k2": "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000", "k3": "900, "k2":} "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": 4846.735593", "k4": "9000",}"k3": "900, "k2":} "90", "k1": "9",
"v3": "nbarqjwilbkelk" "v2": "b", "v1": "2010-01-09", "k5": "2019-01-01
00:00:00", "v5": "535.285510", "v4": "92.702403"},
+ {"v6": "7996.434686", "k4": "10000", "k3": "1000", "k2": "100", "k1":
"10", "v3": "zucprgdnlgzzfl", "v2": "s", "v1": "2010-01-10", "k5": "2020-01-01
00:00:00", "v5": "155.861217", "v4": "26.874738"}]}
diff --git a/regression-test/data/load_p0/stream_load/invalid_nest_json1.json
b/regression-test/data/load_p0/stream_load/invalid_nest_json1.json
new file mode 100644
index 0000000000..09caf85733
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/invalid_nest_json1.json
@@ -0,0 +1,5 @@
+{"no": 1, "item": {"id": 1, "city": "beijing", "code": 2345671}}
+{"no": 2, "item": {"id": 2, "city" "shanghai", "code": 2345672}}
+{"no": 3, "item": {"id": 3, "city": "hangzhou", "code": 2345673}}
+{"no": 4, "item": id": 4, "city": "shenzhen", "code": 2345674}}
+{"no": 5, "item": {"id": 5, "city": "guangzhou", "code": 2345675}}
diff --git a/regression-test/data/load_p0/stream_load/invalid_nest_json2.json
b/regression-test/data/load_p0/stream_load/invalid_nest_json2.json
new file mode 100644
index 0000000000..46da8b855c
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/invalid_nest_json2.json
@@ -0,0 +1,16 @@
+{"no": 1, "item": {"id": 1, "city": "beijing", "code": 2345671}}
+{"no": 2, "item": {"id": 2, "city": "shanghai", "code": 2345672}}
+{"no": 3, "item": ["id": 3, "city"], "hangzhou", "code": 2345673}}
+{"no": 3, "xxxxx": ["id": 3, "city"], "hangzhou", "code": 2345673}}
+{"no": 3, xxxxx": ["id": 3, "city"], "hangzhou", "code": 2345673}}
+{"no": 4, "item": {"id": 4, "city": "shenzhen", "code": 2345674}}
+{"no": 5, "item": {"id": 5, "city": "guangzhou", "code": 2345675}}
+{"no": 5, "item": {"id": 5, "city": ["guangzhou"], "code": 2345675}}
+{"no": 5, "item": {"id": 5, "city": {"guangzhou": 1}, "code": 2345675}}
+{"no": 5, "item": {"id": 5, "city": ["guangzhou", "code": 2345675]]}
+{"no": 5, "item": {"id": 5, "city": ["guangzhou", "code": 2345675}}
+{"no": 5, "item": {"id": 5, "city": {"guangzhou", "code": 2345675}}
+{"no": 5, "item": {"id": 5, "city": {"guangzhou":1, "code": 2345675}}}
+{"no": 5, "item": {"id": 5, "city": "1}}}
+{"no": 5, "item": {"id": 5, "city": "1]}}}
+{"no": 5, ["item": {"id": 5, "city": {"guangzhou": 1, "code": 2345675}]}
diff --git a/regression-test/data/load_p0/stream_load/invalid_nest_json3.json
b/regression-test/data/load_p0/stream_load/invalid_nest_json3.json
new file mode 100644
index 0000000000..57d8a6fc74
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/invalid_nest_json3.json
@@ -0,0 +1,5 @@
+no": 1, "item": {"id": 1, "city": "beijing", "code": 2345671}}
+{"no": , "item": {"id": 2, "city": "shanghai", "code": 2345672}}
+{"no": 3, "item": ["id": 3, "city": "hangzhou", "code": 2345673}}
+{"no": 4, item": {"id": 4, "city": "shenzhen", "code": 2345674}}
+{"no": 5, "item": {"id": 5, "city": guangzhou", "code": 2345675}}
diff --git
a/regression-test/data/load_p0/stream_load/invalid_nest_json_array.json
b/regression-test/data/load_p0/stream_load/invalid_nest_json_array.json
new file mode 100644
index 0000000000..b7874664ad
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/invalid_nest_json_array.json
@@ -0,0 +1,26 @@
+[
+ {
+ "no": 1,
+ "item": {
+ "id": 1,
+ "city": [
+ "zhejiang",
+ "hangzhou",
+ "xihu"
+ ],
+ "code": 2345671
+ }
+ },
+ {
+ "no": 2,
+ "item":
+ "id": 2,
+ "city": [
+ "zhejiang",
+ "hangzhou",
+ "xiaoshan"
+ ],
+ "code": 2345672
+ }
+ }
+]
\ No newline at end of file
diff --git
a/regression-test/data/load_p0/stream_load/invalid_nest_json_array1.json
b/regression-test/data/load_p0/stream_load/invalid_nest_json_array1.json
new file mode 100644
index 0000000000..5ebe4ef998
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/invalid_nest_json_array1.json
@@ -0,0 +1,26 @@
+{
+ {
+ "no": 1,
+ "item": {
+ "id": 1,
+ city": [
+ "zhejiang",
+ "hangzhou",
+ "xihu"
+ ],
+ "code": 2345671
+ }
+ },
+ {
+ "no": 2,
+ "item":
+ "id": 2,
+ "city": [
+ "zhejiang",
+ "hangzhou",
+ "xiaoshan"
+ ],
+ "code": 2345672
+ }
+ }
+]
\ No newline at end of file
diff --git
a/regression-test/data/load_p0/stream_load/invalid_nest_json_array2.json
b/regression-test/data/load_p0/stream_load/invalid_nest_json_array2.json
new file mode 100644
index 0000000000..4685f528a1
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/invalid_nest_json_array2.json
@@ -0,0 +1,26 @@
+[
+ {
+ "no": 1,
+ "tem": {{
+ "id": 1,
+ "city": [
+ "zhejiang",
+ "hangzhou",
+ "xihu"
+ ],
+ "code": 2345671
+ }
+ },
+ {
+ "no": 2,
+ "item":
+ "id": 2,
+ "city": [
+ "zhejiang",
+ "hangzhou",
+ "xiaoshan"
+ ],
+ "code": 2345672
+ }
+ }
+]
\ No newline at end of file
diff --git
a/regression-test/data/load_p0/stream_load/invalid_nest_json_array3.json
b/regression-test/data/load_p0/stream_load/invalid_nest_json_array3.json
new file mode 100644
index 0000000000..a493bfd7d0
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/invalid_nest_json_array3.json
@@ -0,0 +1,25 @@
+[
+ {
+ "no": 1,
+ "item": {
+ "id": 1,
+ "city": [
+ "zhejiang",
+ "hangzhou",
+ "xihu
+ ],
+ "code": 2345671
+ },
+ {
+ "no": 2,
+ "item":
+ "id": 2,
+ "city": [
+ "zhejiang",
+ "hangzhou",
+ "xiaoshan"
+ ],
+ "code": 2345672
+ }
+ }
+]
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/nest_json_array.json
b/regression-test/data/load_p0/stream_load/nest_json_array.json
index b8b3cf917d..af2605f98f 100644
--- a/regression-test/data/load_p0/stream_load/nest_json_array.json
+++ b/regression-test/data/load_p0/stream_load/nest_json_array.json
@@ -50,7 +50,7 @@
{
"no": 5,
"item": {
- "id": 5,
+ "id" : 5,
"city": [
"zhejiang",
"hangzhou",
@@ -70,5 +70,46 @@
],
"code": 2345676
}
+ },
+ { "no": 7,
+ "item": {
+ "idx": 2,
+ "cityx": [
+ "zhejiang",
+ "hangzhou",
+ "xiaoshan"
+ ],
+ "codex": 2345672
+ }
+ },
+ { "no": 7,
+ "item": {
+ }
+ },
+ { "no": 7,
+ "item": []
+ },
+ { "no": 7,
+ "itemxxx": {}
+ },
+ { "no": 7,
+ "item": "123"
+ },
+ {
+ "no": 7,
+ "item": {
+ }
+ },
+ {
+ "no": 8,
+ "item": {
+ "id": 7,
+ "city": [
+ "zhejiang",
+ "hangzhou",
+ "fuyang"
+ ],
+ "code": 2345676
+ }
}
]
\ No newline at end of file
diff --git a/regression-test/data/load_p0/stream_load/simple_json2.json
b/regression-test/data/load_p0/stream_load/simple_json2.json
index eb698453de..ca3197bbf9 100644
--- a/regression-test/data/load_p0/stream_load/simple_json2.json
+++ b/regression-test/data/load_p0/stream_load/simple_json2.json
@@ -9,6 +9,8 @@
"id": 2,
"city": "shanghai"
},
+ {},
+ {"xxx":1},
{
"code": 2345673,
"id": 3,
diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out
b/regression-test/data/load_p0/stream_load/test_json_load.out
index d0de96b7d7..b297fa2236 100644
--- a/regression-test/data/load_p0/stream_load/test_json_load.out
+++ b/regression-test/data/load_p0/stream_load/test_json_load.out
@@ -102,7 +102,11 @@
200 changsha 3456789
-- !select10 --
+\N \N \N
200 changsha 3456789
+19210 \N \N
+12811110 \N \N
+171171710 \N \N
-- !select11 --
1 beijing 2345671
@@ -141,10 +145,17 @@
-- !select14 --
10 2345671 \N
+10 beijing 2345671
20 2345672 \N
+20 shanghai 2345672
30 2345673 \N
40 2345674 \N
+40 shenzhen 2345674
50 2345675 \N
+50 {"guangzhou":1,"code":2345675} \N
+50 ["guangzhou"] 2345675
+50 guangzhou 2345675
+50 {"guangzhou":1} 2345675
200 changsha 3456789
-- !select15 --
@@ -162,5 +173,11 @@
4 shangcheng 2345674
5 tonglu 2345675
6 fuyang 2345676
+7 fuyang 2345676
+200 changsha 3456789
+
+-- !select17 --
+
+-- !select18 --
200 changsha 3456789
diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy
b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
index d2c5d15fe0..f874eb2b87 100644
--- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy
+++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy
@@ -115,7 +115,8 @@ suite("test_json_load", "p0") {
}
def load_json_data = {label, strip_flag, read_flag, format_flag, exprs,
json_paths,
- json_root, where_expr, fuzzy_flag, file_name,
ignore_failure=false ->
+ json_root, where_expr, fuzzy_flag, file_name,
ignore_failure=false,
+ expected_succ_rows = -1 ->
// load the json data
streamLoad {
@@ -133,19 +134,26 @@ suite("test_json_load", "p0") {
set 'fuzzy_parse', fuzzy_flag
file file_name // import json file
time 10000 // limit inflight 10s
+ if (expected_succ_rows >= 0) {
+ set 'max_filter_ratio', '1'
+ }
// if declared a check callback, the default check condition will
ignore.
// So you must check all condition
check { result, exception, startTime, endTime ->
- if (ignore_failure) { return }
+ if (ignore_failure && expected_succ_rows < 0) { return }
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
- assertEquals(json.NumberTotalRows, json.NumberLoadedRows +
json.NumberUnselectedRows)
- assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+ if (expected_succ_rows >= 0) {
+ assertEquals(json.NumberLoadedRows, expected_succ_rows)
+ } else {
+ assertEquals(json.NumberTotalRows, json.NumberLoadedRows +
json.NumberUnselectedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+ }
}
}
}
@@ -350,13 +358,13 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
load_json_data.call('test_json_load_case10_2', '', 'true', 'json',
'id= id * 10', '',
- '$.item', '', 'true', 'invalid_json.json', true)
+ '$.item', '', 'false', 'invalid_json.json', false,
4)
sql "sync"
qt_select10 "select * from ${testTable} order by id"
} finally {
- try_sql("DROP TABLE IF EXISTS ${testTable}")
+ // try_sql("DROP TABLE IF EXISTS ${testTable}")
}
// case11: test json file which is unordered and no use json_path
@@ -365,7 +373,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
- load_json_data.call('test_json_load_case11_2', 'true', '', 'json', '',
'', '', '', '', 'simple_json2.json')
+ load_json_data.call('test_json_load_case11_2', 'true', '', 'json', '',
'', '', '', '', 'simple_json2.json', false, 10)
sql "sync"
qt_select11 "select * from ${testTable} order by id"
@@ -435,11 +443,19 @@ suite("test_json_load", "p0") {
load_json_data.call('test_json_load_case14_2', '', 'true', 'json',
'id= id * 10', '[\"$.id\", \"$.code\"]',
'$.item', '', 'true', 'nest_json.json')
+ // invalid nest_json
+ load_json_data.call('test_json_load_case14_3', '', 'true', 'json',
'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]',
+ '$.item', '', 'true', 'invalid_nest_json1.json',
true)
+ load_json_data.call('test_json_load_case14_4', '', 'true', 'json',
'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]',
+ '$.item', '', 'true', 'invalid_nest_json2.json',
false, 7)
+ load_json_data.call('test_json_load_case14_5', '', 'true', 'json',
'id= id * 10', '[\"$.id\", \"$.city\", \"$.code\"]',
+ '$.item', '', 'true', 'invalid_nest_json3.json',
true)
+
sql "sync"
- qt_select14 "select * from ${testTable} order by id"
+ qt_select14 "select * from ${testTable} order by id, code, city"
} finally {
- try_sql("DROP TABLE IF EXISTS ${testTable}")
+ // try_sql("DROP TABLE IF EXISTS ${testTable}")
}
// case15: apply jsonpaths & exprs & json_root
@@ -465,7 +481,7 @@ suite("test_json_load", "p0") {
create_test_table1.call(testTable)
load_json_data.call('test_json_load_case16_2', 'true', '', 'json',
'id, code, city',
- '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item',
'', 'true', 'nest_json_array.json')
+ '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item',
'', 'true', 'nest_json_array.json', false, 7)
sql "sync"
qt_select16 "select * from ${testTable} order by id"
@@ -473,6 +489,47 @@ suite("test_json_load", "p0") {
} finally {
try_sql("DROP TABLE IF EXISTS ${testTable}")
}
+
+ // case17: invalid json
+ try {
+ sql "DROP TABLE IF EXISTS ${testTable}"
+
+ test_invalid_json_array_table.call(testTable)
+ load_json_data.call('test_json_load_case17', 'true', '', 'json', '',
'',
+ '', '', '', 'invalid_json_array.json', false, 0)
+ load_json_data.call('test_json_load_case17_1', 'true', '', 'json', '',
'',
+ '$.item', '', '', 'invalid_json_array1.json', false, 0)
+ load_json_data.call('test_json_load_case17_2', 'true', '', 'json', '',
'',
+ '$.item', '', '', 'invalid_json_array2.json', false, 0)
+ load_json_data.call('test_json_load_case17_3', 'true', '', 'json', '',
'',
+ '$.item', '', '', 'invalid_json_array3.json', false, 0)
+ sql "sync"
+ qt_select17 "select * from ${testTable}"
+
+ } finally {
+ try_sql("DROP TABLE IF EXISTS ${testTable}")
+ }
+
+ // case18: invalid nest json
+ try {
+ sql "DROP TABLE IF EXISTS ${testTable}"
+
+ create_test_table1.call(testTable)
+ load_json_data.call('test_json_load_case16_2', 'true', '', 'json',
'id, code, city',
+ '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item',
'', 'true', 'invalid_nest_json_array.json', true)
+ load_json_data.call('test_json_load_case16_2', 'true', '', 'json',
'id, code, city',
+ '[\"$.id\", \"$.code\", \"$.city[100]\"]',
'$.item', '', 'true', 'invalid_nest_json_array1.json', true)
+ load_json_data.call('test_json_load_case16_2', 'true', '', 'json',
'id, code, city',
+ '[\"$.id\", \"$.code\", \"$.city\"]', '$.item',
'', 'true', 'invalid_nest_json_array2.json', true)
+ load_json_data.call('test_json_load_case16_2', 'true', '', 'json',
'id, code, city',
+ '[\"$.id\", \"$.code\", \"$.city[2]\"]', '$.item',
'', 'true', 'invalid_nest_json_array3.json', true)
+
+ sql "sync"
+ qt_select18 "select * from ${testTable} order by id"
+
+ } finally {
+ try_sql("DROP TABLE IF EXISTS ${testTable}")
+ }
// if 'enableHdfs' in regression-conf.groovy has been set to true,
// the test will run these case as below.
@@ -483,7 +540,7 @@ suite("test_json_load", "p0") {
def hdfs_file_path = uploadToHdfs "stream_load/simple_object_json.json"
def format = "json"
- // case17: import json use pre-filter exprs
+ // case18: import json use pre-filter exprs
try {
sql "DROP TABLE IF EXISTS ${testTable}"
@@ -498,7 +555,7 @@ suite("test_json_load", "p0") {
try_sql("DROP TABLE IF EXISTS ${testTable}")
}
- // case18: import json use pre-filter and where exprs
+ // case19: import json use pre-filter and where exprs
try {
sql "DROP TABLE IF EXISTS ${testTable}"
@@ -512,20 +569,5 @@ suite("test_json_load", "p0") {
} finally {
try_sql("DROP TABLE IF EXISTS ${testTable}")
}
-
- // case19: invalid json
- try {
- sql "DROP TABLE IF EXISTS ${testTable}"
-
- test_invalid_json_array_table.call(testTable)
- load_json_data.call('test_json_load_case19', 'true', '', 'json',
'', '',
- '', '', '', 'invalid_json_array.json', true)
-
- sql "sync"
- qt_select "select * from ${testTable}"
-
- } finally {
- try_sql("DROP TABLE IF EXISTS ${testTable}")
- }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]