pitrou commented on code in PR #14100:
URL: https://github.com/apache/arrow/pull/14100#discussion_r977376001
##########
cpp/src/arrow/json/parser_benchmark.cc:
##########
@@ -30,24 +30,50 @@
namespace arrow {
namespace json {
-std::shared_ptr<Schema> TestSchema() {
- return schema({field("int", int32()), field("str", utf8())});
-}
-
-constexpr int seed = 0x432432;
-
-std::string TestJsonData(int num_rows, bool pretty = false) {
- std::default_random_engine engine(seed);
- std::string json;
- for (int i = 0; i < num_rows; ++i) {
- StringBuffer sb;
- Writer writer(sb);
- ABORT_NOT_OK(Generate(TestSchema(), engine, &writer));
- json += pretty ? PrettyPrint(sb.GetString()) : sb.GetString();
- json += "\n";
+class JSONGenerator {
+ public:
+ constexpr static int kSeed = 0x432432;
+
+ constexpr explicit JSONGenerator(bool pretty = false) : pretty_(pretty) {}
+
+ template <typename T>
+ std::string operator()(const T& input, int32_t num_rows) const {
+ std::default_random_engine engine(kSeed);
+ std::string json;
+ for (int i = 0; i < num_rows; ++i) {
+ StringBuffer sb;
+ Writer writer(sb);
+ ABORT_NOT_OK(Generate(input, engine, &writer));
+ json += pretty_ ? PrettyPrint(sb.GetString()) : sb.GetString();
+ json += "\n";
+ }
+ return json;
}
- return json;
+ private:
+ bool pretty_;
+};
+
+constexpr auto JSONString = JSONGenerator{false};
+constexpr auto JSONStringPretty = JSONGenerator{true};
+
+// Both field sets are the worst-case ordering of each other - i.e. the parser cannot
+// reliably predict the next field in B given the definition of A.
+FieldVector TestFields1() {
+ return {
+ field("int", int32()),
+ field("str", utf8()),
+ };
+}
+FieldVector TestFields2() {
+ return {
+ field("str", utf8()),
+ field("int", int32()),
+ };
+}
Review Comment:
We should indeed try not to modify the characteristics of existing
benchmarks. We can, however, add a different set of benchmarks with a lot more
fields, potentially varying fields per line, etc.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]