fgerlits commented on code in PR #1703: URL: https://github.com/apache/nifi-minifi-cpp/pull/1703#discussion_r1414065111
########## extensions/standard-processors/processors/AttributeRollingWindow.cpp: ########## @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "AttributeRollingWindow.h" +#include <algorithm> +#include <numeric> +#include "fmt/format.h" +#include "core/ProcessContext.h" +#include "core/ProcessSession.h" +#include "core/Resource.h" +#include "utils/expected.h" +#include "utils/OptionalUtils.h" + +namespace org::apache::nifi::minifi::processors { + +void AttributeRollingWindow::onSchedule(core::ProcessContext* context, core::ProcessSessionFactory*) { + gsl_Expects(context); + time_window_ = context->getProperty<core::TimePeriodValue>(TimeWindow) + | utils::transform(&core::TimePeriodValue::getMilliseconds); + window_length_ = context->getProperty<size_t>(WindowLength) + | utils::filter([](size_t value) { return value > 0; }); + if (!time_window_ && !window_length_) { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "Either 'Time window' or 'Window length' must be set"}; + } + attribute_name_prefix_ = (context->getProperty(AttributeNamePrefix) + | utils::orElse([] { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "'Attribute name prefix' must be set"}; 
+ })).value(); + gsl_Ensures(runningInvariant()); +} + +void AttributeRollingWindow::onTrigger(core::ProcessContext* context, core::ProcessSession* session) { + gsl_Expects(context && session && runningInvariant()); + const auto flow_file = session->get(); + if (!flow_file) { yield(); return; } + gsl_Assert(flow_file); + const auto current_value_opt = context->getProperty(ValueToTrack, flow_file); + if (!current_value_opt) { + logger_->log_warn("Missing value to track, flow file uuid: {}", flow_file->getUUIDStr()); + session->transfer(flow_file, Failure); + return; + } + const auto current_value = [&current_value_opt] { + try { + return std::stod(*current_value_opt); + } catch (const std::exception& ex) { + throw minifi::Exception{ExceptionType::PROCESSOR_EXCEPTION, + fmt::format("Failed to convert 'Value to track' of '{}' to double", *current_value_opt)}; + } + }(); + // copy: so we can release the lock sooner + const auto state_copy = [&, now = std::chrono::system_clock::now()] { + const std::lock_guard lg{state_mutex_}; + state_.add(now, current_value); + if (window_length_) { + state_.shrinkToSize(*window_length_); + } else { + gsl_Assert(time_window_); + state_.removeOlderThan(now - *time_window_); + } + return state_.getEntries(); + }(); + const auto sorted_values = [&state_copy] { + auto values = state_copy | ranges::views::transform(&decltype(state_)::Entry::value) | ranges::to<std::vector>; + std::sort(std::begin(values), std::end(values)); + return values; + }(); + calculateAndSetAttributes(*flow_file, sorted_values); + session->transfer(flow_file, Success); +} + +/** + * Calculate statistical properties of the values in the rolling window and set them as attributes on the flow file. 
+ * Properties: count, value (sum), mean (average), median, variance, stddev + */ +void AttributeRollingWindow::calculateAndSetAttributes(core::FlowFile &flow_file, + std::span<const double> sorted_values) const { + const auto attribute_name = [this](std::string_view suffix) { + return utils::string::join_pack(attribute_name_prefix_, suffix); + }; + const auto set_aggregate = [&flow_file, attribute_name](std::string_view name, double value) { + flow_file.setAttribute(attribute_name(name), std::to_string(value)); + }; + set_aggregate("count", sorted_values.size()); + const auto sum = std::accumulate(std::begin(sorted_values), std::end(sorted_values), 0.0); Review Comment: I would use `ranges::accumulate`: ```suggestion const auto sum = ranges::accumulate(sorted_values, 0.0); ``` ########## extensions/standard-processors/processors/AttributeRollingWindow.cpp: ########## @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "AttributeRollingWindow.h" +#include <algorithm> +#include <numeric> +#include "fmt/format.h" +#include "core/ProcessContext.h" +#include "core/ProcessSession.h" +#include "core/Resource.h" +#include "utils/expected.h" +#include "utils/OptionalUtils.h" + +namespace org::apache::nifi::minifi::processors { + +void AttributeRollingWindow::onSchedule(core::ProcessContext* context, core::ProcessSessionFactory*) { + gsl_Expects(context); + time_window_ = context->getProperty<core::TimePeriodValue>(TimeWindow) + | utils::transform(&core::TimePeriodValue::getMilliseconds); + window_length_ = context->getProperty<size_t>(WindowLength) + | utils::filter([](size_t value) { return value > 0; }); + if (!time_window_ && !window_length_) { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "Either 'Time window' or 'Window length' must be set"}; + } + attribute_name_prefix_ = (context->getProperty(AttributeNamePrefix) + | utils::orElse([] { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "'Attribute name prefix' must be set"}; + })).value(); + gsl_Ensures(runningInvariant()); +} + +void AttributeRollingWindow::onTrigger(core::ProcessContext* context, core::ProcessSession* session) { + gsl_Expects(context && session && runningInvariant()); + const auto flow_file = session->get(); + if (!flow_file) { yield(); return; } + gsl_Assert(flow_file); + const auto current_value_opt = context->getProperty(ValueToTrack, flow_file); + if (!current_value_opt) { + logger_->log_warn("Missing value to track, flow file uuid: {}", flow_file->getUUIDStr()); + session->transfer(flow_file, Failure); + return; + } + const auto current_value = [&current_value_opt] { + try { + return std::stod(*current_value_opt); + } catch (const std::exception& ex) { + throw minifi::Exception{ExceptionType::PROCESSOR_EXCEPTION, + fmt::format("Failed to convert 'Value to track' of '{}' to double", *current_value_opt)}; + } + }(); + // copy: so we can release the lock sooner 
+ const auto state_copy = [&, now = std::chrono::system_clock::now()] { + const std::lock_guard lg{state_mutex_}; + state_.add(now, current_value); + if (window_length_) { + state_.shrinkToSize(*window_length_); + } else { + gsl_Assert(time_window_); + state_.removeOlderThan(now - *time_window_); + } + return state_.getEntries(); + }(); + const auto sorted_values = [&state_copy] { + auto values = state_copy | ranges::views::transform(&decltype(state_)::Entry::value) | ranges::to<std::vector>; + std::sort(std::begin(values), std::end(values)); + return values; + }(); + calculateAndSetAttributes(*flow_file, sorted_values); + session->transfer(flow_file, Success); +} + +/** + * Calculate statistical properties of the values in the rolling window and set them as attributes on the flow file. + * Properties: count, value (sum), mean (average), median, variance, stddev + */ +void AttributeRollingWindow::calculateAndSetAttributes(core::FlowFile &flow_file, + std::span<const double> sorted_values) const { + const auto attribute_name = [this](std::string_view suffix) { + return utils::string::join_pack(attribute_name_prefix_, suffix); + }; + const auto set_aggregate = [&flow_file, attribute_name](std::string_view name, double value) { + flow_file.setAttribute(attribute_name(name), std::to_string(value)); + }; + set_aggregate("count", sorted_values.size()); + const auto sum = std::accumulate(std::begin(sorted_values), std::end(sorted_values), 0.0); + set_aggregate("value", sum); + const auto mean = sum / gsl::narrow_cast<double>(sorted_values.size()); + set_aggregate("mean", mean); + set_aggregate("median", [&] { + const auto mid = sorted_values.size() / 2; + return sorted_values.size() % 2 == 0 + ? 
std::midpoint(sorted_values[mid], sorted_values[mid - 1]) // even number of values: average the two middle values + : sorted_values[mid]; // odd number of values: take the middle value + }()); + // https://math.stackexchange.com/questions/1720876/sums-of-squares-minus-square-of-sums + const auto avg_of_squares = std::accumulate(std::begin(sorted_values), std::end(sorted_values), 0.0, [&](double acc, double value) { + return acc + std::pow(value, 2) / gsl::narrow_cast<double>(sorted_values.size()); + }); + const auto variance = avg_of_squares - std::pow(mean, 2); Review Comment: It is better to do all the additions first and divide by `n` once, at the end: ```c++ const auto n = gsl::narrow_cast<double>(sorted_values.size()); const auto mean = sum / n; [...] // https://math.stackexchange.com/questions/1720876/sums-of-squares-minus-square-of-sums static constexpr auto square = [](auto x) { return x * x; }; const auto sum_of_squares = ranges::accumulate(sorted_values, 0.0, {}, square); const auto variance = sum_of_squares / n - square(mean); ``` ########## extensions/standard-processors/RollingWindow.h: ########## @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include <mutex> +#include <queue> +#include <vector> + +namespace org::apache::nifi::minifi::processors::standard::utils { + +namespace detail { +template<typename T, typename Container, typename Comparator> +struct priority_queue : std::priority_queue<T, Container, Comparator> { + using std::priority_queue<T, Container, Comparator>::priority_queue; + + // Expose the underlying container + const Container& get_container() const & { return this->c; } + Container get_container() && { return std::move(this->c); } +}; +} // namespace detail + +template<typename Timestamp, typename Value> +class RollingWindow { + public: + struct Entry { + Timestamp timestamp{}; + Value value{}; + }; + struct EntryComparator { + // greater-than, because std::priority_queue order is reversed. This way, top() is the oldest entry. + bool operator()(const Entry& lhs, const Entry& rhs) const { + return lhs.timestamp > rhs.timestamp; + } + }; + + void removeOlderThan(Timestamp timestamp) { + while (!state_.empty() && state_.top().timestamp < timestamp) { + state_.pop(); + } + } + + /** Remove the oldest entries until the size is <= size. */ + void shrinkToSize(size_t size) { + while (state_.size() > size && !state_.empty()) { Review Comment: if `state_.size() > size (>= 0)`, then `state_` can't be empty ```suggestion while (state_.size() > size) { ``` ########## extensions/standard-processors/processors/AttributeRollingWindow.cpp: ########## @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "AttributeRollingWindow.h" +#include <algorithm> +#include <numeric> +#include "fmt/format.h" +#include "core/ProcessContext.h" +#include "core/ProcessSession.h" +#include "core/Resource.h" +#include "utils/expected.h" +#include "utils/OptionalUtils.h" + +namespace org::apache::nifi::minifi::processors { + +void AttributeRollingWindow::onSchedule(core::ProcessContext* context, core::ProcessSessionFactory*) { + gsl_Expects(context); + time_window_ = context->getProperty<core::TimePeriodValue>(TimeWindow) + | utils::transform(&core::TimePeriodValue::getMilliseconds); + window_length_ = context->getProperty<uint64_t>(WindowLength) + | utils::filter([](uint64_t value) { return value > 0; }) + | utils::transform([](uint64_t value) { return size_t{value}; }); + if (!time_window_ && !window_length_) { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "Either 'Time window' or 'Window length' must be set"}; + } + attribute_name_prefix_ = (context->getProperty(AttributeNamePrefix) + | utils::orElse([] { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "'Attribute name prefix' must be set"}; + })).value(); + gsl_Ensures(runningInvariant()); +} + +void AttributeRollingWindow::onTrigger(core::ProcessContext* context, core::ProcessSession* session) { + gsl_Expects(context && session && runningInvariant()); + const auto flow_file = session->get(); + if (!flow_file) { yield(); return; } + gsl_Assert(flow_file); + const auto current_value_opt = context->getProperty(ValueToTrack, flow_file); + if (!current_value_opt) 
{ + logger_->log_warn("Missing value to track, flow file uuid: {}", flow_file->getUUIDStr()); + session->transfer(flow_file, Failure); + return; + } + const auto current_value = [&current_value_opt] { + try { + return std::stod(*current_value_opt); + } catch (const std::exception& ex) { + throw minifi::Exception{ExceptionType::PROCESSOR_EXCEPTION, + fmt::format("Failed to convert 'Value to track' of '{}' to double", *current_value_opt)}; + } + }(); Review Comment: This will roll back and retry the flow file until it expires, by default forever. (It is possible that the `Value to track` is time-dependent and will fix itself, but I expect that will be the rarer case.) I think it would be better to log a warning and transfer the flow file to `Failure` as in the case of a missing `Value to track`. ########## PROCESSORS.md: ########## @@ -147,6 +148,44 @@ In the list below, the names of required properties appear in bold. Any other pr | success | success operational on the flow record | +## AttributeRollingWindow + +### Description + +Track a Rolling Window based on evaluating an Expression Language expression on each FlowFile. Each FlowFile will be emitted with the count of FlowFiles and total aggregate valueof values processed in the current window. Review Comment: tiny typo: ```suggestion Track a Rolling Window based on evaluating an Expression Language expression on each FlowFile. Each FlowFile will be emitted with the count of FlowFiles and total aggregate value of values processed in the current window. ``` ########## extensions/standard-processors/processors/AttributeRollingWindow.cpp: ########## @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "AttributeRollingWindow.h" +#include <algorithm> +#include <numeric> +#include "fmt/format.h" +#include "core/ProcessContext.h" +#include "core/ProcessSession.h" +#include "core/Resource.h" +#include "utils/expected.h" +#include "utils/OptionalUtils.h" + +namespace org::apache::nifi::minifi::processors { + +void AttributeRollingWindow::onSchedule(core::ProcessContext* context, core::ProcessSessionFactory*) { + gsl_Expects(context); + time_window_ = context->getProperty<core::TimePeriodValue>(TimeWindow) + | utils::transform(&core::TimePeriodValue::getMilliseconds); + window_length_ = context->getProperty<uint64_t>(WindowLength) + | utils::filter([](uint64_t value) { return value > 0; }) + | utils::transform([](uint64_t value) { return size_t{value}; }); + if (!time_window_ && !window_length_) { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "Either 'Time window' or 'Window length' must be set"}; + } + attribute_name_prefix_ = (context->getProperty(AttributeNamePrefix) + | utils::orElse([] { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "'Attribute name prefix' must be set"}; + })).value(); + gsl_Ensures(runningInvariant()); +} + +void AttributeRollingWindow::onTrigger(core::ProcessContext* context, core::ProcessSession* session) { + gsl_Expects(context && session && runningInvariant()); + const auto flow_file = session->get(); + 
if (!flow_file) { yield(); return; } + gsl_Assert(flow_file); + const auto current_value_opt = context->getProperty(ValueToTrack, flow_file); + if (!current_value_opt) { + logger_->log_warn("Missing value to track, flow file uuid: {}", flow_file->getUUIDStr()); + session->transfer(flow_file, Failure); + return; + } + const auto current_value = [&current_value_opt] { + try { + return std::stod(*current_value_opt); + } catch (const std::exception& ex) { + throw minifi::Exception{ExceptionType::PROCESSOR_EXCEPTION, + fmt::format("Failed to convert 'Value to track' of '{}' to double", *current_value_opt)}; + } + }(); + // copy: so we can release the lock sooner + const auto state_copy = [&, now = std::chrono::system_clock::now()] { + const std::lock_guard lg{state_mutex_}; + state_.add(now, current_value); + if (window_length_) { + state_.shrinkToSize(*window_length_); + } else { + gsl_Assert(time_window_); + state_.removeOlderThan(now - *time_window_); + } + return state_.getEntries(); + }(); + const auto sorted_values = [&state_copy] { + auto values = state_copy | ranges::views::transform(&decltype(state_)::Entry::value) | ranges::to<std::vector>; + std::sort(std::begin(values), std::end(values)); + return values; + }(); + calculateAndSetAttributes(*flow_file, sorted_values); + session->transfer(flow_file, Success); +} + +/** + * Calculate statistical properties of the values in the rolling window and set them as attributes on the flow file. + * Properties: count, value (sum), mean (average), median, variance, stddev Review Comment: I would either remove the second line of the comment, or add `min`, `max`. ########## extensions/standard-processors/tests/unit/AttributeRollingWindowTests.cpp: ########## @@ -0,0 +1,101 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <memory> +#include <string_view> +#include "Catch.h" +#include "AttributeRollingWindow.h" +#include "SingleProcessorTestController.h" + +namespace org::apache::nifi::minifi::test { +using AttributeRollingWindow = processors::AttributeRollingWindow; + +bool checkAttributes(const std::map<std::string, std::string>& expected, const std::map<std::string, std::string>& actual) { + // expected may be incomplete, but if something is specified in expected, they also need to be in the actual + // set of attributes + return std::all_of(std::begin(expected), std::end(expected), [&actual](const auto& kvpair) { + const auto& key = kvpair.first; + const auto& value = kvpair.second; + return actual.at(key) == value; Review Comment: this: ```suggestion return actual.contains(key) && actual.at(key) == value; ``` would return false instead of throwing if a key is missing, which I think would be better ########## extensions/standard-processors/tests/unit/RollingWindowTests.cpp: ########## @@ -0,0 +1,108 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <string_view> +#include "Catch.h" +#include "RollingWindow.h" +#include "range/v3/view/zip.hpp" +#include "range/v3/algorithm/contains.hpp" + +namespace org::apache::nifi::minifi::test { + +using timestamp_type = int; +using value_type = const char*; +using RollingWindow = processors::standard::utils::RollingWindow<timestamp_type, value_type>; +using TimestampComparator = decltype([](const RollingWindow::Entry& lhs, const RollingWindow::Entry& rhs) { + return lhs.timestamp < rhs.timestamp; +}); + +bool compareEntriesTimestamps(std::vector<RollingWindow::Entry> entries, + std::span<const timestamp_type> expected_timestamps) { + std::sort(std::begin(entries), std::end(entries), TimestampComparator{}); Review Comment: `ranges` can do this, too: ```suggestion entries |= ranges::actions::sort(TimestampComparator{}); ``` ########## extensions/standard-processors/processors/AttributeRollingWindow.h: ########## @@ -0,0 +1,117 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include <chrono> +#include <memory> +#include <mutex> +#include <optional> +#include <queue> +#include <string_view> + +#include "core/AbstractProcessor.h" +#include "core/Annotation.h" +#include "core/logging/LoggerFactory.h" +#include "core/PropertyDefinitionBuilder.h" +#include "core/PropertyType.h" +#include "core/RelationshipDefinition.h" +#include "RollingWindow.h" +#include "StateManager.h" + +namespace org::apache::nifi::minifi::processors { + +class AttributeRollingWindow final : public core::AbstractProcessor<AttributeRollingWindow> { + public: + using core::AbstractProcessor<AttributeRollingWindow>::AbstractProcessor; + + EXTENSIONAPI static constexpr auto Description = "Track a Rolling Window based on evaluating an Expression Language " + "expression on each FlowFile. Each FlowFile will be emitted with the count of FlowFiles and total aggregate value" + "of values processed in the current window."; + + EXTENSIONAPI static constexpr auto ValueToTrack = core::PropertyDefinitionBuilder<>::createProperty("Value to track") + .withDescription("The expression on which to evaluate each FlowFile. The result of the expression will be added " + "to the rolling window value.") + .isRequired(true) + .supportsExpressionLanguage(true) + .build(); + EXTENSIONAPI static constexpr auto TimeWindow = core::PropertyDefinitionBuilder<>::createProperty("Time window") + .withDescription("The amount of time for a rolling window. The format of the value is expected to be a " + "count followed by a time unit. 
For example 5 millis, 10 secs, 1 min, 3 hours, 2 days, etc.") + .withPropertyType(core::StandardPropertyTypes::TIME_PERIOD_TYPE) + .build(); + EXTENSIONAPI static constexpr auto WindowLength = core::PropertyDefinitionBuilder<>::createProperty("Window length") + .withDescription("The window length in number of values. Takes precedence over 'Time window'. If set to zero, " + "the 'Time window' property is used instead.") + .isRequired(true) + .withDefaultValue("0") + .withPropertyType(core::StandardPropertyTypes::UNSIGNED_INT_TYPE) + .build(); + EXTENSIONAPI static constexpr auto AttributeNamePrefix = core::PropertyDefinitionBuilder<>::createProperty("Attribute name prefix") + .withDescription("The prefix to add to the generated attribute names. For example, if this is set to 'rolling.window.', " + "then the full attribute names will be 'rolling.window.value', 'rolling.window.count', etc.") + .isRequired(true) + .withDefaultValue("rolling.window.") + .build(); + EXTENSIONAPI static constexpr auto Properties = std::array<core::PropertyReference, 4>{ + ValueToTrack, + TimeWindow, + WindowLength, + AttributeNamePrefix + }; + + EXTENSIONAPI static constexpr auto Success = core::RelationshipDefinition{"success", "All FlowFiles that are " + "successfully processed are routed to this relationship."}; + EXTENSIONAPI static constexpr auto Failure = core::RelationshipDefinition{"failure", "When a FlowFile fails, " + "it is routed here."}; + EXTENSIONAPI static constexpr auto Relationships = std::array{Success, Failure}; + + EXTENSIONAPI static constexpr auto Count = core::OutputAttributeDefinition<1>{"<prefix>count", {Success}, "Number of the values in the rolling window"}; + EXTENSIONAPI static constexpr auto Value = core::OutputAttributeDefinition<1>{"<prefix>value", {Success}, "Sum of the values in the rolling window"}; + EXTENSIONAPI static constexpr auto Mean = core::OutputAttributeDefinition<1>{"<prefix>mean", {Success}, "Mean of the values in the rolling window"}; + 
EXTENSIONAPI static constexpr auto Median = core::OutputAttributeDefinition<1>{"<prefix>median", {Success}, "Median of the values in the rolling window"}; + EXTENSIONAPI static constexpr auto Variance = core::OutputAttributeDefinition<1>{"<prefix>variance", {Success}, "Variance of the values in the rolling window"}; + EXTENSIONAPI static constexpr auto Stddev = core::OutputAttributeDefinition<1>{"<prefix>stddev", {Success}, "Standard deviation of the values in the rolling window"}; + EXTENSIONAPI static constexpr auto Min = core::OutputAttributeDefinition<1>{"<prefix>min", {Success}, "Smallest value in the rolling window"}; + EXTENSIONAPI static constexpr auto Max = core::OutputAttributeDefinition<1>{"<prefix>max", {Success}, "Largest value in the rolling window"}; Review Comment: `NumRelationships` defaults to `1`, so we could write `core::OutputAttributeDefinition<>{...}` ########## extensions/standard-processors/processors/AttributeRollingWindow.cpp: ########## @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "AttributeRollingWindow.h" +#include <algorithm> +#include <numeric> +#include "fmt/format.h" +#include "core/ProcessContext.h" +#include "core/ProcessSession.h" +#include "core/Resource.h" +#include "utils/expected.h" +#include "utils/OptionalUtils.h" + +namespace org::apache::nifi::minifi::processors { + +void AttributeRollingWindow::onSchedule(core::ProcessContext* context, core::ProcessSessionFactory*) { + gsl_Expects(context); + time_window_ = context->getProperty<core::TimePeriodValue>(TimeWindow) + | utils::transform(&core::TimePeriodValue::getMilliseconds); + window_length_ = context->getProperty<uint64_t>(WindowLength) + | utils::filter([](uint64_t value) { return value > 0; }) + | utils::transform([](uint64_t value) { return size_t{value}; }); Review Comment: If `size_t` is smaller than 64 bits and `value` doesn't fit, do we want to at least log something? ########## extensions/standard-processors/processors/AttributeRollingWindow.cpp: ########## @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "AttributeRollingWindow.h" +#include <algorithm> +#include <numeric> +#include "fmt/format.h" +#include "core/ProcessContext.h" +#include "core/ProcessSession.h" +#include "core/Resource.h" +#include "utils/expected.h" +#include "utils/OptionalUtils.h" + +namespace org::apache::nifi::minifi::processors { + +void AttributeRollingWindow::onSchedule(core::ProcessContext* context, core::ProcessSessionFactory*) { + gsl_Expects(context); + time_window_ = context->getProperty<core::TimePeriodValue>(TimeWindow) + | utils::transform(&core::TimePeriodValue::getMilliseconds); + window_length_ = context->getProperty<uint64_t>(WindowLength) + | utils::filter([](uint64_t value) { return value > 0; }) + | utils::transform([](uint64_t value) { return size_t{value}; }); + if (!time_window_ && !window_length_) { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "Either 'Time window' or 'Window length' must be set"}; + } + attribute_name_prefix_ = (context->getProperty(AttributeNamePrefix) + | utils::orElse([] { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "'Attribute name prefix' must be set"}; + })).value(); Review Comment: this exception can't really happen, because the property has a default value ########## extensions/standard-processors/processors/AttributeRollingWindow.cpp: ########## @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "AttributeRollingWindow.h" +#include <algorithm> +#include <numeric> +#include "fmt/format.h" +#include "core/ProcessContext.h" +#include "core/ProcessSession.h" +#include "core/Resource.h" +#include "utils/expected.h" +#include "utils/OptionalUtils.h" + +namespace org::apache::nifi::minifi::processors { + +void AttributeRollingWindow::onSchedule(core::ProcessContext* context, core::ProcessSessionFactory*) { + gsl_Expects(context); + time_window_ = context->getProperty<core::TimePeriodValue>(TimeWindow) + | utils::transform(&core::TimePeriodValue::getMilliseconds); + window_length_ = context->getProperty<uint64_t>(WindowLength) + | utils::filter([](uint64_t value) { return value > 0; }) + | utils::transform([](uint64_t value) { return size_t{value}; }); + if (!time_window_ && !window_length_) { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "Either 'Time window' or 'Window length' must be set"}; + } + attribute_name_prefix_ = (context->getProperty(AttributeNamePrefix) + | utils::orElse([] { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "'Attribute name prefix' must be set"}; + })).value(); + gsl_Ensures(runningInvariant()); +} + +void AttributeRollingWindow::onTrigger(core::ProcessContext* context, core::ProcessSession* session) { + gsl_Expects(context && session && runningInvariant()); + const auto flow_file = session->get(); + if (!flow_file) { yield(); return; } + gsl_Assert(flow_file); + const auto current_value_opt = context->getProperty(ValueToTrack, flow_file); + if (!current_value_opt) 
{ + logger_->log_warn("Missing value to track, flow file uuid: {}", flow_file->getUUIDStr()); + session->transfer(flow_file, Failure); + return; + } + const auto current_value = [&current_value_opt] { + try { + return std::stod(*current_value_opt); + } catch (const std::exception& ex) { + throw minifi::Exception{ExceptionType::PROCESSOR_EXCEPTION, + fmt::format("Failed to convert 'Value to track' of '{}' to double", *current_value_opt)}; + } + }(); + // copy: so we can release the lock sooner + const auto state_copy = [&, now = std::chrono::system_clock::now()] { + const std::lock_guard lg{state_mutex_}; + state_.add(now, current_value); + if (window_length_) { + state_.shrinkToSize(*window_length_); + } else { + gsl_Assert(time_window_); + state_.removeOlderThan(now - *time_window_); + } + return state_.getEntries(); + }(); + const auto sorted_values = [&state_copy] { + auto values = state_copy | ranges::views::transform(&decltype(state_)::Entry::value) | ranges::to<std::vector>; + std::sort(std::begin(values), std::end(values)); + return values; + }(); + calculateAndSetAttributes(*flow_file, sorted_values); + session->transfer(flow_file, Success); +} + +/** + * Calculate statistical properties of the values in the rolling window and set them as attributes on the flow file. 
+ * Properties: count, value (sum), mean (average), median, variance, stddev + */ +void AttributeRollingWindow::calculateAndSetAttributes(core::FlowFile &flow_file, + std::span<const double> sorted_values) const { + const auto attribute_name = [this](std::string_view suffix) { + return utils::string::join_pack(attribute_name_prefix_, suffix); + }; + const auto set_aggregate = [&flow_file, attribute_name](std::string_view name, double value) { + flow_file.setAttribute(attribute_name(name), std::to_string(value)); + }; + set_aggregate("count", sorted_values.size()); + const auto sum = std::accumulate(std::begin(sorted_values), std::end(sorted_values), 0.0); + set_aggregate("value", sum); + const auto mean = sum / gsl::narrow_cast<double>(sorted_values.size()); + set_aggregate("mean", mean); + set_aggregate("median", [&] { + const auto mid = sorted_values.size() / 2; + return sorted_values.size() % 2 == 0 + ? std::midpoint(sorted_values[mid], sorted_values[mid - 1]) // even number of values: average the two middle values Review Comment: If `sorted_values` is empty, we will crash here (if we're lucky). This should never happen, but I would add a `gsl_Expects` to the start of this function just to be safe. ########## extensions/standard-processors/processors/AttributeRollingWindow.cpp: ########## @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "AttributeRollingWindow.h" +#include <algorithm> +#include <numeric> +#include "fmt/format.h" +#include "core/ProcessContext.h" +#include "core/ProcessSession.h" +#include "core/Resource.h" +#include "utils/expected.h" +#include "utils/OptionalUtils.h" + +namespace org::apache::nifi::minifi::processors { + +void AttributeRollingWindow::onSchedule(core::ProcessContext* context, core::ProcessSessionFactory*) { + gsl_Expects(context); + time_window_ = context->getProperty<core::TimePeriodValue>(TimeWindow) + | utils::transform(&core::TimePeriodValue::getMilliseconds); + window_length_ = context->getProperty<uint64_t>(WindowLength) + | utils::filter([](uint64_t value) { return value > 0; }) + | utils::transform([](uint64_t value) { return size_t{value}; }); + if (!time_window_ && !window_length_) { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "Either 'Time window' or 'Window length' must be set"}; + } + attribute_name_prefix_ = (context->getProperty(AttributeNamePrefix) + | utils::orElse([] { + throw minifi::Exception{ExceptionType::PROCESS_SCHEDULE_EXCEPTION, "'Attribute name prefix' must be set"}; + })).value(); + gsl_Ensures(runningInvariant()); +} + +void AttributeRollingWindow::onTrigger(core::ProcessContext* context, core::ProcessSession* session) { + gsl_Expects(context && session && runningInvariant()); + const auto flow_file = session->get(); + if (!flow_file) { yield(); return; } + gsl_Assert(flow_file); + const auto current_value_opt = context->getProperty(ValueToTrack, flow_file); + if (!current_value_opt) 
{ + logger_->log_warn("Missing value to track, flow file uuid: {}", flow_file->getUUIDStr()); + session->transfer(flow_file, Failure); + return; + } + const auto current_value = [&current_value_opt] { + try { + return std::stod(*current_value_opt); + } catch (const std::exception& ex) { + throw minifi::Exception{ExceptionType::PROCESSOR_EXCEPTION, + fmt::format("Failed to convert 'Value to track' of '{}' to double", *current_value_opt)}; + } + }(); + // copy: so we can release the lock sooner + const auto state_copy = [&, now = std::chrono::system_clock::now()] { + const std::lock_guard lg{state_mutex_}; + state_.add(now, current_value); + if (window_length_) { + state_.shrinkToSize(*window_length_); + } else { + gsl_Assert(time_window_); + state_.removeOlderThan(now - *time_window_); + } + return state_.getEntries(); + }(); + const auto sorted_values = [&state_copy] { + auto values = state_copy | ranges::views::transform(&decltype(state_)::Entry::value) | ranges::to<std::vector>; + std::sort(std::begin(values), std::end(values)); Review Comment: ```suggestion values |= ranges::actions::sort; ``` ########## libminifi/include/core/AbstractProcessor.h: ########## @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include <string> +#include <string_view> +#include <type_traits> +#include "range/v3/view/transform.hpp" +#include "core/Annotation.h" +#include "core/Core.h" +#include "core/Processor.h" +#include "core/PropertyDefinition.h" +#include "core/RelationshipDefinition.h" + +namespace org::apache::nifi::minifi::core { +template<typename ProcessorT> +class AbstractProcessor : public Processor { + public: + using Processor::Processor; + + void initialize() final { + static_assert(std::is_same_v<typename decltype(ProcessorT::Properties)::value_type, PropertyReference>); + static_assert(std::is_same_v<typename decltype(ProcessorT::Relationships)::value_type, RelationshipDefinition>); + setSupportedProperties(ProcessorT::Properties); + setSupportedRelationships(ProcessorT::Relationships); + } + + void onSchedule(core::ProcessContext*, core::ProcessSessionFactory*) override = 0; + void onTrigger(core::ProcessContext*, core::ProcessSession*) override = 0; + + bool supportsDynamicProperties() const noexcept final { return ProcessorT::SupportsDynamicProperties; } + bool supportsDynamicRelationships() const noexcept final { return ProcessorT::SupportsDynamicRelationships; } + minifi::core::annotation::Input getInputRequirement() const noexcept final { return ProcessorT::InputRequirement; } + bool isSingleThreaded() const noexcept final { return ProcessorT::IsSingleThreaded; } + std::string getProcessorType() const final { + constexpr auto class_name = className<ProcessorT>(); + constexpr auto last_colon_index = class_name.find_last_of(':'); + constexpr auto end = class_name.substr(last_colon_index + 1); + if constexpr (last_colon_index == std::string_view::npos) { + return std::string{class_name}; + } Review Comment: should we switch lines 52 and 53-55? 
`.substr(npos + 1)` looks like it may not compile on some compilers ########## libminifi/src/core/extension/ExtensionManager.cpp: ########## @@ -72,9 +72,13 @@ bool ExtensionManager::initialize(const std::shared_ptr<Configure>& config) { })); for (const auto& candidate : candidates) { auto library = internal::asDynamicLibrary(candidate); - if (!library || !library->verify(logger_)) { + if (!library) { continue; } + if (!library->verify(logger_)) { + logger_->log_warn("Skipping library '{}' at '{}': failed verification, different build?", + library->name, library->getFullPath()); + } Review Comment: we log that we are skipping it, but we aren't: is a `continue` missing? ########## PROCESSORS.md: ########## @@ -147,6 +148,44 @@ In the list below, the names of required properties appear in bold. Any other pr | success | success operational on the flow record | +## AttributeRollingWindow + +### Description + +Track a Rolling Window based on evaluating an Expression Language expression on each FlowFile. Each FlowFile will be emitted with the count of FlowFiles and total aggregate value of values processed in the current window. + +### Properties + +In the list below, the names of required properties appear in bold. Any other properties (not in bold) are considered optional. The table also indicates any default values, and whether a property supports the NiFi Expression Language. + +| Name | Default Value | Allowable Values | Description | +|---------------------------|-----------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Value to track** | | | The expression on which to evaluate each FlowFile. The result of the expression will be added to the rolling window value.<br/>**Supports Expression Language: true** | +| Time window | | | The amount of time for a rolling window. 
The format of the value is expected to be a count followed by a time unit. For example 5 millis, 10 secs, 1 min, 3 hours, 2 days, etc. | +| **Window length** | 0 | | The window length in number of values. Takes precedence over 'Time window'. If set to zero, the 'Time window' property is used instead. | +| **Attribute name prefix** | rolling.window. | | The prefix to add to the generated attribute names. For example, if this is set to 'rolling.window.', then the full attribute names will be 'rolling.window.value', 'rolling.window.count', etc. | + +### Relationships + +| Name | Description | +|---------|--------------------------------------------------------------------------------| +| success | All FlowFiles that are successfully processed are routed to this relationship. | +| failure | When a FlowFile fails, it is routed here. | + +### Output Attributes + +| Attribute | Relationship | Description | +|------------------|--------------|--------------------------------------------------------| +| <prefix>count | success | Number of the values in the rolling window | +| <prefix>value | success | Sum of the values in the rolling window | Review Comment: I'm not a fan of using `value` instead of `sum`, but I guess we want to be compatible with NiFi :sigh_emoji: -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@nifi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org