OliverKeyes has submitted this change and it was merged. Change subject: Adjust dwelltime calculation ......................................................................
Adjust dwelltime calculation - Makes dwell_time_ output max intertime - Brings dwelltime > threshold checking out into dwell_time - Adds sample event log data and unit tests Bug: T128929 Change-Id: I60ad61077b1db34f668973180285937fbe7d300d --- M .Rbuildignore M DESCRIPTION M NAMESPACE M R/RcppExports.R M R/dwell.R M R/utilities.R A inst/extdata/sample_events.csv M man/dwell_time.Rd M man/ortiz.Rd M src/RcppExports.cpp M src/dwell.cpp M src/dwell.h M src/ortiz.cpp A tests/testthat.R A tests/testthat/test-dwelltime.R 15 files changed, 98 insertions(+), 53 deletions(-) Approvals: OliverKeyes: Verified; Looks good to me, approved diff --git a/.Rbuildignore b/.Rbuildignore index fdb7e2b..d17ba1a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,4 @@ ^README.Rmd$ ^.travis.yml$ ^appveyor.yml$ +^.*\.gitreview$ diff --git a/DESCRIPTION b/DESCRIPTION index dd4319b..0f7ace9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,17 @@ Package: ortiz Title: Speedy User Satisfaction Computations -Version: 0.0.1 +Version: 0.0.2 Author: Oliver Keyes -Maintainer: Oliver Keyes <[email protected]> -Description: Speedily computes various user satisfaction-related metrics for Search. +Maintainer: Mikhail Popov <[email protected]> +Description: Speedily computes various user satisfaction-related metrics for + Search. License: MIT + file LICENSE LazyData: true -URL: https://gerrit.wikimedia.org/r/#/admin/projects/wikimedia/discovery/ortiz -BugReports: https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery +URL: https://git.wikimedia.org/summary/?r=wikimedia/discovery/ortiz.git +BugReports: https://phabricator.wikimedia.org/maniphest/task/create/? + projects=Discovery LinkingTo: Rcpp -Imports: Rcpp +Imports: + Rcpp +RoxygenNote: 5.0.1 +Suggests: testthat diff --git a/NAMESPACE b/NAMESPACE index 165486d..3bd0ff1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.1): do not edit by hand +# Generated by roxygen2: do not edit by hand export(dwell_time) importFrom(Rcpp,sourceCpp) diff --git a/R/RcppExports.R b/R/RcppExports.R index d6e42b5..9263b78 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,7 +1,7 @@ # This file was generated by Rcpp::compileAttributes # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -dwell_time_ <- function(timestamps, dwell_value) { - .Call('ortiz_dwell_time_', PACKAGE = 'ortiz', timestamps, dwell_value) +dwell_time_ <- function(timestamps) { + .Call('ortiz_dwell_time_', PACKAGE = 'ortiz', timestamps) } diff --git a/R/dwell.R b/R/dwell.R index bc663d2..ff2ff79 100644 --- a/R/dwell.R +++ b/R/dwell.R @@ -8,23 +8,23 @@ #' #'@param data a data.frame containing session data #' -#'@param ids the name or indices of the column containing unique session IDs +#'@param id_col the name or index of the column containing unique session IDs #' -#'@param timestamps the name or indices of the column containing timestamps. +#'@param ts_col the name or index of the column containing timestamps #' #'@param dwell_threshold the value (in seconds) to use to indicate a "successful" #'session. #' #'@export -dwell_time <- function(data, ids, timestamps, dwell_threshold = 100){ +dwell_time <- function(data, id_col, ts_col, dwell_threshold = 100) { # Check type. We need timestamps to end up as numeric seconds representations, # which we can trivially convert to if they're POSIX (or if they're already) # formatted) but can't handle consistently otherwise. - data <- numeric_check(data, timestamps) + data <- numeric_check(data, ts_col) # Split the data up per unique session/user ID. - split_data <- split(x = data[,timestamps], f = data[,ids]) + split_data <- split(x = data[, ts_col], f = data[, id_col]) - return(dwell_time_(split_data, dwell_threshold)) + return(dwell_time_(split_data) > dwell_threshold) } diff --git a/R/utilities.R b/R/utilities.R index 8de4951..8fa01d1 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -1,9 +1,8 @@ -numeric_check <- function(data, timestamps){ - if(any(c("POSIXlt","POSIXt") %in% c(class(data[,timestamps])))){ - data[,timestamps] <- as.numeric(data[,timestamps]) - } else if(!any(c("numeric","integer") %in% class(data[,timestamps]))){ - stop("The timestamps column must be either a numeric value representing seconds, - or POSIX timestamps") +numeric_check <- function(data, timestamps) { + if (any(c("POSIXt", "POSIXlt", "POSIXct") %in% unlist(lapply(data[, timestamps], class)))) { + data[, timestamps] <- as.numeric(data[, timestamps]) + } else if (!any(c("numeric", "integer") %in% class(data[, timestamps]))) { + stop("The timestamps column must be either a numeric value representing seconds or POSIX timestamps") } return(data) } diff --git a/inst/extdata/sample_events.csv b/inst/extdata/sample_events.csv new file mode 100644 index 0000000..f85d6ae --- /dev/null +++ b/inst/extdata/sample_events.csv @@ -0,0 +1,19 @@ +timestamp,session_id +2016-03-01T06:52:49Z,0024c4506bf92e1c +2016-03-01T06:52:59Z,0024c4506bf92e1c +2016-03-01T06:53:09Z,0024c4506bf92e1c +2016-03-01T06:53:19Z,0024c4506bf92e1c +2016-03-01T06:53:29Z,0024c4506bf92e1c +2016-03-01T06:53:39Z,0024c4506bf92e1c +2016-03-01T06:53:50Z,0024c4506bf92e1c +2016-03-01T06:54:21Z,0024c4506bf92e1c +2016-03-01T15:07:25Z,fffb978cc690214c +2016-03-01T15:07:35Z,fffb978cc690214c +2016-03-01T15:07:45Z,fffb978cc690214c +2016-03-01T15:07:55Z,fffb978cc690214c +2016-03-01T15:08:05Z,fffb978cc690214c +2016-03-01T15:08:15Z,fffb978cc690214c +2016-03-01T15:08:25Z,fffb978cc690214c +2016-03-01T15:08:55Z,fffb978cc690214c +2016-03-01T15:09:25Z,fffb978cc690214c +2016-03-01T15:09:55Z,fffb978cc690214c diff --git a/man/dwell_time.Rd b/man/dwell_time.Rd index 979eecc..e5acc1c 100644 --- a/man/dwell_time.Rd +++ b/man/dwell_time.Rd @@ -1,17 +1,17 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dwell.R \name{dwell_time} \alias{dwell_time} \title{Identify User Satisfaction Rate Based on Dwell Time} \usage{ -dwell_time(data, ids, timestamps, dwell_threshold = 100) +dwell_time(data, id_col, ts_col, dwell_threshold = 100) } \arguments{ \item{data}{a data.frame containing session data} -\item{ids}{the name or indices of the column containing unique session IDs} +\item{id_col}{the name or index of the column containing unique session IDs} -\item{timestamps}{the name or indices of the column containing timestamps.} +\item{ts_col}{the name or index of the column containing timestamps} \item{dwell_threshold}{the value (in seconds) to use to indicate a "successful" session.} diff --git a/man/ortiz.Rd b/man/ortiz.Rd index 87868cf..af75927 100644 --- a/man/ortiz.Rd +++ b/man/ortiz.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/ortiz.R \docType{package} \name{ortiz} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index f349fed..dda9989 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -6,14 +6,13 @@ using namespace Rcpp; // dwell_time_ -std::vector < bool > dwell_time_(std::list < std::vector < int > > timestamps, int dwell_value); -RcppExport SEXP ortiz_dwell_time_(SEXP timestampsSEXP, SEXP dwell_valueSEXP) { +std::vector < unsigned int > dwell_time_(std::list < std::vector < int > > timestamps); +RcppExport SEXP ortiz_dwell_time_(SEXP timestampsSEXP) { BEGIN_RCPP Rcpp::RObject __result; Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< std::list < std::vector < int > > >::type timestamps(timestampsSEXP); - Rcpp::traits::input_parameter< int >::type dwell_value(dwell_valueSEXP); - __result = Rcpp::wrap(dwell_time_(timestamps, dwell_value)); + __result = Rcpp::wrap(dwell_time_(timestamps)); return __result; END_RCPP } diff --git a/src/dwell.cpp b/src/dwell.cpp index 586bb07..0fd394b 100644 --- a/src/dwell.cpp +++ b/src/dwell.cpp @@ -1,35 +1,36 @@ #include "dwell.h" -bool dwelltime::dwell_single(std::vector < int > timestamps){ +unsigned int dwelltime::dwell_single(std::vector < int > timestamps){ // We can't compute if there was only one event (this should // only happen if a user never made it to a result but whatever) int input_size = timestamps.size(); if(input_size < 2){ - return false; + return 0; } - // Ensure timestamps are sorted - sort(timestamps.begin(), timestamps.end()); + // Timestamps should be sorted already, sorting them here would + // mess with the visit-page and last-check-in intertimes. + // sort(timestamps.begin(), timestamps.end()); + unsigned int max_intertime = 0; - // Compute intertime values. If any of them are > - // dwell_time, we have a success. + // Compute intertime values and figure out the maximum one. for(unsigned int i = 1; i < input_size; i++){ - if((timestamps[i] - timestamps[i-1]) >= dwell_val){ - return true; + if (timestamps[i] - timestamps[i-1] > max_intertime){ + max_intertime = timestamps[i] - timestamps[i-1]; } } - // Otherwise, fail - return false; + // Return + return max_intertime; } -std::vector < bool > dwelltime::dwell_vector(std::list < std::vector < int > > timestamps){ +std::vector < unsigned int > dwelltime::dwell_vector(std::list < std::vector < int > > timestamps){ // Create output object unsigned int input_size = timestamps.size(); - std::vector < bool > output(input_size); + std::vector < unsigned int > output(input_size); // We can't iterate over a list with indices, which is sad (and means we have to // distinctly increment an integer for assignment to the output vector) @@ -44,8 +45,4 @@ // Return return output; -} - -dwelltime::dwelltime(int val){ - dwell_val = val; } diff --git a/src/dwell.h b/src/dwell.h index 064cedd..be121ea 100644 --- a/src/dwell.h +++ b/src/dwell.h @@ -8,15 +8,12 @@ private: - unsigned int dwell_val; - - bool dwell_single(std::vector < int > timestamps); + unsigned int dwell_single(std::vector < int > timestamps); public: - std::vector < bool > dwell_vector(std::list < std::vector < int > > timestamps); + std::vector < unsigned int > dwell_vector(std::list < std::vector < int > > timestamps); - dwelltime(int val); }; #endif diff --git a/src/ortiz.cpp b/src/ortiz.cpp index 4bc5017..2215e56 100644 --- a/src/ortiz.cpp +++ b/src/ortiz.cpp @@ -1,10 +1,10 @@ #include "dwell.h" // [[Rcpp::export]] -std::vector < bool > dwell_time_(std::list < std::vector < int > > timestamps, int dwell_value){ +std::vector < unsigned int > dwell_time_(std::list < std::vector < int > > timestamps){ // Instantiate the dwelltime class, giving it the dwell value - dwelltime dwell_inst(dwell_value); + dwelltime dwell_inst; // Compute dwell times for the input list and return it. return dwell_inst.dwell_vector(timestamps); diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..b1db86a --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(ortiz) + +test_check("ortiz") diff --git a/tests/testthat/test-dwelltime.R b/tests/testthat/test-dwelltime.R new file mode 100644 index 0000000..ff48c69 --- /dev/null +++ b/tests/testthat/test-dwelltime.R @@ -0,0 +1,24 @@ +context("dwelltime") + +sample_events <- read.csv(system.file("extdata", "sample_events.csv", package="ortiz"), stringsAsFactors = FALSE) +sample_events$timestamp <- as.POSIXct(sample_events$timestamp, format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC") + +test_that("intertimes are correctly estimated", { + expect_equal(ortiz:::dwell_time_(split(x = sample_events[, "timestamp"], f = sample_events[, "session_id"])), c(31, 30)) +}) + +test_that("session intertimes are correctly estimated", { + expect_equal(ortiz:::dwell_time_(split(x = sample_events[c(1, 8:9, 18), "timestamp"], f = sample_events[c(1, 8:9, 18), "session_id"])), c(92, 150)) +}) + +test_that("threshold of 0 returns all TRUE", { + expect_equal(dwell_time(sample_events, "session_id", "timestamp", 0), c(TRUE, TRUE)) +}) + +test_that("threshold of 100 returns all FALSE", { + expect_equal(dwell_time(sample_events, "session_id", "timestamp", 100), c(FALSE, FALSE)) +}) + +test_that("threshold of 30 returns 50% pass rate", { + expect_equal(dwell_time(sample_events, "session_id", "timestamp", 30), c(TRUE, FALSE)) +}) -- To view, visit https://gerrit.wikimedia.org/r/275144 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I60ad61077b1db34f668973180285937fbe7d300d Gerrit-PatchSet: 4 Gerrit-Project: wikimedia/discovery/ortiz Gerrit-Branch: master Gerrit-Owner: Bearloga <[email protected]> Gerrit-Reviewer: OliverKeyes <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
