OliverKeyes has submitted this change and it was merged.

Change subject: Adjust dwelltime calculation
......................................................................


Adjust dwelltime calculation

- Makes dwell_time_ output max intertime
- Brings dwelltime > threshold checking out into dwell_time
- Adds sample event log data and unit tests

Bug: T128929
Change-Id: I60ad61077b1db34f668973180285937fbe7d300d
---
M .Rbuildignore
M DESCRIPTION
M NAMESPACE
M R/RcppExports.R
M R/dwell.R
M R/utilities.R
A inst/extdata/sample_events.csv
M man/dwell_time.Rd
M man/ortiz.Rd
M src/RcppExports.cpp
M src/dwell.cpp
M src/dwell.h
M src/ortiz.cpp
A tests/testthat.R
A tests/testthat/test-dwelltime.R
15 files changed, 98 insertions(+), 53 deletions(-)

Approvals:
  OliverKeyes: Verified; Looks good to me, approved



diff --git a/.Rbuildignore b/.Rbuildignore
index fdb7e2b..d17ba1a 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -4,3 +4,4 @@
 ^README.Rmd$
 ^.travis.yml$
 ^appveyor.yml$
+^.*\.gitreview$
diff --git a/DESCRIPTION b/DESCRIPTION
index dd4319b..0f7ace9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,12 +1,17 @@
 Package: ortiz
 Title: Speedy User Satisfaction Computations
-Version: 0.0.1
+Version: 0.0.2
 Author: Oliver Keyes
-Maintainer: Oliver Keyes <[email protected]>
-Description: Speedily computes various user satisfaction-related metrics for 
Search.
+Maintainer: Mikhail Popov <[email protected]>
+Description: Speedily computes various user satisfaction-related metrics for
+    Search.
 License: MIT + file LICENSE
 LazyData: true
-URL: https://gerrit.wikimedia.org/r/#/admin/projects/wikimedia/discovery/ortiz
-BugReports: 
https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery
+URL: https://git.wikimedia.org/summary/?r=wikimedia/discovery/ortiz.git
+BugReports: https://phabricator.wikimedia.org/maniphest/task/create/?
+    projects=Discovery
 LinkingTo: Rcpp
-Imports: Rcpp
+Imports:
+    Rcpp
+RoxygenNote: 5.0.1
+Suggests: testthat
diff --git a/NAMESPACE b/NAMESPACE
index 165486d..3bd0ff1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,4 +1,4 @@
-# Generated by roxygen2 (4.1.1): do not edit by hand
+# Generated by roxygen2: do not edit by hand
 
 export(dwell_time)
 importFrom(Rcpp,sourceCpp)
diff --git a/R/RcppExports.R b/R/RcppExports.R
index d6e42b5..9263b78 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,7 +1,7 @@
 # This file was generated by Rcpp::compileAttributes
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-dwell_time_ <- function(timestamps, dwell_value) {
-    .Call('ortiz_dwell_time_', PACKAGE = 'ortiz', timestamps, dwell_value)
+dwell_time_ <- function(timestamps) {
+    .Call('ortiz_dwell_time_', PACKAGE = 'ortiz', timestamps)
 }
 
diff --git a/R/dwell.R b/R/dwell.R
index bc663d2..ff2ff79 100644
--- a/R/dwell.R
+++ b/R/dwell.R
@@ -8,23 +8,23 @@
 #'
 #'@param data a data.frame containing session data
 #'
-#'@param ids the name or indices of the column containing unique session IDs
+#'@param id_col the name or index of the column containing unique session IDs
 #'
-#'@param timestamps the name or indices of the column containing timestamps.
+#'@param ts_col the name or index of the column containing timestamps
 #'
 #'@param dwell_threshold the value (in seconds) to use to indicate a 
"successful"
 #'session.
 #'
 #'@export
-dwell_time <- function(data, ids, timestamps, dwell_threshold = 100){
+dwell_time <- function(data, id_col, ts_col, dwell_threshold = 100) {
   
   # Check type. We need timestamps to end up as numeric seconds 
representations,
   # which we can trivially convert to if they're POSIX (or if they're already)
   # formatted) but can't handle consistently otherwise.
-  data <- numeric_check(data, timestamps)
+  data <- numeric_check(data, ts_col)
   
   # Split the data up per unique session/user ID.
-  split_data <- split(x = data[,timestamps], f = data[,ids])
+  split_data <- split(x = data[, ts_col], f = data[, id_col])
   
-  return(dwell_time_(split_data, dwell_threshold))
+  return(dwell_time_(split_data) > dwell_threshold)
 }
diff --git a/R/utilities.R b/R/utilities.R
index 8de4951..8fa01d1 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -1,9 +1,8 @@
-numeric_check <- function(data, timestamps){
-  if(any(c("POSIXlt","POSIXt") %in% c(class(data[,timestamps])))){
-    data[,timestamps] <- as.numeric(data[,timestamps])
-  } else if(!any(c("numeric","integer") %in% class(data[,timestamps]))){
-    stop("The timestamps column must be either a numeric value representing 
seconds,
-         or POSIX timestamps")
+numeric_check <- function(data, timestamps) {
+  if (any(c("POSIXt", "POSIXlt", "POSIXct") %in% unlist(lapply(data[, 
timestamps], class)))) {
+    data[, timestamps] <- as.numeric(data[, timestamps])
+  } else if (!any(c("numeric", "integer") %in% class(data[, timestamps]))) {
+    stop("The timestamps column must be either a numeric value representing 
seconds or POSIX timestamps")
   }
   return(data)
 }
diff --git a/inst/extdata/sample_events.csv b/inst/extdata/sample_events.csv
new file mode 100644
index 0000000..f85d6ae
--- /dev/null
+++ b/inst/extdata/sample_events.csv
@@ -0,0 +1,19 @@
+timestamp,session_id
+2016-03-01T06:52:49Z,0024c4506bf92e1c
+2016-03-01T06:52:59Z,0024c4506bf92e1c
+2016-03-01T06:53:09Z,0024c4506bf92e1c
+2016-03-01T06:53:19Z,0024c4506bf92e1c
+2016-03-01T06:53:29Z,0024c4506bf92e1c
+2016-03-01T06:53:39Z,0024c4506bf92e1c
+2016-03-01T06:53:50Z,0024c4506bf92e1c
+2016-03-01T06:54:21Z,0024c4506bf92e1c
+2016-03-01T15:07:25Z,fffb978cc690214c
+2016-03-01T15:07:35Z,fffb978cc690214c
+2016-03-01T15:07:45Z,fffb978cc690214c
+2016-03-01T15:07:55Z,fffb978cc690214c
+2016-03-01T15:08:05Z,fffb978cc690214c
+2016-03-01T15:08:15Z,fffb978cc690214c
+2016-03-01T15:08:25Z,fffb978cc690214c
+2016-03-01T15:08:55Z,fffb978cc690214c
+2016-03-01T15:09:25Z,fffb978cc690214c
+2016-03-01T15:09:55Z,fffb978cc690214c
diff --git a/man/dwell_time.Rd b/man/dwell_time.Rd
index 979eecc..e5acc1c 100644
--- a/man/dwell_time.Rd
+++ b/man/dwell_time.Rd
@@ -1,17 +1,17 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/dwell.R
 \name{dwell_time}
 \alias{dwell_time}
 \title{Identify User Satisfaction Rate Based on Dwell Time}
 \usage{
-dwell_time(data, ids, timestamps, dwell_threshold = 100)
+dwell_time(data, id_col, ts_col, dwell_threshold = 100)
 }
 \arguments{
 \item{data}{a data.frame containing session data}
 
-\item{ids}{the name or indices of the column containing unique session IDs}
+\item{id_col}{the name or index of the column containing unique session IDs}
 
-\item{timestamps}{the name or indices of the column containing timestamps.}
+\item{ts_col}{the name or index of the column containing timestamps}
 
 \item{dwell_threshold}{the value (in seconds) to use to indicate a "successful"
 session.}
diff --git a/man/ortiz.Rd b/man/ortiz.Rd
index 87868cf..af75927 100644
--- a/man/ortiz.Rd
+++ b/man/ortiz.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ortiz.R
 \docType{package}
 \name{ortiz}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index f349fed..dda9989 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -6,14 +6,13 @@
 using namespace Rcpp;
 
 // dwell_time_
-std::vector < bool > dwell_time_(std::list < std::vector < int > > timestamps, 
int dwell_value);
-RcppExport SEXP ortiz_dwell_time_(SEXP timestampsSEXP, SEXP dwell_valueSEXP) {
+std::vector < unsigned int > dwell_time_(std::list < std::vector < int > > 
timestamps);
+RcppExport SEXP ortiz_dwell_time_(SEXP timestampsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject __result;
     Rcpp::RNGScope __rngScope;
     Rcpp::traits::input_parameter< std::list < std::vector < int > > >::type 
timestamps(timestampsSEXP);
-    Rcpp::traits::input_parameter< int >::type dwell_value(dwell_valueSEXP);
-    __result = Rcpp::wrap(dwell_time_(timestamps, dwell_value));
+    __result = Rcpp::wrap(dwell_time_(timestamps));
     return __result;
 END_RCPP
 }
diff --git a/src/dwell.cpp b/src/dwell.cpp
index 586bb07..0fd394b 100644
--- a/src/dwell.cpp
+++ b/src/dwell.cpp
@@ -1,35 +1,36 @@
 #include "dwell.h"
 
-bool dwelltime::dwell_single(std::vector < int > timestamps){
+unsigned int dwelltime::dwell_single(std::vector < int > timestamps){
   
   // We can't compute if there was only one event (this should
   // only happen if a user never made it to a result but whatever)
   int input_size = timestamps.size();
   if(input_size < 2){
-    return false;
+    return 0;
   }
   
-  // Ensure timestamps are sorted
-  sort(timestamps.begin(), timestamps.end());
+  // Timestamps should be sorted already, sorting them here would
+  // mess with the visit-page and last-check-in intertimes.
+  // sort(timestamps.begin(), timestamps.end());
+  unsigned int max_intertime = 0;
   
-  // Compute intertime values. If any of them are >
-  // dwell_time, we have a success.
+  // Compute intertime values and figure out the maximum one.
   for(unsigned int i = 1; i < input_size; i++){
-    if((timestamps[i] - timestamps[i-1]) >= dwell_val){
-      return true;
+    if (timestamps[i] - timestamps[i-1] > max_intertime){
+      max_intertime = timestamps[i] - timestamps[i-1];
     }
   }
   
-  // Otherwise, fail
-  return false;
+  // Return
+  return max_intertime;
   
 }
 
-std::vector < bool > dwelltime::dwell_vector(std::list < std::vector < int > > 
timestamps){
+std::vector < unsigned int > dwelltime::dwell_vector(std::list < std::vector < 
int > > timestamps){
   
   // Create output object
   unsigned int input_size = timestamps.size();
-  std::vector < bool > output(input_size);
+  std::vector < unsigned int > output(input_size);
   
   // We can't iterate over a list with indices, which is sad (and means we 
have to
   // distinctly increment an integer for assignment to the output vector)
@@ -44,8 +45,4 @@
   
   // Return
   return output;
-}
-
-dwelltime::dwelltime(int val){
-  dwell_val = val;
 }
diff --git a/src/dwell.h b/src/dwell.h
index 064cedd..be121ea 100644
--- a/src/dwell.h
+++ b/src/dwell.h
@@ -8,15 +8,12 @@
   
 private:
   
-  unsigned int dwell_val;
-  
-  bool dwell_single(std::vector < int > timestamps);
+  unsigned int dwell_single(std::vector < int > timestamps);
   
 public:
   
-  std::vector < bool > dwell_vector(std::list < std::vector < int > > 
timestamps);
+  std::vector < unsigned int > dwell_vector(std::list < std::vector < int > > 
timestamps);
   
-  dwelltime(int val);
 };
 
 #endif
diff --git a/src/ortiz.cpp b/src/ortiz.cpp
index 4bc5017..2215e56 100644
--- a/src/ortiz.cpp
+++ b/src/ortiz.cpp
@@ -1,10 +1,10 @@
 #include "dwell.h"
 
 // [[Rcpp::export]]
-std::vector < bool > dwell_time_(std::list < std::vector < int > > timestamps, 
int dwell_value){
+std::vector < unsigned int > dwell_time_(std::list < std::vector < int > > 
timestamps){
   
   // Instantiate the dwelltime class, giving it the dwell value
-  dwelltime dwell_inst(dwell_value);
+  dwelltime dwell_inst;
   
   // Compute dwell times for the input list and return it.
   return dwell_inst.dwell_vector(timestamps);
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..b1db86a
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(ortiz)
+
+test_check("ortiz")
diff --git a/tests/testthat/test-dwelltime.R b/tests/testthat/test-dwelltime.R
new file mode 100644
index 0000000..ff48c69
--- /dev/null
+++ b/tests/testthat/test-dwelltime.R
@@ -0,0 +1,24 @@
+context("dwelltime")
+
+sample_events <- read.csv(system.file("extdata", "sample_events.csv", 
package="ortiz"), stringsAsFactors = FALSE)
+sample_events$timestamp <- as.POSIXct(sample_events$timestamp, format = 
"%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
+
+test_that("intertimes are correctly estimated", {
+  expect_equal(ortiz:::dwell_time_(split(x = sample_events[, "timestamp"], f = 
sample_events[, "session_id"])), c(31, 30))
+})
+
+test_that("session intertimes are correctly estimated", {
+  expect_equal(ortiz:::dwell_time_(split(x = sample_events[c(1, 8:9, 18), 
"timestamp"], f = sample_events[c(1, 8:9, 18), "session_id"])), c(92, 150))
+})
+
+test_that("threshold of 0 returns all TRUE", {
+  expect_equal(dwell_time(sample_events, "session_id", "timestamp", 0), 
c(TRUE, TRUE))
+})
+
+test_that("threshold of 100 returns all FALSE", {
+  expect_equal(dwell_time(sample_events, "session_id", "timestamp", 100), 
c(FALSE, FALSE))
+})
+
+test_that("threshold of 30 returns 50% pass rate", {
+  expect_equal(dwell_time(sample_events, "session_id", "timestamp", 30), 
c(TRUE, FALSE))
+})

-- 
To view, visit https://gerrit.wikimedia.org/r/275144
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I60ad61077b1db34f668973180285937fbe7d300d
Gerrit-PatchSet: 4
Gerrit-Project: wikimedia/discovery/ortiz
Gerrit-Branch: master
Gerrit-Owner: Bearloga <[email protected]>
Gerrit-Reviewer: OliverKeyes <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to