Bearloga has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/335746 )

Change subject: Point to new datasets
......................................................................

Point to new datasets

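The dashboard now reads the Reportupdater-generated datasets from the
discovery/search/ paths and reshapes them with dplyr/tidyr instead of
reshape2. A minimal sketch of the new read pattern, mirroring the
read_desktop() change in utils.R below (the action and events column
names come from the new TSVs):

  library(magrittr)  # for %>%
  desktop_dygraph_set <- polloi::read_dataset(
      "discovery/search/desktop_event_counts.tsv", col_types = "Dci") %>%
    dplyr::filter(!is.na(action), !is.na(events)) %>%  # drop incomplete rows
    tidyr::spread(action, events, fill = 0)            # one column per event type
  desktop_dygraph_means <- round(colMeans(desktop_dygraph_set[, 2:5]))

The other read_*() helpers in utils.R follow the same pattern, and the
KPI code now sums the renamed "calls" column instead of "events".
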
Change-Id: Id384962d485931ebcb904e491ae0bf641d38c9bf
---
M CHANGELOG.md
M server.R
M utils.R
3 files changed, 272 insertions(+), 249 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/rainbow refs/changes/46/335746/1

diff --git a/CHANGELOG.md b/CHANGELOG.md
index efb55e8..f01c6ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,17 @@
 # Change Log (Patch Notes)
+
 All notable changes to this project will be documented in this file.
 
+## 2017/02/02
+- Updated to work with new datasets generated by Reportupdater-based golden ([T150915](https://phabricator.wikimedia.org/T150915))
+
+## 2016
+- Added PaulScore ([T144424](https://phabricator.wikimedia.org/T144424))
+- Added ZRR broken up by language-project pairs ([T126244](https://phabricator.wikimedia.org/T126244))
+- Added Invoke Source and Click Positions for Android ([T143726](https://phabricator.wikimedia.org/T143726))
+- Added visited result survival ([T113297](https://phabricator.wikimedia.org/T113297))
+- Added dwell-time & user engagement metrics ([T113297](https://phabricator.wikimedia.org/T113297), [T113513](https://phabricator.wikimedia.org/T113513), [Change 240593](https://gerrit.wikimedia.org/r/#/c/240593/))
+
 ## 2015/11/10
 - Updated the readme
 - Moved certain code blocks to **polloi** for use in other dashboards
diff --git a/server.R b/server.R
index 62678bd..343dd5f 100644
--- a/server.R
+++ b/server.R
@@ -1,9 +1,9 @@
-library(shiny)
-library(shinydashboard)
-library(dygraphs)
-library(sparkline)
-library(DT)
-library(data.table)
+suppressPackageStartupMessages({
+  library(shiny)
+  library(shinydashboard)
+  library(dygraphs)
+  library(sparkline)
+})
 
 source("utils.R")
 
@@ -162,7 +162,7 @@
   ## App value boxes
   output$app_event_searches <- renderValueBox(
     valueBox(
-      value = android_dygraph_means[3],
+      value = ios_dygraph_means["search sessions"] + 
android_dygraph_means["search sessions"],
       subtitle = "Search sessions per day",
       icon = icon("search"),
       color = "green"
@@ -171,7 +171,7 @@
 
   output$app_event_resultsets <- renderValueBox(
     valueBox(
-      value = android_dygraph_means[2],
+      value = ios_dygraph_means["Result pages opened"] + 
android_dygraph_means["Result pages opened"],
       subtitle = "Result sets per day",
       icon = icon("list", lib = "glyphicon"),
       color = "green"
@@ -180,7 +180,7 @@
 
   output$app_event_clickthroughs <- renderValueBox(
     valueBox(
-      value = android_dygraph_means[1],
+      value = ios_dygraph_means["clickthroughs"] + 
android_dygraph_means["clickthroughs"],
       subtitle = "Clickthroughs per day",
       icon = icon("hand-up", lib = "glyphicon"),
       color = "green"
@@ -240,36 +240,35 @@
 
   ## API plots
   output$cirrus_aggregate <- renderDygraph({
-    split_dataset$cirrus[, c(1, 3)] %>%
+    split_dataset$cirrus %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, input$smoothing_fulltext_search)) 
%>%
-
       polloi::make_dygraph(xlab = "Date", ylab = "Searches", title = 
"Full-text via API usage by day", legend_name = "Searches") %>%
       dyRangeSelector
   })
 
   output$open_aggregate <- renderDygraph({
-    split_dataset$open[, c(1, 3)] %>%
+    split_dataset$open %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, input$smoothing_open_search)) %>%
       polloi::make_dygraph(xlab = "Date", ylab = "Searches", title = 
"OpenSearch API usage by day", legend_name = "Searches") %>%
       dyRangeSelector
   })
 
   output$geo_aggregate <- renderDygraph({
-    split_dataset$geo[, c(1, 3)] %>%
+    split_dataset$geo %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, input$smoothing_geo_search)) %>%
       polloi::make_dygraph(xlab = "Date", ylab = "Searches", title = "Geo 
Search API usage by day", legend_name = "Searches") %>%
       dyRangeSelector
   })
 
   output$language_aggregate <- renderDygraph({
-    split_dataset$language[, c(1, 3)] %>%
+    split_dataset$language %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, input$smoothing_language_search)) 
%>%
       polloi::make_dygraph(xlab = "Date", ylab = "Searches", title = "Language 
Search API usage by day", legend_name = "Searches") %>%
       dyRangeSelector
   })
 
   output$prefix_aggregate <- renderDygraph({
-    split_dataset$prefix[, c(1, 3)] %>%
+    split_dataset$prefix %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, input$smoothing_prefix_search)) 
%>%
       polloi::make_dygraph(xlab = "Date", ylab = "Searches", title = "Prefix 
Search API usage by day", legend_name = "Searches") %>%
       dyRangeSelector
@@ -277,7 +276,8 @@
 
   # Failure plots
   output$failure_rate_plot <- renderDygraph({
-    polloi::data_select(input$failure_rate_automata, 
failure_data_with_automata, failure_data_no_automata) %>%
+    input$failure_rate_automata %>%
+      polloi::data_select(failure_data_with_automata, 
failure_data_no_automata) %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, input$smoothing_failure_rate)) %>%
       polloi::make_dygraph(xlab = "Date", ylab = "Zero Results Rate (%)", 
title = "Zero Results Rate, by day",
                            legend_name = "ZRR") %>%
@@ -288,7 +288,8 @@
   })
 
   output$failure_rate_change_plot <- renderDygraph({
-    polloi::data_select(input$failure_rate_automata, 
failure_roc_with_automata, failure_roc_no_automata) %>%
+    input$failure_rate_automata %>%
+      polloi::data_select(failure_roc_with_automata, failure_roc_no_automata) 
%>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, input$smoothing_failure_rate)) %>%
       polloi::make_dygraph(xlab = "Date", ylab = "Change", title = "Zero 
Results rate change, by day", legend_name = "Change") %>%
       dyAxis("y", axisLabelFormatter = "function(x) { return x + '%'; }", 
valueFormatter = "function(x) { return Math.round(x, 3) + '%'; }") %>%
@@ -300,8 +301,7 @@
     xts_data <- input$failure_breakdown_automata %>%
       polloi::data_select(failure_breakdown_with_automata, 
failure_breakdown_no_automata) %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, 
input$smoothing_failure_breakdown)) %>%
-
-      { xts(.[, -1], order.by = .$date) }
+      { xts::xts(.[, -1], order.by = .$date) }
     xts_data %>% dygraph(xlab = "Date", ylab = "Zero Results Rate",
                          main = "Zero result rate by search type") %>%
       dyLegend(width = 600, show = "always", labelsDiv = 
"failure_breakdown_plot_legend") %>%
@@ -328,7 +328,8 @@
   })
 
   output$suggestion_dygraph_plot <- renderDygraph({
-    polloi::data_select(input$failure_suggestions_automata, 
suggestion_with_automata, suggestion_no_automata) %>%
+    input$failure_suggestions_automata %>%
+      polloi::data_select(suggestion_with_automata, suggestion_no_automata) %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, 
input$smoothing_failure_suggestions)) %>%
       polloi::make_dygraph(xlab = "Date", ylab = "Zero Results Rate", title = 
"Zero Result Rates with Search Suggestions") %>%
       dyAxis("y", axisLabelFormatter = "function(x) { return x + '%'; }", 
valueFormatter = "function(x) { return x + '%'; }") %>%
@@ -384,7 +385,8 @@
   })
 
   output$failure_langproj_plot <- renderDygraph({
-    polloi::data_select(input$failure_langproj_automata, 
langproj_with_automata, langproj_no_automata) %>%
+    input$failure_langproj_automata %>%
+      polloi::data_select(langproj_with_automata, langproj_no_automata) %>%
       aggregate_wikis(input$language_selector, input$project_selector) %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, 
input$smoothing_failure_langproj)) %>%
       polloi::make_dygraph(xlab = "", ylab = "Zero Results Rate", title = 
"Zero result rate by language and project") %>%
@@ -417,7 +419,7 @@
              temp <- dates %>%
                as.character("%e") %>%
                as.numeric %>%
-               sapply(toOrdinal) %>%
+               sapply(toOrdinal::toOrdinal) %>%
                sub("([a-z]{2})", "<sup>\\1</sup>", .) %>%
                paste0(as.character(dates, "%A, %b "), .)
            },
@@ -426,7 +428,7 @@
              temp <- dates %>%
                as.character("%e") %>%
                as.numeric %>%
-               sapply(toOrdinal) %>%
+               sapply(toOrdinal::toOrdinal) %>%
                sub("([a-z]{2})", "<sup>\\1</sup>", .) %>%
                paste0(as.character(dates, "%b "), .) %>%
                {
@@ -438,7 +440,7 @@
              temp <- dates %>%
                as.character("%e") %>%
                as.numeric %>%
-               sapply(toOrdinal) %>%
+               sapply(toOrdinal::toOrdinal) %>%
                sub("([a-z]{2})", "<sup>\\1</sup>", .) %>%
                paste0(as.character(dates, "%b "), .) %>%
                {
@@ -450,7 +452,7 @@
              return(dates %>%
                       as.character("%e") %>%
                       as.numeric %>%
-                      sapply(toOrdinal) %>%
+                      sapply(toOrdinal::toOrdinal) %>%
                       sub("([a-z]{2})", "<sup>\\1</sup>", .) %>%
                       paste0(as.character(dates, "%B "), .) %>%
                       paste0(collapse = "-") %>%
@@ -485,8 +487,7 @@
   output$kpi_summary_box_zero_results <- renderValueBox({
     date_range <- input$kpi_summary_date_range_selector
     if (date_range == "all") return(div("Zero results rate"))
-    x <- polloi::subset_by_date_range(failure_data_with_automata, from = 
start_date(date_range), to = Sys.Date() - 1)
-    x <- transform(x, Rate = rate)$Rate
+    x <- polloi::subset_by_date_range(failure_data_with_automata, from = 
start_date(date_range), to = Sys.Date() - 1)$rate
     if (date_range == "quarterly") {
       return(valueBox(subtitle = "Zero results rate", color = "orange",
                       value = sprintf("%.1f%%", median(x))))
@@ -497,7 +498,7 @@
         return(valueBox(
           subtitle = sprintf("Zero results rate (%.1f%%)", z),
           value = sprintf("%.1f%%", y2),
-          icon = cond_icon(z > 0), color = polloi::cond_color(z > 0, "red")
+          icon = polloi::cond_icon(z > 0), color = polloi::cond_color(z > 0, 
"red")
         ))
       }
       return(valueBox(subtitle = "Zero results rate (no change)",
@@ -510,9 +511,9 @@
     if (date_range == "all") return(div("API usage"))
     x <- split_dataset %>%
       lapply(polloi::subset_by_date_range, from = start_date(date_range), to = 
Sys.Date() - 1) %>%
-      lapply(function(x) return(x$events)) %>%
-      do.call(cbind, .) %>%
-      transform(total = cirrus + geo + language + open + prefix) %>%
+      dplyr::bind_rows(.id = "api") %>%
+      dplyr::group_by(date) %>%
+      dplyr::summarize(total = sum(calls)) %>%
       { .$total }
     if (date_range == "quarterly") {
       return(valueBox(subtitle = "API usage", value = 
polloi::compress(median(x), 0), color = "orange"))
@@ -535,17 +536,17 @@
     x <- polloi::subset_by_date_range(augmented_clickthroughs, from = 
start_date(date_range), to = Sys.Date() - 1)
     if (date_range == "quarterly") {
       return(valueBox(subtitle = "User engagement", color = "orange",
-                      value = sprintf("%.1f%%", median(x$user_engagement))))
+                      value = sprintf("%.1f%%", median(x$`User engagement`))))
     }
-    y1 <- median(polloi::half(x$user_engagement))
-    y2 <- median(polloi::half(x$user_engagement, FALSE))
+    y1 <- median(polloi::half(x$`User engagement`))
+    y2 <- median(polloi::half(x$`User engagement`, FALSE))
     z <- 100 * (y2 - y1)/y1
     if (!is.na(z)) {
       if (abs(z) > 0) {
         return(valueBox(
           subtitle = sprintf("User engagement (%.1f%%)", z),
           value = sprintf("%.1f%%", y2),
-          icon = cond_icon(z > 0), color = polloi::cond_color(z > 0, "green")
+          icon = polloi::cond_icon(z > 0), color = polloi::cond_color(z > 0, 
"green")
         ))
       }
       return(valueBox(subtitle = "User engagement (no change)",
@@ -556,16 +557,13 @@
 
   ## KPI Sparklines
   output$sparkline_load_time <- sparkline:::renderSparkline({
-    if(input$kpi_summary_date_range_selector == "all"){
+    if (input$kpi_summary_date_range_selector == "all") {
       output_sl <- list(desktop_load_data, mobile_load_data, 
android_load_data, ios_load_data)
-    } else{
+    } else {
       output_sl <- list(desktop_load_data, mobile_load_data, 
android_load_data, ios_load_data) %>%
         lapply(polloi::subset_by_date_range, from = Sys.Date() - 91, to = 
Sys.Date() - 1)
     }
     output_sl <- output_sl %>%
-      lapply(function(platform_load_data) {
-        platform_load_data[, c("date", "Median")]
-      }) %>%
       dplyr::bind_rows(.id = "platform") %>%
       dplyr::group_by(date) %>%
       dplyr::summarize(Median = median(Median)) %>%
@@ -595,9 +593,9 @@
     return(sparkline::spk_composite(sl1, sl2))
   })
   output$sparkline_zero_results <- sparkline:::renderSparkline({
-    if(input$kpi_summary_date_range_selector == "all"){
+    if (input$kpi_summary_date_range_selector == "all") {
       output_sl <- failure_data_with_automata
-    } else{
+    } else {
       output_sl <- failure_data_with_automata %>%
         polloi::subset_by_date_range(from = Sys.Date() - 91, to = Sys.Date() - 
1)
     }
@@ -611,11 +609,11 @@
                                 chartRangeMin = min(output_sl), chartRangeMax 
= max(output_sl),
                                 highlightLineColor = 'orange', 
highlightSpotColor = 'orange')
     # highlight selected date range
-    if (input$kpi_summary_date_range_selector == "weekly"){
+    if (input$kpi_summary_date_range_selector == "weekly") {
       output_highlight <- c(rep(NA, length(output_sl)-7), 
output_sl[(length(output_sl)-6):length(output_sl)])
-    } else if (input$kpi_summary_date_range_selector == "monthly"){
+    } else if (input$kpi_summary_date_range_selector == "monthly") {
       output_highlight <- c(rep(NA, length(output_sl)-30), 
output_sl[(length(output_sl)-29):length(output_sl)])
-    } else if (input$kpi_summary_date_range_selector == "quarterly"){
+    } else if (input$kpi_summary_date_range_selector == "quarterly") {
       output_highlight <- output_sl
     } else {
       return(sl1)
@@ -628,19 +626,16 @@
     return(sparkline::spk_composite(sl1, sl2))
   })
   output$sparkline_api_usage <- sparkline:::renderSparkline({
-    if(input$kpi_summary_date_range_selector == "all"){
+    if (input$kpi_summary_date_range_selector == "all") {
       output_sl <- split_dataset
-    } else{
+    } else {
       output_sl <- split_dataset %>%
         lapply(polloi::subset_by_date_range, from = Sys.Date() - 91, to = 
Sys.Date() - 1)
     }
     output_sl <- output_sl %>%
-      lapply(function(platform_load_data) {
-        platform_load_data[, c("date", "events")]
-      }) %>%
       dplyr::bind_rows(.id = "api") %>%
       dplyr::group_by(date) %>%
-      dplyr::summarize(total = sum(events)) %>%
+      dplyr::summarize(total = sum(calls)) %>%
       dplyr::select(total) %>%
       unlist(use.names = FALSE)
     sl1 <- sparkline::sparkline(values = output_sl, type = "line",
@@ -649,11 +644,11 @@
                                 chartRangeMin = min(output_sl), chartRangeMax 
= max(output_sl),
                                 highlightLineColor = 'orange', 
highlightSpotColor = 'orange')
     # highlight selected date range
-    if (input$kpi_summary_date_range_selector == "weekly"){
+    if (input$kpi_summary_date_range_selector == "weekly") {
       output_highlight <- c(rep(NA, length(output_sl)-7), 
output_sl[(length(output_sl)-6):length(output_sl)])
-    } else if (input$kpi_summary_date_range_selector == "monthly"){
+    } else if (input$kpi_summary_date_range_selector == "monthly") {
       output_highlight <- c(rep(NA, length(output_sl)-30), 
output_sl[(length(output_sl)-29):length(output_sl)])
-    } else if (input$kpi_summary_date_range_selector == "quarterly"){
+    } else if (input$kpi_summary_date_range_selector == "quarterly") {
       output_highlight <- output_sl
     } else {
       return(sl1)
@@ -666,14 +661,14 @@
     return(sparkline::spk_composite(sl1, sl2))
   })
   output$sparkline_augmented_clickthroughs <- sparkline:::renderSparkline({
-    if(input$kpi_summary_date_range_selector == "all"){
+    if (input$kpi_summary_date_range_selector == "all") {
       output_sl <- augmented_clickthroughs
-    } else{
+    } else {
       output_sl <- augmented_clickthroughs %>%
         polloi::subset_by_date_range(from = Sys.Date() - 91, to = Sys.Date() - 
1)
     }
     output_sl <- output_sl %>%
-      dplyr::select(user_engagement) %>%
+      dplyr::select(`User engagement`) %>%
       unlist(use.names = FALSE) %>%
       round(2)
     sl1 <- sparkline::sparkline(values = output_sl, type = "line",
@@ -682,11 +677,11 @@
                                 chartRangeMin = min(output_sl), chartRangeMax 
= max(output_sl),
                                 highlightLineColor = 'orange', 
highlightSpotColor = 'orange')
     # highlight selected date range
-    if (input$kpi_summary_date_range_selector == "weekly"){
+    if (input$kpi_summary_date_range_selector == "weekly") {
       output_highlight <- c(rep(NA, length(output_sl)-7), 
output_sl[(length(output_sl)-6):length(output_sl)])
-    } else if (input$kpi_summary_date_range_selector == "monthly"){
+    } else if (input$kpi_summary_date_range_selector == "monthly") {
       output_highlight <- c(rep(NA, length(output_sl)-30), 
output_sl[(length(output_sl)-29):length(output_sl)])
-    } else if (input$kpi_summary_date_range_selector == "quarterly"){
+    } else if (input$kpi_summary_date_range_selector == "quarterly") {
       output_highlight <- output_sl
     } else {
       return(sl1)
@@ -707,12 +702,10 @@
                                       all = NA, daily = 1, weekly = 8, monthly 
= 31, quarterly = 91)
     load_times <- list(desktop_load_data, mobile_load_data, android_load_data, 
ios_load_data) %>%
       {
-        if (is.na(start_date)) {
-          lapply(., function(dataset) {
-            return(dataset[!duplicated(dataset$date, dataset$event_type, 
fromLast = TRUE), ])
-          })
-        } else {
+        if (!is.na(start_date)) {
           lapply(., polloi::subset_by_date_range, from = start_date, to = 
Sys.Date() - 1)
+        } else {
+          .
         }
       } %>%
       lapply(function(data_tail) return(data_tail[, c('date', 'Median')])) %>%
@@ -744,10 +737,10 @@
     start_date <- Sys.Date() - switch(input$kpi_summary_date_range_selector, 
all = NA, daily = 1, weekly = 8, monthly = 31, quarterly = 91)
     zrr <- failure_data_with_automata %>%
       {
-        if (is.na(start_date)) {
-          .
-        } else {
+        if (!is.na(start_date)) {
           polloi::subset_by_date_range(., from = start_date, to = Sys.Date())
+        } else {
+          .
         }
       } %>%
       transform(`Rate` = rate)
@@ -787,25 +780,23 @@
     start_date <- Sys.Date() - switch(input$kpi_summary_date_range_selector, 
all = NA, daily = 1, weekly = 8, monthly = 31, quarterly = 91)
     api_usage <- split_dataset %>%
       {
-        if (is.na(start_date)) {
-          lapply(., function(dataset) {
-            return(dataset[!duplicated(dataset$date, dataset$event_type, 
fromLast = TRUE), ])
-          })
-        } else {
+        if (!is.na(start_date)) {
           lapply(., polloi::subset_by_date_range, from = start_date, to = 
Sys.Date() - 1)
+        } else {
+          .
         }
       } %>%
-      dplyr::bind_rows() %>%
-      tidyr::spread("event_type", "events") %>%
-      as.data.frame
+      dplyr::bind_rows(.id = "api") %>%
+      tidyr::spread("api", "calls")
     if ( input$kpi_api_usage_series_include_open ) {
-      api_usage <- transform(api_usage, all = cirrus + geo + language + open + 
prefix)
+      api_usage <- dplyr::mutate(api_usage, all = cirrus + geo + language + 
open + prefix)
     } else {
-      api_usage <- transform(api_usage, all = cirrus + geo + language + prefix)
+      api_usage <- dplyr::mutate(api_usage, all = cirrus + geo + language + 
prefix)
     }
     if ( input$kpi_api_usage_series_data == "raw" ) {
-      api_usage %<>% polloi::smoother(ifelse(smooth_level == "global", 
input$smoothing_global, smooth_level), rename = FALSE)
-      api_usage <- xts::xts(api_usage[, -1], api_usage[, 1])
+      api_usage %<>%
+        polloi::smoother(ifelse(smooth_level == "global", 
input$smoothing_global, smooth_level), rename = FALSE) %>%
+        { xts::xts(.[, -1], order.by = .$date) }
       if (!input$kpi_api_usage_series_include_open) {
         colnames(api_usage)[6] <- "all except open"
       }
@@ -821,16 +812,17 @@
                dyCSS(css = system.file("custom.css", package = "polloi")) %>%
                dyRangeSelector)
     }
-    api_usage_change <- transform(api_usage,
-                                  cirrus = polloi::percent_change(cirrus),
-                                  geo = polloi::percent_change(geo),
-                                  language = polloi::percent_change(language),
-                                  open = polloi::percent_change(open),
-                                  prefix = polloi::percent_change(prefix),
-                                  all = polloi::percent_change(all)) %>%
-                                  { .[-1, ] }
-    api_usage_change %<>% polloi::smoother(ifelse(smooth_level == "global", 
input$smoothing_global, smooth_level), rename = FALSE)
-    api_usage_change <- xts::xts(api_usage_change[, -1], api_usage_change[, 1])
+    api_usage_change <- dplyr::mutate(
+      api_usage,
+      cirrus = polloi::percent_change(cirrus),
+      geo = polloi::percent_change(geo),
+      language = polloi::percent_change(language),
+      open = polloi::percent_change(open),
+      prefix = polloi::percent_change(prefix),
+      all = polloi::percent_change(all)) %>%
+      { .[-1, ] } %>%
+      polloi::smoother(ifelse(smooth_level == "global", 
input$smoothing_global, smooth_level), rename = FALSE) %>%
+      { xts::xts(.[, -1], .$date) }
     if (!input$kpi_api_usage_series_include_open) 
colnames(api_usage_change)[6] <- "all except open"
     return(dygraph(api_usage_change,
                    main = "Day-to-day % change over time",
@@ -846,10 +838,10 @@
     start_date <- Sys.Date() - switch(input$kpi_summary_date_range_selector, 
all = NA, daily = 1, weekly = 8, monthly = 31, quarterly = 91)
     smoothed_data <- augmented_clickthroughs %>%
       {
-        if (is.na(start_date)) {
-          .
-        } else {
+        if (!is.na(start_date)) {
           polloi::subset_by_date_range(., from = start_date, to = Sys.Date())
+        } else {
+          .
         }
       } %>%
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, 
input$smoothing_augmented_clickthroughs))
@@ -862,56 +854,53 @@
       dyEvent(as.Date("2016-07-12"), "A (schema switch)", labelLoc = "bottom")
   })
 
-  output$monthly_metrics_tbl <- DT::renderDataTable(
-    {
-      temp <- data.frame(
+  output$monthly_metrics_tbl <- DT::renderDataTable({
+    temp <- data.frame(
       KPI = c("Load time", "Zero results rate", "API Usage", "User 
engagement"),
-      Units = c("ms", "%", "", "%")
+      Units = c("ms", "%", "", "%"),
+      stringsAsFactors = FALSE
     )
 
     prev_month <- as.Date(paste(input$monthy_metrics_year, which(month.name == 
input$monthy_metrics_month), "1", sep = "-"))
     prev_prev_month <- prev_month - months(1)
     prev_year <- prev_month - months(12)
 
-    smoothed_load_times <- list(Desktop = desktop_load_data,
-                                Mobile = mobile_load_data,
-                                Android = android_load_data,
-                                iOS = ios_load_data) %>%
-      lapply(function(platform_load_data) {
-        platform_load_data[, c("date", "Median")]
-      }) %>%
+    smoothed_load_times <- list(
+        Desktop = desktop_load_data,
+        Mobile = mobile_load_data,
+        Android = android_load_data,
+        iOS = ios_load_data
+      ) %>%
       dplyr::bind_rows(.id = "platform") %>%
       dplyr::group_by(date) %>%
       dplyr::summarize(Median = median(Median)) %>%
       polloi::smoother("month", rename = FALSE)
     smoothed_zrr <- polloi::smoother(failure_data_with_automata, "month", 
rename = FALSE)
     smoothed_api <- split_dataset %>%
-      lapply(function(platform_load_data) {
-        platform_load_data[, c("date", "events")]
-      }) %>%
       dplyr::bind_rows(.id = "api") %>%
       dplyr::group_by(date) %>%
-      dplyr::summarize(total = sum(events)) %>%
+      dplyr::summarize(total = sum(calls)) %>%
       polloi::smoother("month", rename = FALSE)
-    smoothed_engagement <- augmented_clickthroughs[, c("date", 
"user_engagement")] %>%
+    smoothed_engagement <- augmented_clickthroughs %>%
+      dplyr::select(c(date, `User engagement`)) %>%
       polloi::smoother("month", rename = FALSE)
     temp$Current <- c(
       smoothed_load_times$Median[smoothed_load_times$date == prev_month],
       smoothed_zrr$rate[smoothed_zrr$date == prev_month],
       smoothed_api$total[smoothed_api$date == prev_month],
-      smoothed_engagement$user_engagement[smoothed_engagement$date == 
prev_month]
+      smoothed_engagement$`User engagement`[smoothed_engagement$date == 
prev_month]
     )
     temp$Previous_month <- c(
       smoothed_load_times$Median[smoothed_load_times$date == prev_prev_month],
       smoothed_zrr$rate[smoothed_zrr$date == prev_prev_month],
       smoothed_api$total[smoothed_api$date == prev_prev_month],
-      smoothed_engagement$user_engagement[smoothed_engagement$date == 
prev_prev_month]
+      smoothed_engagement$`User engagement`[smoothed_engagement$date == 
prev_prev_month]
     )
     temp$Previous_year <- c(
       ifelse(sum(smoothed_load_times$date == prev_year) == 0, NA, 
smoothed_load_times$Median[smoothed_load_times$date == prev_year]),
       ifelse(sum(smoothed_zrr$date == prev_year) == 0, NA, 
smoothed_zrr$rate[smoothed_zrr$date == prev_year]),
       ifelse(sum(smoothed_api$date == prev_year) == 0, NA, 
smoothed_api$total[smoothed_api$date == prev_year]),
-      ifelse(sum(smoothed_engagement$date == prev_year) == 0, NA, 
smoothed_engagement$user_engagement[smoothed_engagement$date == prev_year])
+      ifelse(sum(smoothed_engagement$date == prev_year) == 0, NA, 
smoothed_engagement$`User engagement`[smoothed_engagement$date == prev_year])
     )
     temp$Anchors <- c("kpi_load_time", "kpi_zero_results", "kpi_api_usage", 
"kpi_augmented_clickthroughs")
 
@@ -945,7 +934,7 @@
       paste(smoothed_api %>% dplyr::arrange(date) %>% dplyr::mutate(month = 
zoo::as.yearmon(date)) %>%
               dplyr::select(-date) %>% dplyr::distinct() %>% {.$total}, 
collapse = ","),
       paste(smoothed_engagement %>% dplyr::arrange(date) %>% 
dplyr::mutate(month = zoo::as.yearmon(date)) %>%
-              dplyr::select(-date) %>% dplyr::distinct() %>% 
{.$user_engagement}, collapse = ",")
+              dplyr::select(-date) %>% dplyr::distinct() %>% {.$`User 
engagement`}, collapse = ",")
     )
     cols_to_keep <- c(1, 5, 4, 3, 7, 8, 9)
     if (!input$monthly_metrics_prev_month) {
@@ -954,21 +943,28 @@
     if (!input$monthly_metrics_prev_year) {
       cols_to_keep <- base::setdiff(cols_to_keep, 5)
     }
-    column_def <- list(list(targets = length(cols_to_keep)-1, render = 
JS("function(data, type, full){ return '<span class=sparkSeries>' + data + 
'</span>' }")))
+    column_def <- list(list(
+      targets = length(cols_to_keep) - 1,
+      render = DT::JS("function(data, type, full){ return '<span 
class=sparkSeries>' + data + '</span>' }")
+    ))
     line_string <- "type: 'line', lineColor: 'black', fillColor: '#ccc', 
highlightLineColor: 'orange', highlightSpotColor: 'orange'"
-    callback_fnc <- JS(paste0("function (oSettings, json) {
+    callback_fnc <- DT::JS(paste0("function (oSettings, json) {
       $('.sparkSeries:not(:has(canvas))').sparkline('html', { ", line_string, 
" });
       $('a[id^=mm_kpi_]').click(function(){
       var target = $(this).attr('id').replace('mm_', '');
       $('a[data-value=\"'+target+'\"]').click();});
       $('a[id^=mm_kpi_]').hover(function() 
{$(this).css('cursor','pointer');});\n}"), collapse = "")
-    mm_dt <- datatable(data.table(temp[, cols_to_keep]), rownames = FALSE,
-      options = list(searching = F, paging = F, info = F, ordering = F,
-                     columnDefs = column_def, fnDrawCallback = callback_fnc), 
escape=F)
+    mm_dt <- DT::datatable(
+      temp[, cols_to_keep], rownames = FALSE,
+      options = list(
+        searching = FALSE, paging = FALSE, info = FALSE, ordering = FALSE,
+        columnDefs = column_def, fnDrawCallback = callback_fnc
+      ),
+      escape = FALSE
+    )
     mm_dt$dependencies <- append(mm_dt$dependencies, 
htmlwidgets:::getDependency("sparkline"))
-    mm_dt
-    }
-  )
+    return(mm_dt)
+  })
 
   # Check datasets for missing data and notify user which datasets are missing 
data (if any)
   output$message_menu <- renderMenu({
diff --git a/utils.R b/utils.R
index adb0711..98fdb4e 100644
--- a/utils.R
+++ b/utils.R
@@ -1,155 +1,159 @@
-#Dependent libs
-library(reshape2)
-library(ggplot2)
-library(toOrdinal)
 library(magrittr)
-library(polloi)
-library(xts)
-library(tidyr)
 
 ## Read in desktop data and generate means for the value boxes, along with a 
time-series appropriate form for
 ## dygraphs.
 read_desktop <- function() {
-  data <- polloi::read_dataset("search/desktop_event_counts.tsv", col_types = 
"Dci")
-  names(data)[1] <- 'date' # Will be unnecessary after 
https://gerrit.wikimedia.org/r/#/c/250856/
-  interim <- reshape2::dcast(data, formula = date ~ action, fun.aggregate = 
sum)
-  interim[is.na(interim)] <- 0
-  desktop_dygraph_set <<- interim
-  desktop_dygraph_means <<- round(colMeans(desktop_dygraph_set[,2:5]))
-  interim <- polloi::read_dataset("search/desktop_load_times.tsv", col_types = 
"Dddd")
-  names(interim)[1] <- 'date' # Will be unnecessary after 
https://gerrit.wikimedia.org/r/#/c/250856/
-  desktop_load_data <<- interim
+  desktop_dygraph_set <<- 
polloi::read_dataset("discovery/search/desktop_event_counts.tsv", col_types = 
"Dci") %>%
+    dplyr::filter(!is.na(action), !is.na(events)) %>%
+    tidyr::spread(action, events, fill = 0)
+  desktop_dygraph_means <<- round(colMeans(desktop_dygraph_set[, 2:5]))
+  desktop_load_data <<- 
polloi::read_dataset("discovery/search/desktop_load_times.tsv", col_types = 
"Dddd") %>%
+    dplyr::filter(!is.na(Median))
 }
 
 read_web <- function() {
-  data <- polloi::read_dataset("search/mobile_event_counts.tsv", col_types = 
"Dci")
-  names(data)[1] <- 'date' # Will be unnecessary after 
https://gerrit.wikimedia.org/r/#/c/250856/
-  interim <- reshape2::dcast(data, formula = date ~ action, fun.aggregate = 
sum)
-  interim[is.na(interim)] <- 0
-  mobile_dygraph_set <<- interim
-  mobile_dygraph_means <<- round(colMeans(mobile_dygraph_set[,2:4]))
-  interim <- polloi::read_dataset("search/mobile_load_times.tsv", col_types = 
"Dddd")
-  names(interim)[1] <- 'date' # Will be unnecessary after 
https://gerrit.wikimedia.org/r/#/c/250856/
-  mobile_load_data <<- interim
+  mobile_dygraph_set <<- 
polloi::read_dataset("discovery/search/mobile_event_counts.tsv", col_types = 
"Dci") %>%
+    dplyr::filter(!is.na(action), !is.na(events)) %>%
+    tidyr::spread(action, events, fill = 0)
+  mobile_dygraph_means <<- round(colMeans(mobile_dygraph_set[, 2:4]))
+  mobile_load_data <<- 
polloi::read_dataset("discovery/search/mobile_load_times.tsv", col_types = 
"Dddd") %>%
+    dplyr::filter(!is.na(Median))
 }
 
 read_apps <- function() {
+  data <- polloi::read_dataset("discovery/search/app_event_counts.tsv", 
col_types = "Dcci") %>%
+    dplyr::filter(!is.na(action), !is.na(events)) %>%
+    dplyr::distinct(date, platform, action, .keep_all = TRUE)
+  ios <- data %>%
+    dplyr::filter(platform == "iOS") %>%
+    dplyr::select(-platform) %>%
+    tidyr::spread(action, events, fill = 0)
+  android <- data %>%
+    dplyr::filter(platform == "Android") %>%
+    dplyr::select(-platform) %>%
+    tidyr::spread(action, events, fill = 0)
 
-  data <- polloi::read_dataset("search/app_event_counts.tsv", col_types = 
"Dcci")
-  names(data)[1] <- 'date' # Will be unnecessary after 
https://gerrit.wikimedia.org/r/#/c/250856/
-  ios <- reshape2::dcast(data[data$platform == "iOS",], formula = date ~ 
action, fun.aggregate = sum)
-  android <- reshape2::dcast(data[data$platform == "Android",], formula = date 
~ action, fun.aggregate = sum)
   ios_dygraph_set <<- ios
-  ios_dygraph_means <<- round(colMeans(ios[,2:4]))
+  ios_dygraph_means <<- round(colMeans(ios[, 2:4]))
 
   android_dygraph_set <<- android
-  android_dygraph_means <<- round(colMeans(android[,2:4]))
+  android_dygraph_means <<- round(colMeans(android[, 2:4]))
 
-  app_load_data <- polloi::read_dataset("search/app_load_times.tsv", col_types 
= "Dcddd")
-  names(app_load_data)[1] <- 'date' # Will be unnecessary after 
https://gerrit.wikimedia.org/r/#/c/250856/
+  app_load_data <- polloi::read_dataset("discovery/search/app_load_times.tsv", 
col_types = "Dcddd") %>%
+    dplyr::filter(!is.na(Median)) %>%
+    dplyr::distinct(date, platform, .keep_all = TRUE)
   ios_load_data <<- app_load_data[app_load_data$platform == "iOS", 
names(app_load_data) != "platform"]
   android_load_data <<- app_load_data[app_load_data$platform == "Android", 
names(app_load_data) != "platform"]
 
-  position_interim <- polloi::read_dataset("search/click_position_counts.tsv", 
col_types = "Dci") %>%
+  position_interim <- 
polloi::read_dataset("discovery/search/click_position_counts.tsv", col_types = 
"Dci") %>%
+    dplyr::filter(!is.na(click_position), !is.na(events)) %>%
+    dplyr::distinct(date, click_position, .keep_all = TRUE) %>%
     dplyr::group_by(date) %>%
     dplyr::mutate(prop = round(events/sum(events)*100, 2)) %>%
     dplyr::ungroup() %>%
     dplyr::select(-events) %>%
-    reshape2::dcast(formula = date ~ click_position, fun.aggregate = sum)
-  position_interim <- position_interim[,c("date", "1", "2", "3", "4", "5", 
"6", "7", "8", "9", "10-19", "20-100", "100+")]
+    tidyr::spread(click_position, prop, fill = 0)
+  position_interim <- position_interim[, c("date", "1", "2", "3", "4", "5", 
"6", "7", "8", "9", "10-19", "20-100", "100+")]
   names(position_interim) <- c("date", "1st", "2nd", "3rd", "4th", "5th", 
"6th", "7th", "8th", "9th", "10th-19th", "20th-100th", "101st+")
   position_prop <<- position_interim
-  source_prop <<- polloi::read_dataset("search/invoke_source_counts.tsv", 
col_types = "Dci") %>%
+  source_prop <<- 
polloi::read_dataset("discovery/search/invoke_source_counts.tsv", col_types = 
"Dci") %>%
+    dplyr::filter(!is.na(invoke_source), !is.na(events)) %>%
+    dplyr::distinct(date, invoke_source, .keep_all = TRUE) %>%
     dplyr::group_by(date) %>%
     dplyr::mutate(prop = round(events/sum(events)*100, 2)) %>%
     dplyr::ungroup() %>%
     dplyr::select(-events) %>%
-    reshape2::dcast(formula = date ~ invoke_source, fun.aggregate = sum)
+    tidyr::spread(invoke_source, prop, fill = 0)
 }
 
 read_api <- function(){
-  data <- polloi::read_dataset("search/search_api_aggregates.tsv", col_types = 
"cci")
-  names(data)[1] <- 'date' # Will be unnecessary after 
https://gerrit.wikimedia.org/r/#/c/250856/
-  data$date <- as.Date(data$date)
-  data <- data[order(data$event_type), ]
-  split_dataset <<- split(data, f = data$event_type)
+  split_dataset <<- 
polloi::read_dataset("discovery/search/search_api_usage.tsv", col_types = 
"Dci") %>%
+    dplyr::filter(!is.na(api), !is.na(calls)) %>%
+    dplyr::distinct(date, api, .keep_all = TRUE) %>%
+    dplyr::arrange(api, date) %>%
+    { split(., f = .$api) } %>%
+    lapply(dplyr::select_, .dots = list(quote(-api)))
 }
 
 read_failures <- function(date) {
-
-  interim <- 
polloi::read_dataset("search/cirrus_query_aggregates_with_automata.tsv", 
col_types = "Dd")
-  interim$rate <- interim$rate*100
-  failure_data_with_automata <<- interim
-
-  interim <- 
polloi::read_dataset("search/cirrus_query_aggregates_no_automata.tsv", 
col_types = "Dd")
-  interim$rate <- interim$rate*100
-  failure_data_no_automata <<- interim
-
+  ## Zero results rate
+  ### With automata
+  failure_data_with_automata <<- 
polloi::read_dataset("discovery/search/cirrus_query_aggregates_with_automata.tsv",
 col_types = "Dd") %>%
+    dplyr::filter(!is.na(rate)) %>%
+    dplyr::mutate(rate = 100 * rate)
+  ### Without automata
+  failure_data_no_automata <<- 
polloi::read_dataset("discovery/search/cirrus_query_aggregates_no_automata.tsv",
 col_types = "Dd") %>%
+    dplyr::filter(!is.na(rate)) %>%
+    dplyr::mutate(rate = 100 * rate)
+  ## Day-to-day change
+  ### With automata
   interim_new <- 
failure_data_with_automata$rate[2:nrow(failure_data_with_automata)]
   interim_old <- 
failure_data_with_automata$rate[1:(nrow(failure_data_with_automata)-1)]
-  interim <- 100 * (interim_new - interim_old)/interim_old
-
-  failure_roc_with_automata <<- data.frame(date = 
failure_data_with_automata$date[2:nrow(failure_data_with_automata)],
-                                           daily_change = interim,
-                                           stringsAsFactors = FALSE)
-
+  failure_roc_with_automata <<- data.frame(
+    date = failure_data_with_automata$date[2:nrow(failure_data_with_automata)],
+    daily_change = 100 * (interim_new - interim_old)/interim_old,
+    stringsAsFactors = FALSE
+  )
+  ### Without automata
   interim_new <- 
failure_data_no_automata$rate[2:nrow(failure_data_no_automata)]
   interim_old <- 
failure_data_no_automata$rate[1:(nrow(failure_data_no_automata)-1)]
-  interim <- 100 * (interim_new - interim_old)/interim_old
-
-  failure_roc_no_automata <<- data.frame(date = 
failure_data_no_automata$date[2:nrow(failure_data_no_automata)],
-                                         daily_change = interim,
-                                         stringsAsFactors = FALSE)
-
-  interim_breakdown_with_automata <- 
polloi::read_dataset("search/cirrus_query_breakdowns_with_automata.tsv", 
col_types = "Dcd")
-  interim_breakdown_with_automata$rate <- 
interim_breakdown_with_automata$rate*100
-  interim_breakdown_with_automata$query_type <- 
as.character(factor(interim_breakdown_with_automata$query_type,
-    levels = c("Full-Text Search", "Prefix Search", "full_text", "prefix", 
"comp_suggest", "more_like", "regex", "GeoData_spatial_search"),
-    labels = c("Full-Text Search", "Prefix Search", "Full-Text", "Prefix", 
"Completion Suggester", "More Like", "Regex", "Geospatial")))
-  failure_breakdown_with_automata <<- 
reshape2::dcast(interim_breakdown_with_automata,
-                                                      formula = date ~ 
query_type, fun.aggregate = sum,
-                                                      fill = as.double(NA))
-
-  interim_breakdown_no_automata <- 
polloi::read_dataset("search/cirrus_query_breakdowns_no_automata.tsv", 
col_types = "Dcd")
-  interim_breakdown_no_automata$rate <- interim_breakdown_no_automata$rate*100
-  interim_breakdown_no_automata$query_type <- 
as.character(factor(interim_breakdown_no_automata$query_type,
-    levels = c("Full-Text Search", "Prefix Search", "full_text", "prefix", 
"comp_suggest", "more_like", "regex", "GeoData_spatial_search"),
-    labels = c("Full-Text Search", "Prefix Search", "Full-Text", "Prefix", 
"Completion Suggester", "More Like", "Regex", "Geospatial")))
-  failure_breakdown_no_automata <<- 
reshape2::dcast(interim_breakdown_no_automata,
-                                                    formula = date ~ 
query_type, fun.aggregate = sum,
-                                                    fill = as.double(NA))
-
-  # Fix to make the suggestion dataset compatible with ZRR data format switch:
-  
interim_breakdown_with_automata$query_type[interim_breakdown_with_automata$query_type
 == "Full-Text"] <- "Full-Text Search"
-  
interim_breakdown_no_automata$query_type[interim_breakdown_no_automata$query_type
 == "Full-Text"] <- "Full-Text Search"
-  # Correction for 31 January 2016 when "Full Text" appears twice (once as 
"Full-Text Search" and once as "Full-Text"):
-  interim_breakdown_with_automata <- 
interim_breakdown_with_automata[!duplicated(interim_breakdown_with_automata[, 
c('date', 'query_type')]), ]
-  interim_breakdown_no_automata <- 
interim_breakdown_no_automata[!duplicated(interim_breakdown_no_automata[, 
c('date', 'query_type')]), ]
-
-  interim <- 
polloi::read_dataset("search/cirrus_suggestion_breakdown_with_automata.tsv", 
col_types = "Dd")
-  interim$rate <- interim$rate*100
-  interim$query_type <- "Full-Text with Suggestions"
-  interim <- rbind(interim[,c("date", "query_type", "rate")],
-                   
interim_breakdown_with_automata[interim_breakdown_with_automata$date %in% 
interim$date
-                                                   & 
interim_breakdown_with_automata$query_type == "Full-Text Search",])
-  suggestion_with_automata <<- reshape2::dcast(interim, formula = date ~ 
query_type, fun.aggregate = sum,
-                                               fill = as.double(NA))
-
-  interim <- 
polloi::read_dataset("search/cirrus_suggestion_breakdown_no_automata.tsv", 
col_types = "Dd")
-  interim$rate <- interim$rate*100
-  interim$query_type <- "Full-Text with Suggestions"
-  interim <- rbind(interim[,c("date", "query_type", "rate")],
-                   
interim_breakdown_no_automata[interim_breakdown_no_automata$date %in% 
interim$date
-                                                 & 
interim_breakdown_no_automata$query_type == "Full-Text Search",])
-  suggestion_no_automata <<- reshape2::dcast(interim, formula = date ~ 
query_type, fun.aggregate = sum,
-                                             fill = as.double(NA))
-
-  interim <- 
polloi::read_dataset("search/cirrus_langproj_breakdown_with_automata.tsv", na = 
"~", col_types = "Dccii")
-  interim$language %<>% sub("NA", "(None)", .)
-  langproj_with_automata <<- interim
-  interim <- 
polloi::read_dataset("search/cirrus_langproj_breakdown_no_automata.tsv", na = 
"~", col_types = "Dccii")
-  interim$language %<>% sub("NA", "(None)", .)
-  langproj_no_automata <<- interim
+  failure_roc_no_automata <<- data.frame(
+    date = failure_data_no_automata$date[2:nrow(failure_data_no_automata)],
+    daily_change = 100 * (interim_new - interim_old)/interim_old,
+    stringsAsFactors = FALSE
+  )
+  ## ZRR by type
+  ### With automata
+  failure_breakdown_with_automata <<- 
polloi::read_dataset("discovery/search/cirrus_query_breakdowns_with_automata.tsv",
 col_types = "Dcd") %>%
+    dplyr::filter(!is.na(query_type), !is.na(rate)) %>%
+    dplyr::mutate(
+      rate = 100 * rate,
+      query_type = as.character(factor(
+        query_type,
+        levels = c("Full-Text Search", "Prefix Search", "full_text", "prefix", 
"comp_suggest", "more_like", "regex", "GeoData_spatial_search"),
+        labels = c("Full-Text Search", "Prefix Search", "Full-Text", "Prefix", 
"Completion Suggester", "More Like", "Regex", "Geospatial")
+      )),
+      query_type = dplyr::if_else(query_type == "Full-Text", "Full-Text 
Search", query_type)
+    ) %>%
+    dplyr::distinct(date, query_type, .keep_all = TRUE) %>%
+    tidyr::spread(query_type, rate, fill = as.double(NA))
+  ### Without automata
+  failure_breakdown_no_automata <<- 
polloi::read_dataset("discovery/search/cirrus_query_breakdowns_no_automata.tsv",
 col_types = "Dcd") %>%
+    dplyr::filter(!is.na(query_type), !is.na(rate)) %>%
+    dplyr::mutate(
+      rate = 100 * rate,
+      query_type = as.character(factor(
+        query_type,
+        levels = c("Full-Text Search", "Prefix Search", "full_text", "prefix", 
"comp_suggest", "more_like", "regex", "GeoData_spatial_search"),
+        labels = c("Full-Text Search", "Prefix Search", "Full-Text", "Prefix", 
"Completion Suggester", "More Like", "Regex", "Geospatial")
+      )),
+      query_type = dplyr::if_else(query_type == "Full-Text", "Full-Text 
Search", query_type)
+    ) %>%
+    dplyr::distinct(date, query_type, .keep_all = TRUE) %>%
+    tidyr::spread(query_type, rate, fill = as.double(NA))
+  ## ZRR with suggestions
+  ### With automata
+  suggestion_with_automata <<- 
polloi::read_dataset("discovery/search/cirrus_suggestion_breakdown_with_automata.tsv",
 col_types = "Dd") %>%
+    dplyr::filter(!is.na(rate)) %>%
+    dplyr::transmute(date = date, `Full-Text with Suggestions` = 100 * rate) 
%>%
+    dplyr::full_join(dplyr::select(failure_breakdown_with_automata, c(date, 
`Full-Text Search`)), by = "date") %>%
+    dplyr::arrange(date)
+  ### Without automata
+  suggestion_no_automata <<- 
polloi::read_dataset("discovery/search/cirrus_suggestion_breakdown_no_automata.tsv",
 col_types = "Dd") %>%
+    dplyr::filter(!is.na(rate)) %>%
+    dplyr::transmute(date = date, `Full-Text with Suggestions` = 100 * rate) 
%>%
+    dplyr::full_join(dplyr::select(failure_breakdown_no_automata, c(date, 
`Full-Text Search`)), by = "date") %>%
+    dplyr::arrange(date)
+  ## Broken down by language-project pair
+  ### With automata
+  langproj_with_automata <<- 
polloi::read_dataset("discovery/search/cirrus_langproj_breakdown_with_automata.tsv",
 na = "~", col_types = "Dccii") %>%
+    dplyr::filter(!is.na(zero_results), !is.na(total)) %>%
+    dplyr::mutate(language = sub("NA", "(None)", language))
+  ### Without automata
+  langproj_no_automata <<- 
polloi::read_dataset("discovery/search/cirrus_langproj_breakdown_no_automata.tsv",
 na = "~", col_types = "Dccii") %>%
+    dplyr::filter(!is.na(zero_results), !is.na(total)) %>%
+    dplyr::mutate(language = sub("NA", "(None)", language))
+  ### Summaries for sorting
   available_languages <<- langproj_with_automata %>%
     dplyr::group_by(language) %>%
     dplyr::summarize(volume = sum(as.numeric(total))) %>%
@@ -168,27 +172,39 @@
 }
 
 read_augmented_clickthrough <- function() {
-  data <- polloi::read_dataset("search/search_threshold_pass_rate.tsv", 
col_types = "Dd")
-  temp <- polloi::safe_tail(desktop_dygraph_set, nrow(data))[, 
c('clickthroughs', 'Result pages opened')] +
-    polloi::safe_tail(mobile_dygraph_set, nrow(data))[, c('clickthroughs', 
'Result pages opened')] +
-    polloi::safe_tail(ios_dygraph_set, nrow(data))[, c('clickthroughs', 
'Result pages opened')] +
-    polloi::safe_tail(android_dygraph_set, nrow(data))[, c('clickthroughs', 
'Result pages opened')]
-  intermediary_dataset <- cbind(data, clickthrough_rate = 100 * 
temp$clickthroughs/temp$'Result pages opened')
-  colnames(intermediary_dataset) <- c("date", "threshold_passing_rate", 
"clickthrough_rate")
-  intermediary_dataset$threshold_passing_rate <- 100 * 
intermediary_dataset$threshold_passing_rate
-  augmented_clickthroughs <<- transform(intermediary_dataset, user_engagement 
= (threshold_passing_rate + clickthrough_rate)/2)
+  threshold_data <- 
polloi::read_dataset("discovery/search/search_threshold_pass_rate.tsv", 
col_types = "Dd") %>%
+    dplyr::filter(!is.na(threshold_pass)) %>%
+    dplyr::mutate(threshold_pass = 100 * threshold_pass)
+  augmented_clickthroughs <<- list(
+    desktop = dplyr::select(desktop_dygraph_set, c(date, clickthroughs, 
`Result pages opened`)),
+    mobile = dplyr::select(mobile_dygraph_set, c(date, clickthroughs, `Result 
pages opened`)),
+    ios = dplyr::select(ios_dygraph_set, c(date, clickthroughs, `Result pages 
opened`)),
+    android = dplyr::select(android_dygraph_set, c(date, clickthroughs, 
`Result pages opened`))
+  ) %>%
+    dplyr::bind_rows(.id = "platform") %>%
+    dplyr::group_by(date) %>%
+    dplyr::summarize(clickthroughs = sum(clickthroughs), serps = sum(`Result 
pages opened`)) %>%
+    dplyr::right_join(threshold_data, by = "date") %>%
+    dplyr::transmute(
+      date = date,
+      `Threshold-passing %` = threshold_pass,
+      `Clickthrough rate` = 100 * clickthroughs/serps,
+      `User engagement` = (threshold_pass + `Clickthrough rate`)/2
+    )
 }
 
 read_lethal_dose <- function() {
-  intermediary_dataset <- 
polloi::read_dataset("search/sample_page_visit_ld.tsv", col_types = "Diiiiiii")
-  colnames(intermediary_dataset) <- c("date", "10%", "25%", "50%", "75%", 
"90%", "95%", "99%")
-  user_page_visit_dataset <<- intermediary_dataset
+  user_page_visit_dataset <<- 
polloi::read_dataset("discovery/search/sample_page_visit_ld.tsv", col_types = 
"Dddddddd") %>%
+    dplyr::filter(!is.na(LD10)) %>%
+    set_colnames(c("date", "10%", "25%", "50%", "75%", "90%", "95%", "99%"))
 }
 
 read_paul_score <- function() {
-  data <- polloi::read_dataset("search/paulscore_approximations.tsv", 
col_types = "Dcddddddddd")[, c("date", "event_source", "pow_1", "pow_5", 
"pow_9")]
-  paulscore_autocomplete <<- data[data$event_source == "autocomplete", -2] %>% 
set_names(c("date", "F = 0.1", "F = 0.5", "F = 0.9"))
-  paulscore_fulltext <<- data[data$event_source == "fulltext", -2] %>% 
set_names(c("date", "F = 0.1", "F = 0.5", "F = 0.9"))
+  paulscore <- 
polloi::read_dataset("discovery/search/paulscore_approximations.tsv", col_types 
= "Dcddddddddd") %>%
+    dplyr::filter(!is.na(event_source)) %>%
+    dplyr::select(c(date, event_source, `F = 0.1` = pow_1, `F = 0.5` = pow_5, 
`F = 0.9` = pow_9))
+  paulscore_autocomplete <<- dplyr::filter(paulscore, event_source == 
"autocomplete") %>% dplyr::select(-event_source)
+  paulscore_fulltext <<- dplyr::filter(paulscore, event_source == "fulltext") 
%>% dplyr::select(-event_source)
 }
 
 aggregate_wikis <- function(data, languages, projects) {

-- 
To view, visit https://gerrit.wikimedia.org/r/335746
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id384962d485931ebcb904e491ae0bf641d38c9bf
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/rainbow
Gerrit-Branch: master
Gerrit-Owner: Bearloga <mpo...@wikimedia.org>
