Bearloga has submitted this change and it was merged.

Change subject: Update to use new format
......................................................................


Update to use new format

In I05984ad713ed18554afaa98dcbbe3dfade766fda, we updated the data
  collection scripts to employ a new version of the UDFs, which
  resulted in a new data format for the referer traffic data. This
  patch updates the Traffic dashboard to be compatible with the new
  data format, which breaks pageviews (PVs) down into externally and
  internally referred ones.

Bug: T130083, T129137
Change-Id: Ia5c3bf012acbda4120e12c4df14ed167ff3dd4f3
---
M server.R
M tab_documentation/traffic_summary.md
M utils.R
3 files changed, 23 insertions(+), 16 deletions(-)

Approvals:
  Bearloga: Verified; Looks good to me, approved



diff --git a/server.R b/server.R
index bd18fbe..09cf7d4 100644
--- a/server.R
+++ b/server.R
@@ -19,8 +19,10 @@
       polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, input$smoothing_traffic_summary)) 
%>%
       
polloi::subset_by_date_range(time_frame_range(input$traffic_summary_timeframe, 
input$traffic_summary_timeframe_daterange)) %>%
       polloi::make_dygraph(xlab = "Date", ylab = "Pageviews",
-                           title = "Pageviews from external search engines") 
%>%
-      dyLegend(labelsDiv = "traffic_summary_legend", show = "always")
+                           title = "Sources of page views (e.g. search engines 
and internal referers)") %>%
+      dyLegend(labelsDiv = "traffic_summary_legend", show = "always") %>%
+      dyAnnotation(x = as.Date("2016-03-07"), text = "A",
+                   tooltip = "Switched to a new UDF")
   })
   
   output$traffic_bysearch_dygraph <- renderDygraph({
diff --git a/tab_documentation/traffic_summary.md 
b/tab_documentation/traffic_summary.md
index 3954f81..ffbad33 100644
--- a/tab_documentation/traffic_summary.md
+++ b/tab_documentation/traffic_summary.md
@@ -9,9 +9,9 @@
 General trends
 ------
 
-Outages and inaccuracies
+Outages and notes
 ------
-None so far!
+- **A**: We switched to a finalized version of the UDF that extracts internal 
traffic (see [T130083](https://phabricator.wikimedia.org/T130083))
 
 Questions, bug reports, and feature suggestions
 ------
diff --git a/utils.R b/utils.R
index 3a801de..f4c0bce 100644
--- a/utils.R
+++ b/utils.R
@@ -1,38 +1,43 @@
 library(polloi)
 library(data.table)
-library(dplyr)
 
 # Read in the traffic data
 read_traffic <- function() {
   
   # Read in the initial data.
-  data <- polloi::read_dataset(path = "external_traffic/referer_data.tsv") %>%
-    dplyr::rename(date = timestamp) %>%
-    as.data.table
+  data <- polloi::read_dataset(path = "external_traffic/referer_data.tsv")
   
   # Deduplicate
-  data <- data[!duplicated(data[,1:(ncol(data) - 1), with=FALSE], fromLast = 
TRUE)]
+  # data <- data[!duplicated(data[,1:(ncol(data) - 1), with=FALSE], fromLast = 
TRUE)]
+  # Not sure what happened between 2016-02-04 and 2016-03-06 that caused the 
pageviews to
+  # come out split.
   
   # Format
   data$is_search <- ifelse(data$is_search, "Referred by search", "Not referred 
by search")
-  data$search_engine[data$search_engine %in% c("none","None")] <- "Not 
referred by search"
+  data$search_engine[data$search_engine == "none"] <- "Not referred by search"
+  data$referer_class[data$referer_class == "none"] <- "none (direct)"
+  data$referer_class[data$referer_class == "external (search engine)"] <- 
"search engine"
+  data$referer_class[data$referer_class == "external"] <- "external but not 
search engine"
+  data <- as.data.table(data)
   
   # Write out the overall values for traffic
   holding <- data[, j = list(pageviews = sum(pageviews)),
-                  by = c("date", "is_search", "access_method")]
+                  by = c("date", "referer_class", "access_method")]
   holding <- split(holding, f = holding$access_method)
-  holding$all <- data[,j = list(pageviews = sum(pageviews)),
-                      by = c("date", "is_search")]
+  holding$total <- data[,j = list(pageviews = sum(pageviews)),
+                        by = c("date", "referer_class")]
   names(holding) <- c("Desktop", "Mobile Web", "All")
   summary_traffic_data <<- lapply(holding, function(x){
-    return(reshape2::dcast(x, formula = date ~ is_search, fun.aggregate = sum))
+    return(reshape2::dcast(x, formula = date ~ referer_class, fun.aggregate = 
sum))
   })
   
   # Generate per-engine values
-  holding <- data[, j = list(pageviews = sum(pageviews)),
+  holding <- data[which(data$referer_class == "search engine"),
+                  j = list(pageviews = sum(pageviews)),
                   by = c("date", "search_engine", "access_method")]
   holding <- split(holding, f = holding$access_method)
-  holding$all <- data[, j = list(pageviews = sum(pageviews)),
+  holding$all <- data[which(data$referer_class == "search engine"),
+                      j = list(pageviews = sum(pageviews)),
                       by = c("date", "search_engine")]
   names(holding) <- c("Desktop", "Mobile Web", "All")
   bysearch_traffic_data <<- lapply(holding, function(x){

-- 
To view, visit https://gerrit.wikimedia.org/r/281488
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ia5c3bf012acbda4120e12c4df14ed167ff3dd4f3
Gerrit-PatchSet: 3
Gerrit-Project: wikimedia/discovery/wonderbolt
Gerrit-Branch: master
Gerrit-Owner: Bearloga <[email protected]>
Gerrit-Reviewer: Bearloga <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to