[MediaWiki-commits] [Gerrit] wikimedia...golden[master]: Expand "other" countries data

2016-09-14 Thread Bearloga (Code Review)
Bearloga has submitted this change and it was merged.

Change subject: Expand "other" countries data
..


Expand "other" countries data

- Traffic, clickthrough rate, number of visits, ctr_visit (proportion of visits 
that have at least one clickthrough), number of sessions, ctr_session 
(proportion of sessions that have at least one clickthrough), by all countries 
and US regions
- Last action by all countries and US regions
- Most common section clicked per visit by all countries and US regions
- First visit clickthrough by all countries and US regions

Also fixed two bugs in:
- Most common section clicked
- Generate click breakdown (last action): 1) add sort by timestamp 2) delete 
lines which modify raw data

Bug: T138107
Change-Id: I51c8ca4222175d0ba5c1c48d187c9b0bbe8f7a3d
---
M portal/portal.R
1 file changed, 85 insertions(+), 9 deletions(-)

Approvals:
  Bearloga: Verified; Looks good to me, approved



diff --git a/portal/portal.R b/portal/portal.R
index bf8d3b6..5b77aed 100644
--- a/portal/portal.R
+++ b/portal/portal.R
@@ -46,7 +46,6 @@
 dplyr::arrange(session, ts) %>%
 dplyr::group_by(session) %>%
 dplyr::mutate(visit = cumsum(type == "landing")) %>%
-dplyr::filter(visit == 1) %>%
 dplyr::filter(type == "clickthrough") %>%
 dplyr::group_by(date, session, visit, section_used) %>%
 dplyr::tally() %>%
@@ -75,19 +74,25 @@
 dplyr::mutate(date = data$date[1]) %>%
 dplyr::select(c(date, `no action`, `primary links`, `search`, `secondary 
links`, `other languages`, `other projects`))
   
-  # Generate click breakdown
-  data <- data[order(data$type, decreasing = FALSE), ]
-  data <- data[!duplicated(data$session), ]
-  breakdown_data <- data[, j = list(events = .N), by = c("date", 
"section_used")]
+  # Generate click breakdown (last action)
+  breakdown_data <- data %>%
+dplyr::arrange(ts) %>%
+dplyr::filter(!duplicated(session, fromLast = TRUE)) %>%
+dplyr::group_by(date, section_used) %>%
+dplyr::summarize(events = n()) %>%
+data.table::as.data.table()
   
   # Generate by-country breakdown with regional data for US
-  regions <- data.frame(abb = paste0("US:", c(as.character(state.abb), "DC")),
+  data("ISO_3166_1", package = "ISOcodes")
+  us_other_abb <- c("AS", "GU", "MP", "PR", "VI")
+  us_other_mask <- match(us_other_abb, ISO_3166_1$Alpha_2)
+  regions <- data.frame(abb = c(paste0("US:", c(as.character(state.abb), 
"DC")), us_other_abb),
 # ^ need to verify that District of Columbia shows up 
as DC and not another abbreviation
-region = paste0("U.S. (", 
c(as.character(state.region), "South"), ")"),
-state = c(state.name, "District of Columbia"),
+region = paste0("U.S. (", 
c(as.character(state.region), "South", rep("Other",5)), ")"),
+state = c(state.name, "District of Columbia", 
ISO_3166_1$Name[us_other_mask]),
 stringsAsFactors = FALSE)
   regions$region[regions$region == "U.S. (North Central)"] <- "U.S. (Midwest)"
-  regions$region[state.division == "Pacific"] <- "U.S. (Pacific)" # see 
https://phabricator.wikimedia.org/T136257#2399411
+  regions$region[c(state.division == "Pacific", rep(FALSE, 5))] <- "U.S. 
(Pacific)" # see https://phabricator.wikimedia.org/T136257#2399411
   countries <- data.frame(abb = c(regions$abb, "GB", "CA",
   "DE", "IN", "AU", "CN",
   "RU", "PH", "FR"),
@@ -110,6 +115,71 @@
 dplyr::select(c(date, country, events)) %>%
 dplyr::arrange(desc(country))
 
+  # Experimental: Generate all countries breakdown
+  all_countries <- data.frame(abb = c(regions$abb, 
ISO_3166_1$Alpha_2[-us_other_mask]),
+  name = c(regions$region, 
ISO_3166_1$Name[-us_other_mask]),
+  stringsAsFactors = FALSE)
+  data_w_countryname <- as.data.frame(data) %>%
+dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, 
"Other")) %>%
+dplyr::left_join(all_countries, by = c("country" = "abb")) %>%
+dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>%
+dplyr::select(-country) %>% dplyr::rename(country = name)
+
+  ctr_visit <- data_w_countryname %>%
+dplyr::arrange(session, ts) %>%
+dplyr::group_by(session) %>%
+dplyr::mutate(visit = cumsum(type == "landing")) %>%
+dplyr::group_by(date, country, session, visit) %>%
+dplyr::summarize(dummy_clt = sum(type=="clickthrough")>1) %>%
+dplyr::group_by(country) %>%
+dplyr::summarize(n_visit = n(), ctr_visit = sum(dummy_clt)/n())
+  ctr_session <- data_w_countryname %>%
+dplyr::group_by(date, country, session) %>%
+dplyr::summarize(dummy_clt = sum(type=="clickthrough")>1) %>%
+dplyr::group_by(country) %>%
+dplyr::summarize(n_session = n(), ctr_session = sum(dummy_clt)/n()) 
+  all_country_data <-

[MediaWiki-commits] [Gerrit] wikimedia...golden[master]: Expand "other" countries data

2016-09-13 Thread Chelsyx (Code Review)
Chelsyx has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/310473

Change subject: Expand "other" countries data
..

Expand "other" countries data

- Traffic and clickthrough rate by all countries and US regions
- Last action by all countries and US regions
- Most common section clicked per visit by all countries and US regions
- First visit clickthrough by all countries and US regions

Also fixed two bugs in:
- Most common section clicked
- Generate click breakdown (last action): 1) add sort by timestamp 2) delete 
lines which modify raw data

Bug: T138107
Change-Id: I51c8ca4222175d0ba5c1c48d187c9b0bbe8f7a3d
---
M portal/portal.R
1 file changed, 74 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden 
refs/changes/73/310473/1

diff --git a/portal/portal.R b/portal/portal.R
index bf8d3b6..475aa00 100644
--- a/portal/portal.R
+++ b/portal/portal.R
@@ -46,7 +46,6 @@
 dplyr::arrange(session, ts) %>%
 dplyr::group_by(session) %>%
 dplyr::mutate(visit = cumsum(type == "landing")) %>%
-dplyr::filter(visit == 1) %>%
 dplyr::filter(type == "clickthrough") %>%
 dplyr::group_by(date, session, visit, section_used) %>%
 dplyr::tally() %>%
@@ -75,10 +74,13 @@
 dplyr::mutate(date = data$date[1]) %>%
 dplyr::select(c(date, `no action`, `primary links`, `search`, `secondary 
links`, `other languages`, `other projects`))
   
-  # Generate click breakdown
-  data <- data[order(data$type, decreasing = FALSE), ]
-  data <- data[!duplicated(data$session), ]
-  breakdown_data <- data[, j = list(events = .N), by = c("date", 
"section_used")]
+  # Generate click breakdown (last action)
+  breakdown_data <- data %>%
+dplyr::arrange(ts) %>%
+   dplyr::filter(!duplicated(session, fromLast = TRUE)) %>%
+   dplyr::group_by(date, section_used) %>%
+   dplyr::summarize(events = n()) %>%
+   data.table::as.data.table()
   
   # Generate by-country breakdown with regional data for US
   regions <- data.frame(abb = paste0("US:", c(as.character(state.abb), "DC")),
@@ -110,6 +112,67 @@
 dplyr::select(c(date, country, events)) %>%
 dplyr::arrange(desc(country))
 
+  # Experimental: Generate all countries breakdown
+  data("ISO_3166_1", package = "ISOcodes")
+  all_countries <- data.frame(abb = c(regions$abb, ISO_3166_1$Alpha_2),
+  name = c(regions$region, ISO_3166_1$Name),
+  stringsAsFactors = FALSE)
+  all_country_data <- as.data.frame(data) %>%
+dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, 
"Other")) %>%
+dplyr::left_join(all_countries, by = c("country" = "abb")) %>%
+dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>%
+dplyr::select(-country) %>% dplyr::rename(country = name) %>%
+dplyr::group_by(country) %>%
+dplyr::summarize(events = n(), ctr = sum(type=="clickthrough")/n()) %>%
+dplyr::mutate(date = date) %>%
+dplyr::select(c(date, country, events, ctr)) %>%
+dplyr::arrange(desc(country))   
+  
+  # Last action by country
+  last_action_country <- as.data.frame(data) %>%
+dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, 
"Other")) %>%
+dplyr::left_join(all_countries, by = c("country" = "abb")) %>%
+dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>%
+dplyr::select(-country) %>% dplyr::rename(country = name) %>%
+dplyr::arrange(ts) %>%
+   dplyr::filter(!duplicated(session, fromLast = TRUE)) %>%
+   dplyr::group_by(date, country, section_used) %>%
+   dplyr::summarize(events = n()) %>%
+   dplyr::mutate(prop = events/sum(events))
+
+  # Most common section clicked by country
+  most_common_country <- as.data.frame(data) %>%
+dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, 
"Other")) %>%
+dplyr::left_join(all_countries, by = c("country" = "abb")) %>%
+dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>%
+dplyr::select(-country) %>% dplyr::rename(country = name) %>%
+dplyr::arrange(session, ts) %>%
+dplyr::group_by(session) %>%
+dplyr::mutate(visit = cumsum(type == "landing")) %>%
+dplyr::filter(type == "clickthrough") %>%
+dplyr::group_by(date, country, session, visit, section_used) %>%
+dplyr::tally() %>%
+dplyr::top_n(1, n) %>%
+dplyr::ungroup() %>%
+dplyr::group_by(date, country, section_used) %>%
+dplyr::summarize(visits = n()) %>%
+   dplyr::mutate(prop = visits/sum(visits)) %>%
+dplyr::ungroup()
+
+  # First visit clickthrough rates by country
+  first_visits_country <- as.data.frame(data) %>%
+dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, 
"Other")) %>%
+dplyr::left_join(all_countries, by = c("country" = "abb")) %>%
+dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>%
+dplyr::selec