[MediaWiki-commits] [Gerrit] wikimedia...golden[master]: Expand "other" countries data
Bearloga has submitted this change and it was merged. Change subject: Expand "other" countries data .. Expand "other" countries data - Traffic, clickthrough rate, number of visits, ctr_visit (proportion of visits that have at least one clickthrough), number of sessions, ctr_session (proportion of sessions that have at least one clickthrough), by all countries and US regions - Last action by all countries and US regions - Most common section clicked per visit by all countries and US regions - First visit clickthrough by all countries and US regions Also fixed two bugs in: - Most common section clicked - Generate click breakdown (last action): 1) add sort by timestamp 2) delete lines which modify raw data Bug: T138107 Change-Id: I51c8ca4222175d0ba5c1c48d187c9b0bbe8f7a3d --- M portal/portal.R 1 file changed, 85 insertions(+), 9 deletions(-) Approvals: Bearloga: Verified; Looks good to me, approved diff --git a/portal/portal.R b/portal/portal.R index bf8d3b6..5b77aed 100644 --- a/portal/portal.R +++ b/portal/portal.R @@ -46,7 +46,6 @@ dplyr::arrange(session, ts) %>% dplyr::group_by(session) %>% dplyr::mutate(visit = cumsum(type == "landing")) %>% -dplyr::filter(visit == 1) %>% dplyr::filter(type == "clickthrough") %>% dplyr::group_by(date, session, visit, section_used) %>% dplyr::tally() %>% @@ -75,19 +74,25 @@ dplyr::mutate(date = data$date[1]) %>% dplyr::select(c(date, `no action`, `primary links`, `search`, `secondary links`, `other languages`, `other projects`)) - # Generate click breakdown - data <- data[order(data$type, decreasing = FALSE), ] - data <- data[!duplicated(data$session), ] - breakdown_data <- data[, j = list(events = .N), by = c("date", "section_used")] + # Generate click breakdown (last action) + breakdown_data <- data %>% +dplyr::arrange(ts) %>% +dplyr::filter(!duplicated(session, fromLast = TRUE)) %>% +dplyr::group_by(date, section_used) %>% +dplyr::summarize(events = n()) %>% +data.table::as.data.table() # Generate by-country breakdown with regional data for US - regions <- data.frame(abb = paste0("US:", c(as.character(state.abb), "DC")), + data("ISO_3166_1", package = "ISOcodes") + us_other_abb <- c("AS", "GU", "MP", "PR", "VI") + us_other_mask <- match(us_other_abb, ISO_3166_1$Alpha_2) + regions <- data.frame(abb = c(paste0("US:", c(as.character(state.abb), "DC")), us_other_abb), # ^ need to verify that District of Columbia shows up as DC and not another abbreviation -region = paste0("U.S. (", c(as.character(state.region), "South"), ")"), -state = c(state.name, "District of Columbia"), +region = paste0("U.S. (", c(as.character(state.region), "South", rep("Other",5)), ")"), +state = c(state.name, "District of Columbia", ISO_3166_1$Name[us_other_mask]), stringsAsFactors = FALSE) regions$region[regions$region == "U.S. (North Central)"] <- "U.S. (Midwest)" - regions$region[state.division == "Pacific"] <- "U.S. (Pacific)" # see https://phabricator.wikimedia.org/T136257#2399411 + regions$region[c(state.division == "Pacific", rep(FALSE, 5))] <- "U.S. (Pacific)" # see https://phabricator.wikimedia.org/T136257#2399411 countries <- data.frame(abb = c(regions$abb, "GB", "CA", "DE", "IN", "AU", "CN", "RU", "PH", "FR"), @@ -110,6 +115,71 @@ dplyr::select(c(date, country, events)) %>% dplyr::arrange(desc(country)) + # Experimental: Generate all countries breakdown + all_countries <- data.frame(abb = c(regions$abb, ISO_3166_1$Alpha_2[-us_other_mask]), + name = c(regions$region, ISO_3166_1$Name[-us_other_mask]), + stringsAsFactors = FALSE) + data_w_countryname <- as.data.frame(data) %>% +dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, "Other")) %>% +dplyr::left_join(all_countries, by = c("country" = "abb")) %>% +dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>% +dplyr::select(-country) %>% dplyr::rename(country = name) + + ctr_visit <- data_w_countryname %>% +dplyr::arrange(session, ts) %>% +dplyr::group_by(session) %>% +dplyr::mutate(visit = cumsum(type == "landing")) %>% +dplyr::group_by(date, country, session, visit) %>% +dplyr::summarize(dummy_clt = sum(type=="clickthrough")>1) %>% +dplyr::group_by(country) %>% +dplyr::summarize(n_visit = n(), ctr_visit = sum(dummy_clt)/n()) + ctr_session <- data_w_countryname %>% +dplyr::group_by(date, country, session) %>% +dplyr::summarize(dummy_clt = sum(type=="clickthrough")>1) %>% +dplyr::group_by(country) %>% +dplyr::summarize(n_session = n(), ctr_session = sum(dummy_clt)/n()) + all_country_data <-
[MediaWiki-commits] [Gerrit] wikimedia...golden[master]: Expand "other" countries data
Chelsyx has uploaded a new change for review. https://gerrit.wikimedia.org/r/310473 Change subject: Expand "other" countries data .. Expand "other" countries data - Traffic and clickthrough rate by all countries and US regions - Last action by all countries and US regions - Most common section clicked per visit by all countries and US regions - First visit clickthrough by all countries and US regions Also fixed two bugs in: - Most common section clicked - Generate click breakdown (last action): 1) add sort by timestamp 2) delete lines which modify raw data Bug: T138107 Change-Id: I51c8ca4222175d0ba5c1c48d187c9b0bbe8f7a3d --- M portal/portal.R 1 file changed, 74 insertions(+), 5 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden refs/changes/73/310473/1 diff --git a/portal/portal.R b/portal/portal.R index bf8d3b6..475aa00 100644 --- a/portal/portal.R +++ b/portal/portal.R @@ -46,7 +46,6 @@ dplyr::arrange(session, ts) %>% dplyr::group_by(session) %>% dplyr::mutate(visit = cumsum(type == "landing")) %>% -dplyr::filter(visit == 1) %>% dplyr::filter(type == "clickthrough") %>% dplyr::group_by(date, session, visit, section_used) %>% dplyr::tally() %>% @@ -75,10 +74,13 @@ dplyr::mutate(date = data$date[1]) %>% dplyr::select(c(date, `no action`, `primary links`, `search`, `secondary links`, `other languages`, `other projects`)) - # Generate click breakdown - data <- data[order(data$type, decreasing = FALSE), ] - data <- data[!duplicated(data$session), ] - breakdown_data <- data[, j = list(events = .N), by = c("date", "section_used")] + # Generate click breakdown (last action) + breakdown_data <- data %>% +dplyr::arrange(ts) %>% + dplyr::filter(!duplicated(session, fromLast = TRUE)) %>% + dplyr::group_by(date, section_used) %>% + dplyr::summarize(events = n()) %>% + data.table::as.data.table() # Generate by-country breakdown with regional data for US regions <- data.frame(abb = paste0("US:", c(as.character(state.abb), "DC")), @@ -110,6 +112,67 @@ dplyr::select(c(date, country, events)) %>% dplyr::arrange(desc(country)) + # Experimental: Generate all countries breakdown + data("ISO_3166_1", package = "ISOcodes") + all_countries <- data.frame(abb = c(regions$abb, ISO_3166_1$Alpha_2), + name = c(regions$region, ISO_3166_1$Name), + stringsAsFactors = FALSE) + all_country_data <- as.data.frame(data) %>% +dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, "Other")) %>% +dplyr::left_join(all_countries, by = c("country" = "abb")) %>% +dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>% +dplyr::select(-country) %>% dplyr::rename(country = name) %>% +dplyr::group_by(country) %>% +dplyr::summarize(events = n(), ctr = sum(type=="clickthrough")/n()) %>% +dplyr::mutate(date = date) %>% +dplyr::select(c(date, country, events, ctr)) %>% +dplyr::arrange(desc(country)) + + # Last action by country + last_action_country <- as.data.frame(data) %>% +dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, "Other")) %>% +dplyr::left_join(all_countries, by = c("country" = "abb")) %>% +dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>% +dplyr::select(-country) %>% dplyr::rename(country = name) %>% +dplyr::arrange(ts) %>% + dplyr::filter(!duplicated(session, fromLast = TRUE)) %>% + dplyr::group_by(date, country, section_used) %>% + dplyr::summarize(events = n()) %>% + dplyr::mutate(prop = events/sum(events)) + + # Most common section clicked by country + most_common_country <- as.data.frame(data) %>% +dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, "Other")) %>% +dplyr::left_join(all_countries, by = c("country" = "abb")) %>% +dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>% +dplyr::select(-country) %>% dplyr::rename(country = name) %>% +dplyr::arrange(session, ts) %>% +dplyr::group_by(session) %>% +dplyr::mutate(visit = cumsum(type == "landing")) %>% +dplyr::filter(type == "clickthrough") %>% +dplyr::group_by(date, country, session, visit, section_used) %>% +dplyr::tally() %>% +dplyr::top_n(1, n) %>% +dplyr::ungroup() %>% +dplyr::group_by(date, country, section_used) %>% +dplyr::summarize(visits = n()) %>% + dplyr::mutate(prop = visits/sum(visits)) %>% +dplyr::ungroup() + + # First visit clickthrough rates by country + first_visits_country <- as.data.frame(data) %>% +dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, "Other")) %>% +dplyr::left_join(all_countries, by = c("country" = "abb")) %>% +dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>% +dplyr::selec