Bearloga has submitted this change and it was merged.

Change subject: Expand "other" countries data
......................................................................
Expand "other" countries data

- Traffic, clickthrough rate, number of visits, ctr_visit (proportion of visits that have at least one clickthrough), number of sessions, ctr_session (proportion of sessions that have at least one clickthrough), by all countries and US regions
- Last action by all countries and US regions
- Most common section clicked per visit by all countries and US regions
- First visit clickthrough by all countries and US regions

Also fixed two bugs in:
- Most common section clicked
- Generate click breakdown (last action):
  1) add sort by timestamp
  2) delete lines which modify raw data

Bug: T138107
Change-Id: I51c8ca4222175d0ba5c1c48d187c9b0bbe8f7a3d
---
M portal/portal.R
1 file changed, 85 insertions(+), 9 deletions(-)

Approvals:
  Bearloga: Verified; Looks good to me, approved

diff --git a/portal/portal.R b/portal/portal.R
index bf8d3b6..5b77aed 100644
--- a/portal/portal.R
+++ b/portal/portal.R
@@ -46,7 +46,6 @@
 dplyr::arrange(session, ts) %>%
 dplyr::group_by(session) %>%
 dplyr::mutate(visit = cumsum(type == "landing")) %>%
- dplyr::filter(visit == 1) %>%
 dplyr::filter(type == "clickthrough") %>%
 dplyr::group_by(date, session, visit, section_used) %>%
 dplyr::tally() %>%
@@ -75,19 +74,25 @@
 dplyr::mutate(date = data$date[1]) %>%
 dplyr::select(c(date, `no action`, `primary links`, `search`, `secondary links`, `other languages`, `other projects`))
- # Generate click breakdown
- data <- data[order(data$type, decreasing = FALSE), ]
- data <- data[!duplicated(data$session), ]
- breakdown_data <- data[, j = list(events = .N), by = c("date", "section_used")]
+ # Generate click breakdown (last action)
+ breakdown_data <- data %>%
+ dplyr::arrange(ts) %>%
+ dplyr::filter(!duplicated(session, fromLast = TRUE)) %>%
+ dplyr::group_by(date, section_used) %>%
+ dplyr::summarize(events = n()) %>%
+ data.table::as.data.table()
 # Generate by-country breakdown with regional data for US
- regions <- data.frame(abb = paste0("US:", c(as.character(state.abb), "DC")),
+
data("ISO_3166_1", package = "ISOcodes")
+ us_other_abb <- c("AS", "GU", "MP", "PR", "VI")
+ us_other_mask <- match(us_other_abb, ISO_3166_1$Alpha_2)
+ regions <- data.frame(abb = c(paste0("US:", c(as.character(state.abb), "DC")), us_other_abb),
 # ^ need to verify that District of Columbia shows up as DC and not another abbreviation
- region = paste0("U.S. (", c(as.character(state.region), "South"), ")"),
- state = c(state.name, "District of Columbia"),
+ region = paste0("U.S. (", c(as.character(state.region), "South", rep("Other",5)), ")"),
+ state = c(state.name, "District of Columbia", ISO_3166_1$Name[us_other_mask]),
 stringsAsFactors = FALSE)
 regions$region[regions$region == "U.S. (North Central)"] <- "U.S. (Midwest)"
- regions$region[state.division == "Pacific"] <- "U.S. (Pacific)" # see https://phabricator.wikimedia.org/T136257#2399411
+ regions$region[c(state.division == "Pacific", rep(FALSE, 5))] <- "U.S. (Pacific)" # see https://phabricator.wikimedia.org/T136257#2399411
 countries <- data.frame(abb = c(regions$abb, "GB", "CA", "DE", "IN", "AU", "CN", "RU", "PH", "FR"),
@@ -110,6 +115,71 @@
 dplyr::select(c(date, country, events)) %>%
 dplyr::arrange(desc(country))
+ # Experimental: Generate all countries breakdown
+ all_countries <- data.frame(abb = c(regions$abb, ISO_3166_1$Alpha_2[-us_other_mask]),
+ name = c(regions$region, ISO_3166_1$Name[-us_other_mask]),
+ stringsAsFactors = FALSE)
+ data_w_countryname <- as.data.frame(data) %>%
+ dplyr::mutate(country = ifelse(country %in% all_countries$abb, country, "Other")) %>%
+ dplyr::left_join(all_countries, by = c("country" = "abb")) %>%
+ dplyr::mutate(name = ifelse(is.na(name), "Other", name)) %>%
+ dplyr::select(-country) %>% dplyr::rename(country = name)
+
+ ctr_visit <- data_w_countryname %>%
+ dplyr::arrange(session, ts) %>%
+ dplyr::group_by(session) %>%
+ dplyr::mutate(visit = cumsum(type == "landing")) %>%
+ dplyr::group_by(date, country, session, visit) %>%
+ dplyr::summarize(dummy_clt =
sum(type=="clickthrough")>1) %>%
+ dplyr::group_by(country) %>%
+ dplyr::summarize(n_visit = n(), ctr_visit = sum(dummy_clt)/n())
+ ctr_session <- data_w_countryname %>%
+ dplyr::group_by(date, country, session) %>%
+ dplyr::summarize(dummy_clt = sum(type=="clickthrough")>1) %>%
+ dplyr::group_by(country) %>%
+ dplyr::summarize(n_session = n(), ctr_session = sum(dummy_clt)/n())
+ all_country_data <- data_w_countryname %>%
+ dplyr::group_by(country) %>%
+ dplyr::summarize(events = n(), ctr = sum(type=="clickthrough")/n()) %>%
+ dplyr::mutate(date = date) %>%
+ dplyr::select(c(date, country, events, ctr)) %>%
+ dplyr::arrange(desc(country)) %>%
+ dplyr::left_join(ctr_visit, by="country") %>%
+ dplyr::left_join(ctr_session, by="country")
+
+ # Last action by country
+ last_action_country <- data_w_countryname %>%
+ dplyr::arrange(ts) %>%
+ dplyr::filter(!duplicated(session, fromLast = TRUE)) %>%
+ dplyr::group_by(date, section_used, country) %>%
+ dplyr::summarize(events = n()) %>%
+ dplyr::mutate(prop = events/sum(events))
+
+ # Most common section clicked by country
+ most_common_country <- data_w_countryname %>%
+ dplyr::arrange(session, ts) %>%
+ dplyr::group_by(session) %>%
+ dplyr::mutate(visit = cumsum(type == "landing")) %>%
+ dplyr::filter(type == "clickthrough") %>%
+ dplyr::group_by(date, country, session, visit, section_used) %>%
+ dplyr::tally() %>%
+ dplyr::top_n(1, n) %>%
+ dplyr::ungroup() %>%
+ dplyr::group_by(date, section_used, country) %>%
+ dplyr::summarize(visits = n()) %>%
+ dplyr::mutate(prop = visits/sum(visits)) %>%
+ dplyr::ungroup()
+
+ # First visit clickthrough rates by country
+ first_visits_country <- data_w_countryname %>%
+ dplyr::arrange(session, ts) %>%
+ dplyr::group_by(session) %>%
+ dplyr::mutate(visit = cumsum(type == "landing")) %>%
+ dplyr::filter(visit == 1) %>%
+ dplyr::group_by(date, section_used, country) %>%
+ dplyr::summarize(sessions = n()) %>%
+ dplyr::mutate(proportion = sessions/sum(sessions))
+
 # Get user agent data
wmf::set_proxies() # To allow for the latest YAML to be retrieved.
 uaparser::update_regexes()
@@ -128,5 +198,11 @@
 wmf::write_conditional(country_data, file.path(base_path, "country_data.tsv"))
 wmf::write_conditional(ua_data, file.path(base_path, "user_agent_data.tsv"))
+ days_to_keep <- 60
+ wmf::rewrite_conditional(all_country_data, file.path(base_path, "all_country_data.tsv"), days_to_keep)
+ wmf::rewrite_conditional(last_action_country, file.path(base_path, "last_action_country.tsv"), days_to_keep)
+ wmf::rewrite_conditional(most_common_country, file.path(base_path, "most_common_country.tsv"), days_to_keep)
+ wmf::rewrite_conditional(first_visits_country, file.path(base_path, "first_visits_country.tsv"), days_to_keep)
+
 return(invisible())
 }

--
To view, visit https://gerrit.wikimedia.org/r/310473
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I51c8ca4222175d0ba5c1c48d187c9b0bbe8f7a3d
Gerrit-PatchSet: 3
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: Chelsyx <c...@wikimedia.org>
Gerrit-Reviewer: Bearloga <mpo...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits