Chelsyx has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/341743 )
Change subject: Annotate Reportupdater migration on graphs ...................................................................... Annotate Reportupdater migration on graphs Bug: T150915 Change-Id: Idd8b46e61db9e33788d2be63564c3dc40334dc5f --- M server.R M tab_documentation/action_breakdown.md M tab_documentation/applinks.md M tab_documentation/clickthrough_rate.md M tab_documentation/dwelltime.md M tab_documentation/first_visit.md M tab_documentation/geography.md M tab_documentation/languages_summary.md M tab_documentation/languages_visited.md M tab_documentation/most_common.md M tab_documentation/pageviews.md M tab_documentation/referers_byengine.md M tab_documentation/referers_summary.md M tab_documentation/sisproj.md M utils.R 15 files changed, 95 insertions(+), 63 deletions(-) Approvals: Chelsyx: Verified; Looks good to me, approved diff --git a/server.R b/server.R index 9da5d5a..2d82d55 100644 --- a/server.R +++ b/server.R @@ -51,7 +51,8 @@ dyEvent(as.Date("2016-05-18"), "Sister Links Updated", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-02"), "Detect Language Deployed", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$action_breakdown_dygraph <- renderDygraph({ @@ -68,7 +69,8 @@ dyEvent(as.Date("2016-05-18"), "Sister Links Updated", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-02"), "Detect Language Deployed", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = 
"white") + dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$most_common_dygraph <- renderDygraph({ @@ -83,7 +85,8 @@ dyEvent(as.Date("2016-05-18"), "Sister Links Updated", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-02"), "Detect Language Deployed", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-09-13"), "A (schema switch)", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "A (schema switch)", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$first_visit_dygraph <- renderDygraph({ @@ -99,7 +102,8 @@ dyEvent(as.Date("2016-05-18"), "Sister Links Updated", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-02"), "Detect Language Deployed", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-09-13"), "A (schema switch)", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "A (schema switch)", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$dwelltime_dygraph <- renderDygraph({ @@ -115,7 +119,8 @@ dyEvent(as.Date("2016-05-18"), "Sister Links Updated", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-02"), "Detect Language Deployed", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "B (schema 
switch)", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$sisproj_dygraph <- renderDygraph({ @@ -137,7 +142,8 @@ polloi::smoother(smooth_level = polloi::smooth_switch(input$smoothing_global, input$smoothing_sisproj), rename = FALSE) %>% polloi::make_dygraph("Date", ifelse(input$sisproj_type == "prop", "Proportion (%)", input$sisproj_metric), paste(ifelse(input$sisproj_metric == "Clicks", "Clicks", "Users who clicked"), "on links other Wikimedia Foundation projects")) %>% - dyLegend(labelsDiv = "sisproj_legend", show = "always", width = 600) + dyLegend(labelsDiv = "sisproj_legend", show = "always", width = 600) %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") return(dy) }) @@ -187,7 +193,8 @@ tidyr::spread(group, clicks, fill = 0) %>% polloi::smoother(smooth_level = polloi::smooth_switch(input$smoothing_global, input$smoothing_applinks), rename = FALSE) %>% polloi::make_dygraph("Date", ifelse(input$applinks_type == "prop", "Proportion (%)", "Clicks"), "Clicks on Wikipedia mobile app links") %>% - dyLegend(labelsDiv = "applinks_legend", show = "always", width = 600) + dyLegend(labelsDiv = "applinks_legend", show = "always", width = 600) %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") return(dy) }) @@ -222,7 +229,8 @@ dyEvent(as.Date("2016-06-02"), "Detect Language Deployed", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-28"), "A (regional U.S.)", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = 
"white") }) output$browser_selector_container <- renderUI({ @@ -330,7 +338,8 @@ dyEvent(as.Date("2016-03-10"), "Search Box Deployed", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-05-18"), "Sister Links Updated", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-02"), "Detect Language Deployed", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$pageview_dygraph <- renderDygraph({ @@ -348,7 +357,8 @@ dyEvent(as.Date("2016-06-09"), "B (unexplainable rise)", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-22"), "C (pageview redefined)", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-07-11"), "D (split-pageviews)", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$referer_summary_dygraph <- renderDygraph({ @@ -362,7 +372,8 @@ dyLegend(labelsDiv = "referer_summary_legend", show = "always") %>% dyRangeSelector(fillColor = "", strokeColor = "", retainDateWindow = TRUE) %>% dyEvent(as.Date("2016-03-07"), "A (UDF switch)", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-05-01"), "B (search-redirect.php)", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-05-01"), "B (search-redirect.php)", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$search_engines_dygraph <- renderDygraph({ @@ -374,7 +385,8 
@@ dyAxis("x", axisLabelFormatter = polloi::custom_axis_formatter, axisLabelWidth = 70) %>% dyAxis("y", valueFormatter = 'function(x) { return x + "%"; }') %>% dyLegend(labelsDiv = "search_engines_legend", show = "always") %>% - dyRangeSelector(fillColor = "", strokeColor = "", retainDateWindow = TRUE) + dyRangeSelector(fillColor = "", strokeColor = "", retainDateWindow = TRUE) %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$s_dygraph <- renderDygraph({ @@ -434,7 +446,8 @@ dyEvent(as.Date("2016-05-18"), "Sister Links Updated", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-02"), "Detect Language Deployed", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) lv_reactive <- reactiveValues(choices = NULL, selected_langs = NULL) @@ -585,7 +598,8 @@ dyEvent(as.Date("2016-05-18"), "Sister Links Updated", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-06-02"), "Detect Language Deployed", labelLoc = "bottom", color = "white") %>% dyEvent(as.Date("2016-08-16"), "Secondary Links Collapsed", labelLoc = "bottom", color = "white") %>% - dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "B (schema switch)", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) # General Geo Breakdowns @@ -744,7 +758,8 @@ dyAxis("x", rangePad = 8) %>% dyLegend(width = 400, labelsDiv = "cntr_a_legend", show = "always", showZeroValues = FALSE) %>% dyCSS(css = "www/inverse.css") %>% - 
dyEvent(as.Date("2016-09-13"), "Schema Switch", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "Schema Switch", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$traffic_pie_pl <- renderHighchart({ if(input$traffic_select %in% c('events','ctr_all')){ @@ -971,7 +986,8 @@ dyAxis("x", rangePad = 8) %>% dyLegend(width = 400, labelsDiv = "cntr_f_legend", show = "always", showZeroValues = FALSE) %>% dyCSS(css = "www/inverse.css") %>% - dyEvent(as.Date("2016-09-13"), "Schema Switch", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "Schema Switch", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$first_visit_pie_pl <- renderHighchart({ data4pie <- first_visits_country_dt() %>% @@ -1174,7 +1190,8 @@ dyAxis("x", rangePad = 8) %>% dyLegend(width = 400, labelsDiv = "cntr_l_legend", show = "always", showZeroValues = FALSE) %>% dyCSS(css = "www/inverse.css") %>% - dyEvent(as.Date("2016-09-13"), "Schema Switch", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "Schema Switch", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$last_action_pie_pl <- renderHighchart({ data4pie <- last_action_country_dt() %>% @@ -1376,7 +1393,8 @@ dyAxis("x", rangePad = 8) %>% dyLegend(width = 400, labelsDiv = "cntr_m_legend", show = "always", showZeroValues = FALSE) %>% dyCSS(css = "www/inverse.css") %>% - dyEvent(as.Date("2016-09-13"), "Schema Switch", labelLoc = "bottom", color = "white") + dyEvent(as.Date("2016-09-13"), "Schema Switch", labelLoc = "bottom", color = "white") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom", color = "white") }) output$most_common_pie_pl <- renderHighchart({ data4pie <- 
most_common_country_dt() %>% diff --git a/tab_documentation/action_breakdown.md b/tab_documentation/action_breakdown.md index 3cc73ec..6ec8197 100644 --- a/tab_documentation/action_breakdown.md +++ b/tab_documentation/action_breakdown.md @@ -18,8 +18,9 @@ Outages and inaccuracies ------ -- From 7 December (marked "A") the sampling changed to exclude a broader range of browsers, resulting in alterations to things like clickthrough rate and dwell time. We expect this to resolve itself on 4 January when a new schema version is launched. -- **B** (13 September 2016): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__A__' (2015-12-07): The sampling changed to exclude a broader range of browsers, resulting in alterations to things like clickthrough rate and dwell time. We expect this to resolve itself on 4 January when a new schema version is launched. +* '__B__' (2016-09-13): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. Broadly-speaking, it's worth noting that (as with all data based on JavaScript logging) the code that gathers this information requires a certain amount of browser capabilities to function.
It's probably not going to work on 10 year old Nokia brick phones, and so the data will be biased against users using those kinds of devices. diff --git a/tab_documentation/applinks.md b/tab_documentation/applinks.md index 1c64e7d..f22fd49 100644 --- a/tab_documentation/applinks.md +++ b/tab_documentation/applinks.md @@ -6,6 +6,8 @@ Outages and inaccuracies ------ +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. + Broadly-speaking, it's worth noting that (as with all data based on JavaScript logging) the code that gathers this information requires a certain amount of browser capabilities to function. It's probably not going to work on 10 year old Nokia brick phones, and so the data will be biased against users using those kinds of devices. Questions, bug reports, and feature suggestions diff --git a/tab_documentation/clickthrough_rate.md b/tab_documentation/clickthrough_rate.md index d78af97..0ec1a3c 100644 --- a/tab_documentation/clickthrough_rate.md +++ b/tab_documentation/clickthrough_rate.md @@ -13,8 +13,9 @@ Outages and inaccuracies ------ -- From 7 December (marked "A") the sampling changed to exclude a broader range of browsers, resulting in alterations to things like clickthrough rate and dwell time. We expect this to resolve itself on 4 January when a new schema version is launched. -- **B** (13 September 2016): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. 
+* '__A__' (2015-12-07): The sampling changed to exclude a broader range of browsers, resulting in alterations to things like clickthrough rate and dwell time. We expect this to resolve itself on 4 January when a new schema version is launched. +* '__B__' (2016-09-13): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. Broadly-speaking, it's worth noting that (as with all data based on JavaScript logging) the code that gathers this information requires a certain amount of browser capabilities to function. It's probably not going to work on 10 year old Nokia brick phones, and so the data will be biased against users using those kinds of devices. diff --git a/tab_documentation/dwelltime.md b/tab_documentation/dwelltime.md index 254f6ac..6133f1f 100644 --- a/tab_documentation/dwelltime.md +++ b/tab_documentation/dwelltime.md @@ -6,8 +6,9 @@ Outages and inaccuracies ------ -- From 7 December (marked "A") the sampling changed to exclude a broader range of browsers, resulting in alterations to things like clickthrough rate and dwell time. We expect this to resolve itself on 4 January when a new schema version is launched. -- **B** (13 September 2016): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. 
+* '__A__' (2015-12-07): The sampling changed to exclude a broader range of browsers, resulting in alterations to things like clickthrough rate and dwell time. We expect this to resolve itself on 4 January when a new schema version is launched. +* '__B__' (2016-09-13): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. Broadly-speaking, it's worth noting that (as with all data based on JavaScript logging) the code that gathers this information requires a certain amount of browser capabilities to function. It's probably not going to work on 10 year old Nokia brick phones, and so the data will be biased against users using those kinds of devices. diff --git a/tab_documentation/first_visit.md b/tab_documentation/first_visit.md index 0164e46..9617b62 100644 --- a/tab_documentation/first_visit.md +++ b/tab_documentation/first_visit.md @@ -18,7 +18,8 @@ Outages and notes ------- -- **A** (13 September 2016): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__A__' (2016-09-13): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. 
+* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. Questions, bug reports, and feature suggestions ------ diff --git a/tab_documentation/geography.md b/tab_documentation/geography.md index 7fb2d0f..3e14093 100644 --- a/tab_documentation/geography.md +++ b/tab_documentation/geography.md @@ -7,7 +7,7 @@ ------ * Broadly-speaking, it's worth noting that (as with all data based on JavaScript logging) the code that gathers this information requires a certain amount of browser capabilities to function. It's probably not going to work on 10 year old Nokia brick phones, and so the data will be biased against users using those kinds of devices. -* __A__: On 28 June 2016 our Event Logging system started recording a finer view of U.S. traffic, breaking it down into 5 regions: +* '__A__': on 2016-06-28 our Event Logging system started recording a finer view of U.S. traffic, breaking it down into 5 regions: - **Northeast Region** - New England Division: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island and Vermont - Middle Atlantic Division: New Jersey, New York and Pennsylvania @@ -22,7 +22,8 @@ - Mountain Division: Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah and Wyoming - **Pacific Region** - Alaska, California, Hawaii, Oregon and Washington -* __B__ (13 September 2016): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. 
+* '__B__' (2016-09-13): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. See [T136257](https://phabricator.wikimedia.org/T136257) for more details. diff --git a/tab_documentation/languages_summary.md b/tab_documentation/languages_summary.md index a888671..6c10a08 100644 --- a/tab_documentation/languages_summary.md +++ b/tab_documentation/languages_summary.md @@ -14,10 +14,11 @@ ## Outages and notes -- **A**: Languages visited data backfilled +* '__A__': Languages visited data backfilled 1. The data we used for a retrospective study of Portal deployments started on 16 November 2016, although there were filters applied to the data used in the analysis. Specifically, known spiders were excluded and only data from the first 10 visits per session was kept for data storage space reasons. 2. When we began work on this part of the dashboard, we could only backfill data from 2016-05-10 due to the 90-day restriction our event logging system has. Therefore, we had to use the previously saved (slightly filtered) data to backfill visited language counts from November 16th to May 9th. We checked how the filtered data (post May 10th) compared to the unfiltered data and some counts were off by 1-8 clicks, hence why we are noting the difference here. 
-- **B** (13 September 2016): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__B__' (2016-09-13): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. ## Questions, bug reports, and feature suggestions diff --git a/tab_documentation/languages_visited.md b/tab_documentation/languages_visited.md index aa5b78b..6f167a4 100644 --- a/tab_documentation/languages_visited.md +++ b/tab_documentation/languages_visited.md @@ -18,10 +18,11 @@ ## Outages and notes -- **A**: Languages visited data backfilled +* '__A__': Languages visited data backfilled 1. The data we used for a retrospective study of Portal deployments started on 16 November 2016, although there were filters applied to the data used in the analysis. Specifically, known spiders were excluded and only data from the first 10 visits per session was kept for data storage space reasons. 2. When we began work on this part of the dashboard, we could only backfill data from 2016-05-10 due to the 90-day restriction our event logging system has. Therefore, we had to use the previously saved (slightly filtered) data to backfill visited language counts from November 16th to May 9th. 
We checked how the filtered data (post May 10th) compared to the unfiltered data and some counts were off by 1-8 clicks, hence why we are noting the difference here. -- **B** (13 September 2016): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__B__' (2016-09-13): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. ## Questions, bug reports, and feature suggestions diff --git a/tab_documentation/most_common.md b/tab_documentation/most_common.md index 3e445b6..a8e43ec 100644 --- a/tab_documentation/most_common.md +++ b/tab_documentation/most_common.md @@ -15,7 +15,8 @@ Outages and notes ------- -- **A** (13 September 2016): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. +* '__A__' (2016-09-13): Added event logging of language-switching, causing some events to flow into old table and some events to flow into the new table. See [T143149](https://phabricator.wikimedia.org/T143149) for more details. 
+* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. Questions, bug reports, and feature suggestions ------ diff --git a/tab_documentation/pageviews.md b/tab_documentation/pageviews.md index b004dc3..a174ef9 100644 --- a/tab_documentation/pageviews.md +++ b/tab_documentation/pageviews.md @@ -5,10 +5,11 @@ Notes ------ -- **A**: Started filtering out search-redirect.php requests. See [T138411](https://phabricator.wikimedia.org/T138411) for more information. -- **B**: Rise could not be determined due to deleted data. See [write-up](https://github.com/wikimedia-research/Discovery-Research-Portal/blob/master/Analyses/Pageviews%20Rise/README.md) and [T143045](https://phabricator.wikimedia.org/T143045) for more information. -- **C**: The investigation of pageviews rise caused us to [redefine](https://gerrit.wikimedia.org/r/#/c/306261/) how we count wikipedia.org pageviews. Pageviews from 2016-06-22 to 2016-08-22 were then recounted using the new definition. See [T143064](https://phabricator.wikimedia.org/T143064) for more information. -- **D**: On 11 July 2016 we started to split pageview counts into pageviews from "low-volume" clients and "high-volume" clients. A "high-volume" client is a client whose wikipedia.org pageviews are equal to or greater than the 99.99th percentile for the whole population on any particular day. The rationale for this being that the low-volume clients' PV counts would be more stable and the high-volume clients' PV counts would soak up outliers and bots. See [T143605](https://phabricator.wikimedia.org/T143605) for more details. 
+* '__A__': Started filtering out search-redirect.php requests. See [T138411](https://phabricator.wikimedia.org/T138411) for more information. +* '__B__': Rise could not be determined due to deleted data. See [write-up](https://github.com/wikimedia-research/Discovery-Research-Portal/blob/master/Analyses/Pageviews%20Rise/README.md) and [T143045](https://phabricator.wikimedia.org/T143045) for more information. +* '__C__': The investigation of pageviews rise caused us to [redefine](https://gerrit.wikimedia.org/r/#/c/306261/) how we count wikipedia.org pageviews. Pageviews from 2016-06-22 to 2016-08-22 were then recounted using the new definition. See [T143064](https://phabricator.wikimedia.org/T143064) for more information. +* '__D__': on 2016-07-11 we started to split pageview counts into pageviews from "low-volume" clients and "high-volume" clients. A "high-volume" client is a client whose wikipedia.org pageviews are equal to or greater than the 99.99th percentile for the whole population on any particular day. The rationale for this being that the low-volume clients' PV counts would be more stable and the high-volume clients' PV counts would soak up outliers and bots. See [T143605](https://phabricator.wikimedia.org/T143605) for more details. +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details.
Questions, bug reports, and feature suggestions ------ diff --git a/tab_documentation/referers_byengine.md b/tab_documentation/referers_byengine.md index 62dd13e..f03fb72 100644 --- a/tab_documentation/referers_byengine.md +++ b/tab_documentation/referers_byengine.md @@ -10,7 +10,7 @@ Outages and inaccuracies ------ -None so far! +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. Questions, bug reports, and feature suggestions ------ diff --git a/tab_documentation/referers_summary.md b/tab_documentation/referers_summary.md index ca68182..fbfb990 100644 --- a/tab_documentation/referers_summary.md +++ b/tab_documentation/referers_summary.md @@ -10,8 +10,9 @@ Outages and notes ------ -- **A**: We switched to a finalized version of the UDF that extracts internal traffic (see [T130083](https://phabricator.wikimedia.org/T130083)) -- **B**: Started filtering out search-redirect.php requests. See [T138411](https://phabricator.wikimedia.org/T138411) for more information. +* '__A__': We switched to a finalized version of the UDF that extracts internal traffic (see [T130083](https://phabricator.wikimedia.org/T130083)) +* '__B__': Started filtering out search-redirect.php requests. See [T138411](https://phabricator.wikimedia.org/T138411) for more information. 
+* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. Questions, bug reports, and feature suggestions ------ diff --git a/tab_documentation/sisproj.md b/tab_documentation/sisproj.md index 2960dfc..44196ac 100644 --- a/tab_documentation/sisproj.md +++ b/tab_documentation/sisproj.md @@ -6,6 +6,8 @@ Outages and inaccuracies ------ +* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. + Broadly-speaking, it's worth noting that (as with all data based on JavaScript logging) the code that gathers this information requires a certain amount of browser capabilities to function. It's probably not going to work on 10 year old Nokia brick phones, and so the data will be biased against users using those kinds of devices. 
Questions, bug reports, and feature suggestions diff --git a/utils.R b/utils.R index 67a1594..e25b10b 100644 --- a/utils.R +++ b/utils.R @@ -150,22 +150,22 @@ first_visits_country <- polloi::read_dataset("discovery/portal/first_visits_country.tsv", col_types = "Dccid") last_action_country <- polloi::read_dataset("discovery/portal/last_action_country.tsv", col_types = "Dccid") most_common_country <- polloi::read_dataset("discovery/portal/most_common_country.tsv", col_types = "Dccid") - all_country_data$country[all_country_data$country=="Cape Verde"] <- "Cabo Verde" - first_visits_country$country[first_visits_country$country=="Cape Verde"] <- "Cabo Verde" - last_action_country$country[last_action_country$country=="Cape Verde"] <- "Cabo Verde" - most_common_country$country[most_common_country$country=="Cape Verde"] <- "Cabo Verde" - all_country_data$country[all_country_data$country=="Czechia"] <- "Czech Republic" - first_visits_country$country[first_visits_country$country=="Czechia"] <- "Czech Republic" - last_action_country$country[last_action_country$country=="Czechia"] <- "Czech Republic" - most_common_country$country[most_common_country$country=="Czechia"] <- "Czech Republic" - data("countrycode_data", package="countrycode") - countrycode_data$country.name[c(54,143)] <- c("Congo, The Democratic Republic of the", "Macedonia, Republic of" ) - countrycode_data$continent[countrycode_data$country.name %in% c("British Indian Ocean Territory","Christmas Island","Taiwan, Province of China")] <- "Asia" + all_country_data$country[all_country_data$country == "Cape Verde"] <- "Cabo Verde" + first_visits_country$country[first_visits_country$country == "Cape Verde"] <- "Cabo Verde" + last_action_country$country[last_action_country$country == "Cape Verde"] <- "Cabo Verde" + most_common_country$country[most_common_country$country == "Cape Verde"] <- "Cabo Verde" + all_country_data$country[all_country_data$country == "Czechia"] <- "Czech Republic" + 
first_visits_country$country[first_visits_country$country == "Czechia"] <- "Czech Republic" + last_action_country$country[last_action_country$country == "Czechia"] <- "Czech Republic" + most_common_country$country[most_common_country$country == "Czechia"] <- "Czech Republic" + data("countrycode_data", package = "countrycode") + # Note: version 0.19 (published on CRAN on 2017-02-06) has renamed 'country.name' to 'country.name.en' + countrycode_data$country.name.en[c(54, 143)] <- c("Congo, The Democratic Republic of the", "Macedonia, Republic of" ) + countrycode_data$continent[countrycode_data$country.name.en %in% c("British Indian Ocean Territory", "Christmas Island", "Taiwan, Province of China")] <- "Asia" countrycode_data$continent[countrycode_data$region == "South America"] <- "South America" countrycode_data$continent[countrycode_data$continent == "Americas"] <- "North America" - - all_country_data <- all_country_data[!duplicated(all_country_data[,1:2],fromLast=T),] + all_country_data <- all_country_data[!duplicated(all_country_data[,1:2],fromLast = TRUE),] all_country_data_prop <- all_country_data %>% dplyr::group_by(date) %>% dplyr::mutate(event_prop=round(events/sum(events),4)*100, visit_prop=round(n_visit/sum(n_visit),4)*100, session_prop=round(n_session/sum(n_session),4)*100) %>% @@ -202,10 +202,10 @@ colnames(us_data_prop) <- c("Date", "Country", "Number of Events", "Overall Clickthrough Rate", "Number of Visits", "Clickthrough Rate Per Visit", "Number of Sessions", "Clickthrough Rate Per Session") - region_mask <- match(stringi::stri_trans_general(all_country_data$Country, "Latin-ASCII"), countrycode_data$country.name) + region_mask <- match(stringi::stri_trans_general(all_country_data$Country, "Latin-ASCII"), countrycode_data$country.name.en) all_country_data$Region <- countrycode_data$continent[region_mask] all_country_data$Region[is.na(all_country_data$Region)] <- "Other" - region_mask <- match(stringi::stri_trans_general(all_country_data_prop$Country, 
"Latin-ASCII"), countrycode_data$country.name) + region_mask <- match(stringi::stri_trans_general(all_country_data_prop$Country, "Latin-ASCII"), countrycode_data$country.name.en) all_country_data_prop$Region <- countrycode_data$continent[region_mask] all_country_data_prop$Region[is.na(all_country_data_prop$Region)] <- "Other" all_country_data <<- all_country_data[, c(1:2, 9, 3:8)] %>% dplyr::arrange(Date, Country) @@ -214,7 +214,7 @@ us_data_prop <<- us_data_prop %>% dplyr::mutate(Region="North America") %>% dplyr::select(c(1:2, 9, 3:8)) %>% dplyr::arrange(Date, Country) - first_visits_country <- first_visits_country[!duplicated(first_visits_country[,1:3],fromLast=T),] + first_visits_country <- first_visits_country[!duplicated(first_visits_country[,1:3],fromLast = TRUE),] colnames(first_visits_country) <- c("Date", "Action", "Country", "Number of Sessions", "Proportion") first_visits_country$Proportion <- first_visits_country$Proportion*100 first_visits_country_prop <- tidyr::spread(first_visits_country[,-4], key=Action, value=Proportion, fill=0) @@ -231,10 +231,10 @@ first_visits_country_prop <- first_visits_us_prop %>% dplyr::select(-Country) %>% dplyr::group_by(Date) %>% dplyr::summarize_each(dplyr::funs(sum)) %>% dplyr::mutate(Country="United States") %>% rbind(first_visits_country_prop[!us_mask,]) - region_mask <- match(stringi::stri_trans_general(first_visits_country$Country, "Latin-ASCII"), countrycode_data$country.name) + region_mask <- match(stringi::stri_trans_general(first_visits_country$Country, "Latin-ASCII"), countrycode_data$country.name.en) first_visits_country$Region <- countrycode_data$continent[region_mask] first_visits_country$Region[is.na(first_visits_country$Region)] <- "Other" - region_mask <- match(stringi::stri_trans_general(first_visits_country_prop$Country, "Latin-ASCII"), countrycode_data$country.name) + region_mask <- match(stringi::stri_trans_general(first_visits_country_prop$Country, "Latin-ASCII"), countrycode_data$country.name.en) 
first_visits_country_prop$Region <- countrycode_data$continent[region_mask] first_visits_country_prop$Region[is.na(first_visits_country_prop$Region)] <- "Other" first_visits_country <<- first_visits_country[, c(1, 8:9, 2:7)] %>% dplyr::arrange(Date, Country) @@ -243,7 +243,7 @@ first_visits_us_prop <<- first_visits_us_prop %>% dplyr::mutate(Region="North America") %>% dplyr::select(c(1:2, 9, 3:8)) %>% dplyr::arrange(Date, Country) - last_action_country <- last_action_country[!duplicated(last_action_country[,1:3],fromLast=T),] + last_action_country <- last_action_country[!duplicated(last_action_country[,1:3],fromLast = TRUE),] colnames(last_action_country) <- c("Date", "Action", "Country", "Events", "Proportion") last_action_country$Proportion <- last_action_country$Proportion*100 last_action_country_prop <- tidyr::spread(last_action_country[,-4], key=Action, value=Proportion, fill=0) @@ -260,10 +260,10 @@ last_action_country_prop <- last_action_us_prop %>% dplyr::select(-Country) %>% dplyr::group_by(Date) %>% dplyr::summarize_each(dplyr::funs(sum)) %>% dplyr::mutate(Country="United States") %>% rbind(last_action_country_prop[!us_mask,]) - region_mask <- match(stringi::stri_trans_general(last_action_country$Country, "Latin-ASCII"), countrycode_data$country.name) + region_mask <- match(stringi::stri_trans_general(last_action_country$Country, "Latin-ASCII"), countrycode_data$country.name.en) last_action_country$Region <- countrycode_data$continent[region_mask] last_action_country$Region[is.na(last_action_country$Region)] <- "Other" - region_mask <- match(stringi::stri_trans_general(last_action_country_prop$Country, "Latin-ASCII"), countrycode_data$country.name) + region_mask <- match(stringi::stri_trans_general(last_action_country_prop$Country, "Latin-ASCII"), countrycode_data$country.name.en) last_action_country_prop$Region <- countrycode_data$continent[region_mask] last_action_country_prop$Region[is.na(last_action_country_prop$Region)] <- "Other" last_action_country 
<<- last_action_country[, c(1, 8:9, 2:7)] %>% dplyr::arrange(Date, Country) @@ -272,7 +272,7 @@ last_action_us_prop <<- last_action_us_prop %>% dplyr::mutate(Region="North America") %>% dplyr::select(c(1:2, 9, 3:8)) %>% dplyr::arrange(Date, Country) - most_common_country <- most_common_country[!duplicated(most_common_country[,1:3],fromLast=T),] + most_common_country <- most_common_country[!duplicated(most_common_country[,1:3],fromLast = TRUE),] colnames(most_common_country) <- c("Date", "Action", "Country", "Number of Visits", "Proportion") most_common_country$Proportion <- most_common_country$Proportion*100 most_common_country_prop <- tidyr::spread(most_common_country[,-4], key=Action, value=Proportion, fill=0) @@ -289,10 +289,10 @@ most_common_country_prop <- most_common_us_prop %>% dplyr::select(-Country) %>% dplyr::group_by(Date) %>% dplyr::summarize_each(dplyr::funs(sum)) %>% dplyr::mutate(Country="United States") %>% rbind(most_common_country_prop[!us_mask,]) - region_mask <- match(stringi::stri_trans_general(most_common_country$Country, "Latin-ASCII"), countrycode_data$country.name) + region_mask <- match(stringi::stri_trans_general(most_common_country$Country, "Latin-ASCII"), countrycode_data$country.name.en) most_common_country$Region <- countrycode_data$continent[region_mask] most_common_country$Region[is.na(most_common_country$Region)] <- "Other" - region_mask <- match(stringi::stri_trans_general(most_common_country_prop$Country, "Latin-ASCII"), countrycode_data$country.name) + region_mask <- match(stringi::stri_trans_general(most_common_country_prop$Country, "Latin-ASCII"), countrycode_data$country.name.en) most_common_country_prop$Region <- countrycode_data$continent[region_mask] most_common_country_prop$Region[is.na(most_common_country_prop$Region)] <- "Other" most_common_country <<- most_common_country[, c(1, 7:8, 2:6)] %>% dplyr::arrange(Date, Country) -- To view, visit https://gerrit.wikimedia.org/r/341743 To unsubscribe, visit 
https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Idd8b46e61db9e33788d2be63564c3dc40334dc5f Gerrit-PatchSet: 2 Gerrit-Project: wikimedia/discovery/prince Gerrit-Branch: master Gerrit-Owner: Bearloga <mpo...@wikimedia.org> Gerrit-Reviewer: Chelsyx <c...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits