[MediaWiki-commits] [Gerrit] Document some filter idiosyncrasies through tests - change (analytics/webstatscollector)
Ottomata has submitted this change and it was merged. Change subject: Document some filter idiosyncrasies through tests .. Document some filter idiosyncrasies through tests Change-Id: I7242d292a82a7ee9de033bab7caf07844075888e --- M tests/test.sh 1 file changed, 58 insertions(+), 1 deletion(-) Approvals: Ottomata: Verified; Looks good to me, approved jenkins-bot: Verified diff --git a/tests/test.sh b/tests/test.sh index 258c95e..9b1e225 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -31,11 +31,14 @@ set_FILTERED_OUTPUT() { local URL=$1 +local LOCAL_LOG_LINE_IP=${LOG_LINE_IP:-IP} +unset LOG_LINE_IP + local LOG_LINE=CACHE_MACHINE LOG_LINE=$LOG_LINESEQUENCE_NUMBER LOG_LINE=$LOG_LINETIMESTAMP LOG_LINE=$LOG_LINEDURATION -LOG_LINE=$LOG_LINEIP +LOG_LINE=$LOG_LINE$LOCAL_LOG_LINE_IP LOG_LINE=$LOG_LINESTATUS_CODE LOG_LINE=$LOG_LINESIZE LOG_LINE=$LOG_LINEREQUEST_METHOD @@ -136,6 +139,60 @@ assert_counted 'http://en.wikipedia.org/wiki/Robinson_Can\xC3\xB3' 'en' 'Robinson_Can\xC3\xB3' assert_counted 'http://en.wikipedia.org/wiki/Robinson_Canó' 'en' 'Robinson_Canó' + + +# Idiosyncrasies --- +# Here, we document some idiosyncrasies of webstatscollector. +# We might wish to change/fix them, but that would require all +# consumers of those files to adapt their software. And it would make +# comparison between files harder. So let's at least call them out for +# now. + +# Idiosyncrasy #1 Pageviews to mobile enwiki, are only counted for +# .mw, not for plain enwiki. And this counting is not per page, but +# per language. + +assert_counted 'http://en.m.wikipedia.org/wiki/Idiosyncrasy/Page_on_MobileEnwikiSite_only_counted_titleless_for_en.mw' 'en.mw' 'en' + +# Idiosyncrasy #2 While en.mw might suggest to be thought of as +# English mobile wikipedia, it is rather English mobile sites. So +# it includes for example hits to enwikivoyage. +assert_counted 'http://en.m.wikivoyage.org/wiki/Idiosyncrasy/Page_on_MobileEnwikivoyageSite_only_counted_titleless_for_en.mw' 'en.mw' 'en' + +# Idiosyncrasy #3 Languages in domain names are considered case +# sensitive. +assert_counted 'http://En.wikipedia.org/wiki/Idiosyncrasy/Case_sensitive_languages' 'En' 'Idiosyncrasy/Case_sensitive_languages' + +# Idiosyncrasy #4 Some internal IPv4 IPs are not counted +# altogether. This gets in the way for SSL requests, and makes it +# necessary that the logs from the SSL terminators get fed into the +# filter process too. +# First some internal IP addresses covered by 'filter'. +LOG_LINE_IP=208.80.152.1 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.152.x' +LOG_LINE_IP=208.80.153.2 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.153.x' +LOG_LINE_IP=208.80.154.3 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.154.x' +LOG_LINE_IP=208.80.155.3 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.155.x' +LOG_LINE_IP=91.198.174.5 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/91.198.141.x' + +# Then some internal IP addresses not covered by 'filter'. +LOG_LINE_IP=198.35.26.6 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered' +LOG_LINE_IP=198.35.27.7 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered' +LOG_LINE_IP=185.15.56.7 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered' +LOG_LINE_IP=185.15.57.8 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered' +LOG_LINE_IP=185.15.58.9 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered' +LOG_LINE_IP=185.15.59.10 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered' +LOG_LINE_IP=2620:0:860::11 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered' +LOG_LINE_IP=2a02:ec80::12 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/2a02:ec80::_not_covered'
[MediaWiki-commits] [Gerrit] Document some filter idiosyncrasies through tests - change (analytics/webstatscollector)
Hello Ottomata, I'd like you to do a code review. Please visit https://gerrit.wikimedia.org/r/156050 to review the following change. Change subject: Document some filter idiosyncrasies through tests .. Document some filter idiosyncrasies through tests Change-Id: I7242d292a82a7ee9de033bab7caf07844075888e --- M tests/test.sh 1 file changed, 55 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/webstatscollector refs/changes/50/156050/1 diff --git a/tests/test.sh b/tests/test.sh index 258c95e..a437ade 100755 --- a/tests/test.sh +++ b/tests/test.sh @@ -31,11 +31,14 @@ set_FILTERED_OUTPUT() { local URL=$1 +local LOCAL_LOG_LINE_IP=${LOG_LINE_IP:-IP} +unset LOG_LINE_IP + local LOG_LINE=CACHE_MACHINE LOG_LINE=$LOG_LINESEQUENCE_NUMBER LOG_LINE=$LOG_LINETIMESTAMP LOG_LINE=$LOG_LINEDURATION -LOG_LINE=$LOG_LINEIP +LOG_LINE=$LOG_LINE$LOCAL_LOG_LINE_IP LOG_LINE=$LOG_LINESTATUS_CODE LOG_LINE=$LOG_LINESIZE LOG_LINE=$LOG_LINEREQUEST_METHOD @@ -136,6 +139,57 @@ assert_counted 'http://en.wikipedia.org/wiki/Robinson_Can\xC3\xB3' 'en' 'Robinson_Can\xC3\xB3' assert_counted 'http://en.wikipedia.org/wiki/Robinson_Canó' 'en' 'Robinson_Canó' + + +# Idiosyncrasies --- +# Here, we document some idiosyncrasies of webstatscollector. +# We might wish to change/fix them, but that would require all +# consumers of those files to adapt their software. And it would make +# comparison between files harder. So let's at least call them out for +# now. + +# Idiosyncrasy #1 Pageviews to mobile enwiki, are only counted for +# .mw, not for plain enwiki. And this counting is not per page, but +# per language. + +assert_counted 'http://en.m.wikipedia.org/wiki/Idiosyncrasy/Page_on_MobileEnwikiSite_only_counted_titleless_for_en.mw' 'en.mw' 'en' + +# Idiosyncrasy #2 While en.mw might suggest to be thought of as +# English mobile wikipedia, it is rather English mobile sites. So +# it includes for example hits to enwikivoyage. +assert_counted 'http://en.m.wikivoyage.org/wiki/Idiosyncrasy/Page_on_MobileEnwikivoyageSite_only_counted_titleless_for_en.mw' 'en.mw' 'en' + +# Idiosyncrasy #3 Languages in domain names are considered case +# sensitive. +assert_counted 'http://En.wikipedia.org/wiki/Idiosyncrasy/Case_sensitive_languages' 'En' 'Idiosyncrasy/Case_sensitive_languages' + +# Idiosyncrasy #4 Some internal IPv4 IPs are not counted +# altogether. This gets in the way for SSL requests, and makes it +# necessary that the logs from the SSL terminators get fed into the +# filter process too. +# First some internal IP addresses covered by 'filter'. +LOG_LINE_IP=208.80.152.1 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.152.x' +LOG_LINE_IP=208.80.153.2 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.153.x' +LOG_LINE_IP=208.80.154.3 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.154.x' +LOG_LINE_IP=208.80.155.3 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.155.x' +LOG_LINE_IP=91.198.174.5 ; assert_not_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/91.198.141.x' + +# Then some internal IP addresses not covered by 'filter'. +LOG_LINE_IP=198.35.26.6 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered' +LOG_LINE_IP=198.35.27.7 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered' +LOG_LINE_IP=185.15.56.7 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered' +LOG_LINE_IP=185.15.57.8 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered' +LOG_LINE_IP=185.15.58.9 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered' +LOG_LINE_IP=185.15.59.10 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered' +LOG_LINE_IP=2620:0:860::11 ; assert_counted 'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered' 'en' 'Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered' +LOG_LINE_IP=2a02:ec80::12 ;