[MediaWiki-commits] [Gerrit] Document some filter idiosyncrasies through tests - change (analytics/webstatscollector)

2014-08-25 Thread Ottomata (Code Review)
Ottomata has submitted this change and it was merged.

Change subject: Document some filter idiosyncrasies through tests
......................................................................


Document some filter idiosyncrasies through tests

Change-Id: I7242d292a82a7ee9de033bab7caf07844075888e
---
M tests/test.sh
1 file changed, 58 insertions(+), 1 deletion(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved
  jenkins-bot: Verified



diff --git a/tests/test.sh b/tests/test.sh
index 258c95e..9b1e225 100755
--- a/tests/test.sh
+++ b/tests/test.sh
@@ -31,11 +31,14 @@
 set_FILTERED_OUTPUT() {
 local URL=$1
 
+local LOCAL_LOG_LINE_IP=${LOG_LINE_IP:-IP}
+unset LOG_LINE_IP
+
 local LOG_LINE=CACHE_MACHINE
 LOG_LINE=$LOG_LINESEQUENCE_NUMBER
 LOG_LINE=$LOG_LINETIMESTAMP
 LOG_LINE=$LOG_LINEDURATION
-LOG_LINE=$LOG_LINEIP
+LOG_LINE=$LOG_LINE$LOCAL_LOG_LINE_IP
 LOG_LINE=$LOG_LINESTATUS_CODE
 LOG_LINE=$LOG_LINESIZE
 LOG_LINE=$LOG_LINEREQUEST_METHOD
@@ -136,6 +139,60 @@
 assert_counted 'http://en.wikipedia.org/wiki/Robinson_Can\xC3\xB3' 'en' 
'Robinson_Can\xC3\xB3'
 assert_counted 'http://en.wikipedia.org/wiki/Robinson_Canó' 'en' 
'Robinson_Canó'
 
+
+
+# Idiosyncrasies ------------------------------------------------------
+# Here, we document some idiosyncrasies of webstatscollector.
+# We might wish to change/fix them, but that would require all
+# consumers of those files to adapt their software. And it would make
+# comparison between files harder. So let's at least call them out for
+# now.
+
+# Idiosyncrasy #1 Pageviews to mobile enwiki are only counted for
+# .mw, not for plain enwiki. And this counting is not per page, but
+# per language.
+
+assert_counted 
'http://en.m.wikipedia.org/wiki/Idiosyncrasy/Page_on_MobileEnwikiSite_only_counted_titleless_for_en.mw'
 'en.mw' 'en'
+
+# Idiosyncrasy #2 Although the name en.mw suggests English mobile
+# Wikipedia, it actually covers all English mobile sites. So it
+# includes, for example, hits to enwikivoyage.
+assert_counted 
'http://en.m.wikivoyage.org/wiki/Idiosyncrasy/Page_on_MobileEnwikivoyageSite_only_counted_titleless_for_en.mw'
 'en.mw' 'en'
+
+# Idiosyncrasy #3 Languages in domain names are considered case
+# sensitive.
+assert_counted 
'http://En.wikipedia.org/wiki/Idiosyncrasy/Case_sensitive_languages' 'En' 
'Idiosyncrasy/Case_sensitive_languages'
+
+# Idiosyncrasy #4 Some internal IPv4 addresses are not counted
+# at all. This gets in the way for SSL requests, and makes it
+# necessary that the logs from the SSL terminators get fed into the
+# filter process too.
+# First some internal IP addresses covered by 'filter'.
+LOG_LINE_IP=208.80.152.1 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.152.x'
+LOG_LINE_IP=208.80.153.2 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.153.x'
+LOG_LINE_IP=208.80.154.3 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.154.x'
+LOG_LINE_IP=208.80.155.3 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.155.x'
+LOG_LINE_IP=91.198.174.5 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/91.198.141.x'
+
+# Then some internal IP addresses not covered by 'filter'.
+LOG_LINE_IP=198.35.26.6 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered'
+LOG_LINE_IP=198.35.27.7 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered'
+LOG_LINE_IP=185.15.56.7 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered'
+LOG_LINE_IP=185.15.57.8 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered'
+LOG_LINE_IP=185.15.58.9 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered'
+LOG_LINE_IP=185.15.59.10 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered'
+LOG_LINE_IP=2620:0:860::11 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered'
+LOG_LINE_IP=2a02:ec80::12 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/2a02:ec80::_not_covered'
 

[MediaWiki-commits] [Gerrit] Document some filter idiosyncrasies through tests - change (analytics/webstatscollector)

2014-08-24 Thread QChris (Code Review)
Hello Ottomata,

I'd like you to do a code review.  Please visit

https://gerrit.wikimedia.org/r/156050

to review the following change.

Change subject: Document some filter idiosyncrasies through tests
......................................................................

Document some filter idiosyncrasies through tests

Change-Id: I7242d292a82a7ee9de033bab7caf07844075888e
---
M tests/test.sh
1 file changed, 55 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/webstatscollector 
refs/changes/50/156050/1

diff --git a/tests/test.sh b/tests/test.sh
index 258c95e..a437ade 100755
--- a/tests/test.sh
+++ b/tests/test.sh
@@ -31,11 +31,14 @@
 set_FILTERED_OUTPUT() {
 local URL=$1
 
+local LOCAL_LOG_LINE_IP=${LOG_LINE_IP:-IP}
+unset LOG_LINE_IP
+
 local LOG_LINE=CACHE_MACHINE
 LOG_LINE=$LOG_LINESEQUENCE_NUMBER
 LOG_LINE=$LOG_LINETIMESTAMP
 LOG_LINE=$LOG_LINEDURATION
-LOG_LINE=$LOG_LINEIP
+LOG_LINE=$LOG_LINE$LOCAL_LOG_LINE_IP
 LOG_LINE=$LOG_LINESTATUS_CODE
 LOG_LINE=$LOG_LINESIZE
 LOG_LINE=$LOG_LINEREQUEST_METHOD
@@ -136,6 +139,57 @@
 assert_counted 'http://en.wikipedia.org/wiki/Robinson_Can\xC3\xB3' 'en' 
'Robinson_Can\xC3\xB3'
 assert_counted 'http://en.wikipedia.org/wiki/Robinson_Canó' 'en' 
'Robinson_Canó'
 
+
+
+# Idiosyncrasies ------------------------------------------------------
+# Here, we document some idiosyncrasies of webstatscollector.
+# We might wish to change/fix them, but that would require all
+# consumers of those files to adapt their software. And it would make
+# comparison between files harder. So let's at least call them out for
+# now.
+
+# Idiosyncrasy #1 Pageviews to mobile enwiki are only counted for
+# .mw, not for plain enwiki. And this counting is not per page, but
+# per language.
+
+assert_counted 
'http://en.m.wikipedia.org/wiki/Idiosyncrasy/Page_on_MobileEnwikiSite_only_counted_titleless_for_en.mw'
 'en.mw' 'en'
+
+# Idiosyncrasy #2 Although the name en.mw suggests English mobile
+# Wikipedia, it actually covers all English mobile sites. So it
+# includes, for example, hits to enwikivoyage.
+assert_counted 
'http://en.m.wikivoyage.org/wiki/Idiosyncrasy/Page_on_MobileEnwikivoyageSite_only_counted_titleless_for_en.mw'
 'en.mw' 'en'
+
+# Idiosyncrasy #3 Languages in domain names are considered case
+# sensitive.
+assert_counted 
'http://En.wikipedia.org/wiki/Idiosyncrasy/Case_sensitive_languages' 'En' 
'Idiosyncrasy/Case_sensitive_languages'
+
+# Idiosyncrasy #4 Some internal IPv4 addresses are not counted
+# at all. This gets in the way for SSL requests, and makes it
+# necessary that the logs from the SSL terminators get fed into the
+# filter process too.
+# First some internal IP addresses covered by 'filter'.
+LOG_LINE_IP=208.80.152.1 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.152.x'
+LOG_LINE_IP=208.80.153.2 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.153.x'
+LOG_LINE_IP=208.80.154.3 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.154.x'
+LOG_LINE_IP=208.80.155.3 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.155.x'
+LOG_LINE_IP=91.198.174.5 ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/91.198.141.x'
+
+# Then some internal IP addresses not covered by 'filter'.
+LOG_LINE_IP=198.35.26.6 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered'
+LOG_LINE_IP=198.35.27.7 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered'
+LOG_LINE_IP=185.15.56.7 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered'
+LOG_LINE_IP=185.15.57.8 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered'
+LOG_LINE_IP=185.15.58.9 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered'
+LOG_LINE_IP=185.15.59.10 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered'
+LOG_LINE_IP=2620:0:860::11 ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered'
+LOG_LINE_IP=2a02:ec80::12 ;