Diederik has submitted this change and it was merged. Change subject: Add replace_space function ......................................................................
Add replace_space function Added tests, added test file, fixed replace_space Change-Id: Ic795c876105fe8a1a980f83a0c07a5ba2011fff5 --- M Makefile A README.tests A entries-with-urls-with-spaces-2013-02-10.txt M filter.c A test.sh 5 files changed, 80 insertions(+), 35 deletions(-) Approvals: Diederik: Verified; Looks good to me, approved diff --git a/Makefile b/Makefile index b05321b..25451a7 100644 --- a/Makefile +++ b/Makefile @@ -14,12 +14,14 @@ all: collector filter -collector: collector.h collector.c export.c +collector: collector.h collector.c export.c export.o + gcc -o collector collector.c export.o -ldb -lpthread filter: filter.c - cc -o filter filter.c + gcc -o filter filter.c -#export: collector.h export.c +export.o: export.c collector.h collector.c filter.c + gcc -c -o export.o export.c clean: - rm -f collector exporter + rm -f *.o collector filter diff --git a/README.tests b/README.tests new file mode 100644 index 0000000..c14927e --- /dev/null +++ b/README.tests @@ -0,0 +1,7 @@ + + +The test lines in entries-with-urls-with-spaces-2013-02-10.txt were produced like this: + + zcat /home/user/wikidata/raw_gzips/sampled-1000.tab.log-20130210.gz | perl -ne '@f=split(/\t/); print if index($f[8]," ")!=-1;' > entries-with-urls-with-spaces-2013-02-10.txt + +After this head -40 | tail -30 was applied in order for the filter to accept all of them (some of them were influenced by some of the discarding rules of the filter). 
diff --git a/entries-with-urls-with-spaces-2013-02-10.txt b/entries-with-urls-with-spaces-2013-02-10.txt new file mode 100644 index 0000000..abb6648 --- /dev/null +++ b/entries-with-urls-with-spaces-2013-02-10.txt @@ -0,0 +1,30 @@ +ssl1002 338866312 2013-02-09T10:00:09.126 1.667 0.0.0.0 FAKE_CACHE_STATUS/301 667 GET https://fr.wikipedia.org/wiki/Discussion:Bourail/Droit d'auteur NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1004 347282202 2013-02-09T10:18:11.367 0.082 0.0.0.0 FAKE_CACHE_STATUS/301 675 GET https://fr.wikipedia.org/wiki/Discussion:Chemin\xC3\xA9e solaire/Traduction NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+https://www.google.com/bot.html) - - +ssl1004 349592952 2013-02-09T12:19:04.972 0.079 0.0.0.0 FAKE_CACHE_STATUS/301 667 GET https://fr.wikipedia.org/wiki/\xC3\x89tienne Perrot (psychanalyste) NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1001 338735382 2013-02-09T12:57:51.173 0.201 0.0.0.0 FAKE_CACHE_STATUS/301 759 GET http://ja.wikipedia.org/wiki/CODE NAME. 
1 Brother Sun\xE3\x80\x90\xE5\x88\x9D\xE5\x9B\x9E\xE7\x94\x9F\xE7\x94\xA3\xE9\x99\x90\xE5\xAE\x9A\xE3\x80\x91(\xE7\xB4\x99\xE3\x82\xB8\xE3\x83\xA3\xE3\x82\xB1\xE3\x83\x83\xE3\x83\x88\xE4\xBB\x95\xE6\xA7\x98) NONE/wikipedia - - - foobar2000/1.1.14a - - +ssl1001 340768664 2013-02-09T14:29:25.274 0.351 0.0.0.0 FAKE_CACHE_STATUS/301 885 GET http://ja.wikipedia.org/wiki/\xE7\x89\xB9\xE5\x88\xA5:\xE3\x83\x87\xE3\x83\xBC\xE3\x82\xBF\xE6\x9B\xB8\xE3\x81\x8D\xE5\x87\xBA\xE3\x81\x97/\xE3\x82\x82\xE3\x82\x82\xE3\x81\x84\xE3\x82\x8D\xE3\x82\xAF\xE3\x83\xAD\xE3\x83\xBC\xE3\x83\x90\xE3\x83\xBCZ 1st Live \xE3\x81\xAB\xE3\x83\x95\xE3\x83\xA9\xE3\x83\xB3\xE3\x82\xB9\xE4\xBA\xBA\xE5\xA4\xA7\xE8\x88\x88\xE5\xA5\xAE\xEF\xBC\x81\xE3\x83\x91\xE3\x83\xAAde Japan Expo NONE/wikipedia - - - Mozilla/4.0%20(compatible;%20MSIE%207.0;%20Windows%20NT%206.0) - - +ssl3002 764325222 2013-02-09T14:35:55.799 0.088 0.0.0.0 FAKE_CACHE_STATUS/301 1247 GET http://en.wikipedia.org/wiki/Chopin - The Piano Works (Brilliant Classics) (CD4 of 13)_(album) NONE/wikipedia - - - foobar2000/1.2 - - +ssl1002 345415572 2013-02-09T15:10:39.804 0.051 0.0.0.0 FAKE_CACHE_STATUS/301 674 GET https://fr.wikipedia.org/wiki/Portail:Indianapolis/Index th\xC3\xA9matique NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl3002 769405948 2013-02-09T15:36:59.442 0.408 0.0.0.0 FAKE_CACHE_STATUS/301 1270 GET https://en.wikipedia.org/wiki/Kevin Smith NONE/wikipedia - - - Mozilla/5.0%20(X11;%20Linux%20x86_64;%20rv:18.0)%20Gecko/20100101%20Firefox/18.0 en-US,en;q=0.5 - +ssl1004 354242982 2013-02-09T15:40:02.553 0.061 0.0.0.0 FAKE_CACHE_STATUS/301 677 GET https://fr.wikipedia.org/wiki/Personnages de la Saga du d\xC3\xA9sir interdit NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1001 343866342 2013-02-09T16:29:52.032 0.002 0.0.0.0 FAKE_CACHE_STATUS/200 29022 GET 
https://en.wikipedia.org/wiki/Template:Attached_KML/Interstate 87 NONE/wikipedia - https://en.wikipedia.org/wiki/Interstate_87 - Mozilla/5.0%20(Windows%20NT%206.1;%20WOW64)%20AppleWebKit/537.17%20(KHTML,%20like%20Gecko)%20Chrome/24.0.1312.57%20Safari/537.17 en-US,en;q=0.8,ja;q=0.6 - +ssl3002 775361522 2013-02-09T16:46:28.770 0.088 0.0.0.0 FAKE_CACHE_STATUS/301 1214 GET http://pl.wikipedia.org/wiki/Show Your Bones (Advance)_(album) NONE/wikipedia - - - foobar2000/1.1 - - +ssl1003 345719812 2013-02-09T16:58:27.186 0.108 0.0.0.0 FAKE_CACHE_STATUS/301 660 GET https://fr.wikipedia.org/wiki/Discussion utilisateur:Myaly NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1003 347346702 2013-02-09T17:57:30.211 0.063 0.0.0.0 FAKE_CACHE_STATUS/301 683 GET https://fr.wikipedia.org/wiki/Discussion:Navire de d\xC3\xA9fense c\xC3\xB4ti\xC3\xA8re NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl3003 782892222 2013-02-09T18:55:51.553 0.115 0.0.0.0 FAKE_CACHE_STATUS/301 1201 GET http://ru.wikipedia.org/wiki/American Capitalist NONE/wikipedia - - - foobar2000/1.1.17 - - +ssl3002 787278662 2013-02-09T19:04:23.234 0.116 0.0.0.0 FAKE_CACHE_STATUS/301 1235 GET http://ru.wikipedia.org/wiki/Live at Wembley Stadium 1986 (25th Anniversary Edition) NONE/wikipedia - - - foobar2000/1.1.5 - - +ssl1001 348171052 2013-02-09T19:07:42.557 0.056 0.0.0.0 FAKE_CACHE_STATUS/301 647 GET https://fr.wikipedia.org/wiki/Tangara (genre) NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1004 360069242 2013-02-09T19:09:35.203 0.050 0.0.0.0 FAKE_CACHE_STATUS/301 662 GET https://fr.wikipedia.org/wiki/Mod\xC3\xA8le:Cantons de Valence NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1004 363042322 2013-02-09T20:55:34.082 0.057 0.0.0.0 FAKE_CACHE_STATUS/301 668 GET 
https://fr.wikipedia.org/wiki/Projet:Impression/Quality images/113 NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1002 358824032 2013-02-09T23:20:20.933 0.050 0.0.0.0 FAKE_CACHE_STATUS/301 668 GET https://meta.wikimedia.org/wiki/Special:CentralAuth/Lala78 z z b 5 NONE/wikimedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl3003 806330792 2013-02-09T23:28:30.354 0.310 0.0.0.0 FAKE_CACHE_STATUS/301 1189 GET http://de.wikipedia.org/wiki/Hip Hop Is Dead NONE/wikipedia - - - foobar2000/1.1.10 - - +ssl1003 357371192 2013-02-10T00:05:37.562 0.055 0.0.0.0 FAKE_CACHE_STATUS/301 665 GET https://fr.wikipedia.org/wiki/Discussion utilisateur:Steve92341 NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1002 362176022 2013-02-10T01:17:26.634 0.086 0.0.0.0 FAKE_CACHE_STATUS/301 680 GET https://fr.wikipedia.org/wiki/Discussion:Histoire du Racing Club de Strasbourg NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl3003 812500292 2013-02-10T01:32:25.217 0.001 0.0.0.0 FAKE_CACHE_STATUS/301 1226 GET http://commons.wikimedia.org/wiki/Template:Motd/2013-02-9 (en) NONE/wikimedia - - - Magnus%20tools - - +ssl1001 359846002 2013-02-10T02:05:36.049 0.433 0.0.0.0 FAKE_CACHE_STATUS/400 17294 GET http://ja.wikipedia.org/wiki/\xE7\x89\xB9\xE5\x88\xA5:\xE3\x83\x87\xE3\x83\xBC\xE3\x82\xBF\xE6\x9B\xB8\xE3\x81\x8D\xE5\x87\xBA\xE3\x81\x97/[enews24.net] '\xED\x99\x94\xEC\x84\xB1\xEC\x9D\xB8' \xEC\x84\xB9\xEC\x8B\x9C\xED\x95\x9C \xED\x82\xA4\xED\x8B\xB0\xEC\xA4\x91\xEB\x8F\x85\xEB\x85\x80 NONE/wikipedia - - - Mozilla/4.0%20(compatible;%20MSIE%207.0;%20Windows%20NT%206.0) - - +ssl3003 815122202 2013-02-10T03:01:12.862 0.374 0.0.0.0 FAKE_CACHE_STATUS/400 17734 GET http://pl.wikipedia.org/wiki/Mi\xC5\x82</img></table></i></td></tr></td></tr></table></img></div></div><a href= 
NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1003 363030612 2013-02-10T03:11:03.299 0.266 0.0.0.0 FAKE_CACHE_STATUS/301 621 GET http://en.wikipedia.org/wiki/Tales Of VS. Original Soundtrack (Disc 1) NONE/wikipedia - - - foobar2000/1.0.3 - - +ssl1003 363126032 2013-02-10T03:13:58.975 0.089 0.0.0.0 FAKE_CACHE_STATUS/301 654 GET https://fr.wikipedia.org/wiki/Box-office France 1986 NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1003 364287962 2013-02-10T03:50:17.999 0.060 0.0.0.0 FAKE_CACHE_STATUS/301 656 GET https://fr.wikipedia.org/wiki/Chesterfield (homonymie) NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl1002 370989322 2013-02-10T06:00:27.104 0.046 0.0.0.0 FAKE_CACHE_STATUS/301 659 GET https://fr.wikipedia.org/wiki/Discussion mod\xC3\xA8le:OMIM NONE/wikipedia - - - Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html) - - +ssl3002 823296232 2013-02-10T06:27:48.601 0.151 0.0.0.0 FAKE_CACHE_STATUS/301 1195 GET http://ru.wikipedia.org/wiki/The Electric Mist NONE/wikipedia - - - foobar2000/1.2.2 - - diff --git a/filter.c b/filter.c index 72474b4..68daf4b 100644 --- a/filter.c +++ b/filter.c @@ -16,30 +16,6 @@ */ -/* - -#!/usr/bin/python - -import re -import sys - -dupes = re.compile('^(145\.97\.39\.|66\.230\.200\.|211\.115\.107\.|91\.198\.174\.)') -urlre = re.compile('^http://([^\.]+)\.([^\.]+).org/wiki/([^?]+)') - -projects={"wikipedia":"","wiktionary":".d","wikinews":".n","wikimedia":".m","wikibooks":".b","wikisource":".s","mediawiki":".w","wikiversity":".v","wikiquote":".q" } - -for line in sys.stdin: - ip,undef,bytes,undef,url=line.split()[4:9] - if dupes.match(ip): continue - stuff=urlre.match(url) - if stuff == None: continue - language,project,title = stuff.groups() - if project=="wikimedia" and language not in ["commons","meta","incubator","species"]: 
continue - try: print language + projects[project] + " 1 " + bytes + " " + title - except: continue - -*/ - #define LINESIZE 4096 char *_sep, *_lasttok, *_firsttok; #define TOKENIZE(x,y) _lasttok=NULL; _sep=y; _firsttok=strtok_r(x,y,&_lasttok); @@ -70,11 +46,11 @@ */ char *dupes[] = {"208.80.152.", - "208.80.153.", - "208.80.154.", - "208.80.155.", - "91.198.174.", - NULL}; + "208.80.153.", + "208.80.154.", + "208.80.155.", + "91.198.174.", + NULL}; bool check_ip(char *ip) { char **prefix=dupes; @@ -114,6 +90,20 @@ char *title; char *suffix; } info; + +void replace_space(char *url) { + int len = strlen(url); + if (len==0) { + return; + } + + int i; + for(i = 0; i < len; i++){ + if(url[i] == ' '){ + url[i] = '_'; + } + } +} bool parse_url(char *url, struct info *in) { if (!url) @@ -186,10 +176,11 @@ info.size= FIELD; /* object size */ FIELD; url= FIELD; - if (!parse_url(url,&info)) - continue; + replace_space(url); if (!check_ip(info.ip)) continue; + if (!parse_url(url,&info)) + continue; if (!check_project(&info)) continue; printf("%s%s 1 %s %s\n",info.language, info.suffix, info.size, info.title); diff --git a/test.sh b/test.sh new file mode 100755 index 0000000..dc91912 --- /dev/null +++ b/test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +TEST_SPACELESS_LINES=`cat entries-with-urls-with-spaces-2013-02-10.txt | ./filter | perl -MData::Dumper -ne '@f=split(/\s/,$_,4); print if $f[3] =~ /\ /;' | wc -l` + + + +if [ $TEST_SPACELESS_LINES -eq 0 ]; then + echo "Test1: Spaceless lines in filter PASSED"; +else + echo "Test1: Spaceless lines in filter FAILED"; + exit -1; +fi + + +exit 0; -- To view, visit https://gerrit.wikimedia.org/r/51680 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ic795c876105fe8a1a980f83a0c07a5ba2011fff5 Gerrit-PatchSet: 1 Gerrit-Project: analytics/webstatscollector Gerrit-Branch: time_travel Gerrit-Owner: Demon <ch...@wikimedia.org> Gerrit-Reviewer: Diederik <dvanli...@wikimedia.org> 
_______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits