Ottomata has submitted this change and it was merged.

Change subject: Add pagecounts-all-sites to webrequest dump script
......................................................................
Add pagecounts-all-sites to webrequest dump script Change-Id: I19703b5f9fe2ce2d648c477ad4240574aae66010 --- M bin/refinery-dump-status-webrequest-partitions 1 file changed, 45 insertions(+), 5 deletions(-) Approvals: Ottomata: Verified; Looks good to me, approved diff --git a/bin/refinery-dump-status-webrequest-partitions b/bin/refinery-dump-status-webrequest-partitions index 0ca523a..9e75f7e 100755 --- a/bin/refinery-dump-status-webrequest-partitions +++ b/bin/refinery-dump-status-webrequest-partitions @@ -15,9 +15,10 @@ --datasets DATASET1,DATASET2,... -- Select the datasets to output data for. The following datasets are available: - raw_webrequest -- Raw webrequest (hourly) - webrequest -- webrequest (refined tables) (hourly) - all -- all of the above + pagecounts-all-sites -- pagecounts-all-sites (hourly) + raw_webrequest -- Raw webrequest (hourly) + webrequest -- webrequest (refined tables) (hourly) + all -- all of the above By default, only "raw_webrequest" is shown. @@ -56,6 +57,7 @@ DATASET_VISIBILITIES["$DATASET"]=no } +add_dataset "pagecounts_all_sites" " file name date | page | project |" add_dataset "raw_webrequest" " bits | misc | mobile | text | upload |" add_dataset "webrequest" " bits | misc | mobile | text | upload |" @@ -98,9 +100,9 @@ FOUND_DATASET=no for INNER_DATASET in "${ALL_DATASETS[@]}" do - if [ "$DATASET" = "$INNER_DATASET" ] + if [ "${DATASET//-/_}" = "$INNER_DATASET" ] then - DATASET_VISIBILITIES["$DATASET"]=yes + DATASET_VISIBILITIES["$INNER_DATASET"]=yes FOUND_DATASET=yes fi done @@ -135,6 +137,7 @@ RAW_WEBREQUEST_DATA_DIR_ABS="$HDFS_MOUNT_DIR_ABS/wmf/data/raw/webrequest" RAW_WEBREQUEST_STATISTICS_DIR_ABS="$HDFS_MOUNT_DIR_ABS/wmf/data/raw/webrequests_faulty_hosts" WEBREQUEST_DATA_DIR_ABS="$HDFS_MOUNT_DIR_ABS/wmf/data/wmf/webrequest" +ARCHIVE_DATA_DIR_ABS="$HDFS_MOUNT_DIR_ABS/wmf/data/archive" log_no_lf() { if [ -n "$QUIET" ] @@ -218,6 +221,43 @@ log } +dump_dataset_pagecounts_file() { + local DATASET="$1" + local KIND="$2" + + local 
STATUS="X" + + if [ "$KIND" = page ] + then + FILE_ENDING=".gz" + else + FILE_ENDING="" + fi + + FILE_DATE_PART="$(date --utc -d "$DATE 1 hour" +"%Y/%Y-%m/${KIND}counts-%Y%m%d-%H0000")" + + FILE_ABS="$ARCHIVE_DATA_DIR_ABS/${DATASET//_/-}/$FILE_DATE_PART$FILE_ENDING" + + if [ -e "$FILE_ABS" ] + then + STATUS="." + fi + log_no_lf "$STATUS" +} + +dump_dataset_pagecounts_all_sites() { + local DATE="$1" + local DATASET="pagecounts_all_sites" + + log_no_lf " $(date --utc -d "$DATE 1 hour" +'%Y%m%d-%H0000') |" + for KIND in page project + do + log_no_lf " " + dump_dataset_pagecounts_file "$DATASET" "$KIND" + log_no_lf " |" + done +} + dump_dataset_raw_webrequest_partition() { local DATE_HDFS_PADDED="$1" -- To view, visit https://gerrit.wikimedia.org/r/187418 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I19703b5f9fe2ce2d648c477ad4240574aae66010 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: QChris <christ...@quelltextlich.at> Gerrit-Reviewer: Ottomata <o...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits