Hoo man has submitted this change and it was merged.

Change subject: Support multiple compression formats for dumps
......................................................................


Support multiple compression formats for dumps

Turn the compression format into a config variable thereby allowing
both gzip and bzip2 dump files.

This deprecates one i18n message and requires config.json be updated.

Additionally:
* Breaks out i18n for distributions
* Introduces and renames some variables to improve code legibility
* Updates README with this and one older change

Bug: T118397
Change-Id: Ieb517b5ad677abaa541274d69155855b95787f12
---
M DCAT.php
M README.md
M config.example.json
M i18n/en.json
M i18n/qqq.json
5 files changed, 105 insertions(+), 60 deletions(-)

Approvals:
  Hoo man: Verified; Looks good to me, approved



diff --git a/DCAT.php b/DCAT.php
index 9904932..8d2bc6d 100644
--- a/DCAT.php
+++ b/DCAT.php
@@ -42,7 +42,9 @@
        }
        if ( $config['dumps-enabled'] ) {
                array_push( $top, "dump-info" );
-               $sub["dump-info"] = array( "accessURL", "mediatype", "license" 
);
+               $sub["dump-info"] = array(
+                       "accessURL", "mediatype", "compression", "license"
+               );
        }
 
        // Test
@@ -149,12 +151,12 @@
  * @param XmlWriter $xml XML stream to write to
  * @param array $data data-blob of i18n and config variables
  * @param string|null $dumpDate the date of the dumpfile, null for live data
- * @param string $format the fileformat
+ * @param string $dumpKey the key for the corresponding dump file
  */
-function dumpDistributionExtras( XMLWriter $xml, array $data, $dumpDate, 
$format ) {
+function dumpDistributionExtras( XMLWriter $xml, array $data, $dumpDate, 
$dumpKey ) {
        $url = str_replace(
                '$1',
-               $dumpDate . '/' . 
$data['dumps'][$dumpDate][$format]['filename'],
+               $dumpDate . '/' . 
$data['dumps'][$dumpDate][$dumpKey]['filename'],
                $data['config']['dump-info']['accessURL']
        );
 
@@ -169,14 +171,50 @@
        $xml->startElementNS( 'dcterms', 'issued', null );
        $xml->writeAttributeNS( 'rdf', 'datatype', null,
                'http://www.w3.org/2001/XMLSchema#date' );
-       $xml->text( $data['dumps'][$dumpDate][$format]['timestamp'] );
+       $xml->text( $data['dumps'][$dumpDate][$dumpKey]['timestamp'] );
        $xml->endElement();
 
        $xml->startElementNS( 'dcat', 'byteSize', null );
        $xml->writeAttributeNS( 'rdf', 'datatype', null,
                'http://www.w3.org/2001/XMLSchema#decimal' );
-       $xml->text( $data['dumps'][$dumpDate][$format]['byteSize'] );
+       $xml->text( $data['dumps'][$dumpDate][$dumpKey]['byteSize'] );
        $xml->endElement();
+}
+
+/**
+ * Add i18n descriptions for a distribution
+ *
+ * @param XmlWriter $xml XML stream to write to
+ * @param array $data data-blob of i18n and config variables
+ * @param bool $isDump whether this is a dump distribution
+ * @param string $prefix the type of distribution, one of ld, api or dump
+ * @param string $format the file format, if dump
+ * @param string $compression the compression format, if dump
+ */
+function writeDistributionI18n( XMLWriter $xml, array $data, $isDump,
+       $prefix, $format, $compression ) {
+
+       foreach ( $data['i18n'] as $langCode => $langData ) {
+               if ( array_key_exists( "distribution-$prefix-description", 
$langData ) ) {
+                       $formatDescription = 
$langData["distribution-$prefix-description"];
+                       if ( $isDump ) {
+                               $formatDescription = str_replace(
+                                       '$1',
+                                       $format,
+                                       $formatDescription
+                               );
+                               $formatDescription = str_replace(
+                                       '$2',
+                                       $compression,
+                                       $formatDescription
+                               );
+                       }
+                       $xml->startElementNS( 'dcterms', 'description', null );
+                       $xml->writeAttributeNS( 'xml', 'lang', null, $langCode 
);
+                       $xml->text( $formatDescription );
+                       $xml->endElement();
+               }
+       }
 }
 
 /**
@@ -193,56 +231,55 @@
 function writeDistribution( XMLWriter $xml, array $data, $distribId, $prefix, 
$dumpDate ) {
        $ids = array();
 
+       $isDump = !is_null( $dumpDate );
        $allowedMediatypes = $data['config']["$prefix-info"]['mediatype'];
-       foreach ( $allowedMediatypes as $format => $mediatype ) {
-               // handle missing (and BETA) dump files
-               if ( !is_null( $dumpDate ) and !array_key_exists( $format, 
$data['dumps'][$dumpDate] ) ) {
-                       continue;
-               }
+       $allowedCompressiontypes = array( '' => '' );  // dummy array for 
non-dumps
+       if ( $isDump ) {
+               $allowedCompressiontypes = 
$data['config']["$prefix-info"]['compression'];
+       }
 
-               $id = $data['config']['uri'] . '#' . $distribId . $dumpDate . 
$format;
-               array_push( $ids, $id );
+       foreach ( $allowedCompressiontypes as $compressionName => $compression 
) {
+               foreach ( $allowedMediatypes as $format => $mediatype ) {
+                       $distributionKey = $format . $compression;
 
-               $xml->startElementNS( 'rdf', 'Description', null );
-               $xml->writeAttributeNS( 'rdf', 'about', null, $id );
-
-               $xml->startElementNS( 'rdf', 'type', null );
-               $xml->writeAttributeNS( 'rdf', 'resource', null,
-                       'http://www.w3.org/ns/dcat#Distribution' );
-               $xml->endElement();
-
-               $xml->startElementNS( 'dcterms', 'license', null );
-               $xml->writeAttributeNS( 'rdf', 'resource', null,
-                       $data['config']["$prefix-info"]['license'] );
-               $xml->endElement();
-
-               if ( is_null( $dumpDate ) ) {
-                       $xml->startElementNS( 'dcat', 'accessURL', null );
-                       $xml->writeAttributeNS( 'rdf', 'resource', null,
-                               $data['config']["$prefix-info"]['accessURL'] );
-                       $xml->endElement();
-               } else {
-                       dumpDistributionExtras( $xml, $data, $dumpDate, $format 
);
-               }
-
-               $xml->writeElementNS( 'dcterms', 'format', null, $mediatype );
-
-               // add description in each language
-               foreach ( $data['i18n'] as $langCode => $langData ) {
-                       if ( array_key_exists( 
"distribution-$prefix-description", $langData ) ) {
-                               $formatDescription = str_replace(
-                                       '$1',
-                                       $format,
-                                       
$langData["distribution-$prefix-description"]
-                               );
-                               $xml->startElementNS( 'dcterms', 'description', 
null );
-                               $xml->writeAttributeNS( 'xml', 'lang', null, 
$langCode );
-                               $xml->text( $formatDescription );
-                               $xml->endElement();
+                       // handle missing (and BETA) dump files
+                       if ( $isDump and !array_key_exists( $distributionKey , 
$data['dumps'][$dumpDate] ) ) {
+                               continue;
                        }
-               }
 
-               $xml->endElement();
+                       $id = $data['config']['uri'] . '#' . $distribId . 
$dumpDate . $distributionKey;
+                       array_push( $ids, $id );
+
+                       $xml->startElementNS( 'rdf', 'Description', null );
+                       $xml->writeAttributeNS( 'rdf', 'about', null, $id );
+
+                       $xml->startElementNS( 'rdf', 'type', null );
+                       $xml->writeAttributeNS( 'rdf', 'resource', null,
+                               'http://www.w3.org/ns/dcat#Distribution' );
+                       $xml->endElement();
+
+                       $xml->startElementNS( 'dcterms', 'license', null );
+                       $xml->writeAttributeNS( 'rdf', 'resource', null,
+                               $data['config']["$prefix-info"]['license'] );
+                       $xml->endElement();
+
+                       if ( !$isDump ) {
+                               $xml->startElementNS( 'dcat', 'accessURL', null 
);
+                               $xml->writeAttributeNS( 'rdf', 'resource', null,
+                                       
$data['config']["$prefix-info"]['accessURL'] );
+                               $xml->endElement();
+                       } else {
+                               dumpDistributionExtras( $xml, $data, $dumpDate, 
$distributionKey );
+                       }
+
+                       $xml->writeElementNS( 'dcterms', 'format', null, 
$mediatype );
+
+                       // add description in each language
+                       writeDistributionI18n( $xml, $data, $isDump, $prefix,
+                               $format, $compressionName );
+
+                       $xml->endElement();
+               }
        }
 
        return $ids;
@@ -584,8 +621,10 @@
  */
 function scanDump( $dirname, array $data ) {
        $testStrings = array();
-       foreach ( $data['config']['dump-info']['mediatype'] as $fileEnding => 
$mediatype ) {
-               $testStrings[$fileEnding] = 'all.' . $fileEnding . '.gz';
+       foreach ( $data['config']['dump-info']['compression'] as $compression ) 
{
+               foreach ( $data['config']['dump-info']['mediatype'] as $format 
=> $mediatype ) {
+                       $testStrings["$format$compression"] = '-all.' . $format 
. '.' . $compression;
+               }
        }
 
        $dumps = array();
@@ -594,7 +633,7 @@
        foreach ( glob( $dirname . '/[0-9]*', GLOB_ONLYDIR ) as $subdir ) {
                // $subdir = testdirNew/20150120
                $subDump = array();
-               foreach ( glob( $subdir . '/*.gz' ) as $filename ) {
+               foreach ( glob( $subdir . '/*' ) as $filename ) {
                        // match each file against an expected testString
                        foreach ( $testStrings as $fileEnding => $testString ) {
                                if ( substr( $filename, -strlen( $testString ) 
) === $testString ) {
diff --git a/README.md b/README.md
index f2791bf..76fb017 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 
 *   Content negotiation (various formats)
 *   MediaWiki api (various formats)
-*   Entity dumps e.g. json, ttl (assumes that these are gziped)
+*   Entity dumps e.g. json, ttl (assumes that these are compressed)
 
 An example result can be found at [lokal-profil / 
dcatap.rdf](https://gist.github.com/lokal-profil/8086dc6bf2398d84a311).
 The live DCAT-AP description of Wikidata can be found 
[here](https://dumps.wikimedia.org/wikidatawiki/entities/dcatap.rdf).
@@ -102,6 +102,8 @@
     *   `accessURL`: URL to the directory where the *.json.gz* files
         reside (`$1` is replaced on the fly by the actual filename),
         e.g. *http://example.org/dumps/$1*
-    *   `mediatype`: (`object`) List of media types. In practice this is
-        always `{"json": "application/json"}` ... for now
+    *   `mediatype`: (`object`) List of media types. e.g.
+        `{"json": "application/json"}`
+    *   `compression`: (`object`) List of compression formats, in the
+        format *name:file-ending* e.g. `{"gzip": "gz"}`
     *   `license`: See ld-info:license above
diff --git a/config.example.json b/config.example.json
index e082fa9..bc1e6f9 100644
--- a/config.example.json
+++ b/config.example.json
@@ -45,6 +45,10 @@
             "json": "application/json",
             "ttl": "text/turtle"
         },
+        "compression": {
+            "gzip": "gz",
+            "bzip2": "bz2"
+        },
         "license": "http://creativecommons.org/publicdomain/zero/1.0/"
     }
 }
diff --git a/i18n/en.json b/i18n/en.json
index d767a75..ee81494 100644
--- a/i18n/en.json
+++ b/i18n/en.json
@@ -5,8 +5,8 @@
        "dataset-live-title": "Live access",
        "dataset-live-description": "The live version of the data, includes 
entities and properties. Only non-deprecated formats are listed as 
distributions.",
        "dataset-dump-title": "Entity dump of $1",
-       "dataset-dump-description": "A static dump of all entites for the given 
date.",
+       "dataset-dump-description": "A static dump of all entities for the 
given date.",
        "distribution-ld-description": "The Linked Data endpoint. Format is 
resolved through content negotiation.",
        "distribution-api-description": "The MediaWiki API endpoint. Format is 
given through the \"format\" parameter.",
-       "distribution-dump-description": "A gziped $1 file."
+       "distribution-dump-description": "A $1 file, $2 compressed."
 }
diff --git a/i18n/qqq.json b/i18n/qqq.json
index c669eb1..12b1dac 100644
--- a/i18n/qqq.json
+++ b/i18n/qqq.json
@@ -10,5 +10,5 @@
        "dataset-dump-description": "The description of the entity dump for the 
given date.",
        "distribution-ld-description": "The description of the Linked Data 
endpoint. For content negotiation see 
https://en.wikipedia.org/wiki/Content_negotiation",
        "distribution-api-description": "The description of the MediaWiki API 
endpoint. Leave \"format\" untranslated.",
-       "distribution-dump-description": "The description of a dump file where 
$1 is the file format."
+       "distribution-dump-description": "The description of a dump file where 
$1 is the file format and $2 the compression format."
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/262422
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ieb517b5ad677abaa541274d69155855b95787f12
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/dcat
Gerrit-Branch: master
Gerrit-Owner: Lokal Profil <[email protected]>
Gerrit-Reviewer: Hoo man <[email protected]>
Gerrit-Reviewer: Lokal Profil <[email protected]>
Gerrit-Reviewer: Nikerabbit <[email protected]>
Gerrit-Reviewer: Raimond Spekking <[email protected]>
Gerrit-Reviewer: Siebrand <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to