jenkins-bot has submitted this change and it was merged.

Change subject: Implement outputdir and compression params for dump-corpora.php
......................................................................


Implement outputdir and compression params for dump-corpora.php

Bug: T133006
Bug: T133007
Change-Id: Ib5488a37f9a8bd235b84db65a96ae8c534a6c6b5
---
M scripts/dump-corpora.php
1 file changed, 61 insertions(+), 16 deletions(-)

Approvals:
  KartikMistry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/scripts/dump-corpora.php b/scripts/dump-corpora.php
index f69fa1c..90af743 100644
--- a/scripts/dump-corpora.php
+++ b/scripts/dump-corpora.php
@@ -20,6 +20,14 @@
 use ContentTranslation\Translation;
 
 class CXCorporaDump extends Maintenance {
+       private static $sinkTypes = [
+               'file' => [ 'DumpFileOutput', '' ],
+               'gzip' => [ 'DumpGZipOutput', '.gz' ],
+               'bzip2' => [ 'DumpBZip2Output', '.bz2' ],
+               'dbzip2' => [ 'DumpDBZip2Output', '.bz2' ],
+               '7zip' => [ 'Dump7ZipOutput', '.7z' ],
+       ];
+
        public function __construct() {
                parent::__construct();
                $this->mDescription = 'Script to produce parallel corpora dumps from CX translations.';
@@ -46,6 +54,20 @@
                );
 
                $this->addOption(
+                       'compression',
+                       '(optional) Compression. Available formats gzip, bzip2, dbzip2, 7zip.',
+                       false, /*required*/
+                       true /*has arg*/
+               );
+
+               $this->addOption(
+                       'outputdir',
+                       '(optional) Location to place the file(s). Defaults to current working directory.',
+                       false, /*required*/
+                       true /*has arg*/
+               );
+
+               $this->addOption(
                        'split-at',
                        '(optional) If there are more than this published articles, also create split dumps.',
                        false, /*required*/
@@ -61,15 +83,38 @@
                $this->tags = [];
        }
 
+       private function getPath( $source, $target ) {
+               $path = $this->path;
+               $source = $source ?: '_';
+               $target = $target ?: '_';
+               $type = $this->type;
+               $format = $this->format;
+               $ext = self::$sinkTypes[$this->sinkType][1];
+
+               return "$path/cx-corpora.{$source}2{$target}.$type.$format$ext";
+       }
+
+       private function getSink( $path ) {
+               $class = self::$sinkTypes[$this->sinkType][0];
+               return new $class( $path );
+       }
+
        public function execute() {
                $sourceLanguage = $this->getOption( 'source-language', false );
                $targetLanguage = $this->getOption( 'target-language', false );
-               $format = $this->getOption( 'format', 'json' );
                $plain = $this->getOption( 'plaintext', false );
                $split = $this->getOption( 'split-at', false );
-               $type = $plain ? 'text' : 'html';
 
-               $formatSpec = [ $format, $type ];
+               $this->format = $this->getOption( 'format', 'json' );
+               $this->type = $plain ? 'text' : 'html';
+               $this->path = $this->getOption( 'outputdir', '.' );
+               $this->sinkType = $this->getOption( 'compression', 'file' );
+
+               if ( !isset( self::$sinkTypes[ $this->sinkType ] ) ) {
+                       $this->error( 'Unknown compression format given.', 1 );
+               }
+
+               $formatSpec = [ $this->format, $this->type ];
 
                $limit = 999999999;
                $offset = 0;
@@ -114,10 +159,8 @@
                }
 
                if ( !$split ) {
-                       $source = $sourceLanguage ?: '_';
-                       $target = $targetLanguage ?: '_';
-                       $filename = "cx-corpora.{$source}2{$target}.$type.$format";
-                       $this->export( $formatSpec, $filename, $translations, $source );
+                       $path = $this->getPath( $sourceLanguage, $targetLanguage );
+                       $this->export( $formatSpec, $path, $translations, $sourceLanguage ?: '_' );
 
                        return;
                }
@@ -129,8 +172,8 @@
                                        continue;
                                }
 
-                               $filename = "cx-corpora.{$sourceLanguage}2{$targetLanguage}.$type.$format";
-                               $this->export( $formatSpec, $filename, $targets, $sourceLanguage );
+                               $path = $this->getPath( $sourceLanguage, $targetLanguage );
+                               $this->export( $formatSpec, $path, $targets, $sourceLanguage );
                                unset( $sorted[$targetLanguage][$sourceLanguage] );
                        }
 
@@ -147,15 +190,16 @@
                                continue;
                        }
 
-                       $filename = "cx-corpora._2{$targetLanguage}.$type.$format";
-                       $this->export( $formatSpec, $filename, $targets, '_' );
+                       $path = $this->getPath( false, $targetLanguage );
+                       $this->export( $formatSpec, $path, $targets, '_' );
                        unset( $sorted[$targetLanguage] );
                }
 
                if ( count( $sorted ) ) {
                        $targets = call_user_func_array( 'array_merge', $sorted );
-                       $filename = "cx-corpora._2_.$type.$format";
-                       $this->export( $formatSpec, $filename, $targets, '_' );
+
+                       $path = $this->getPath( false, false );
+                       $this->export( $formatSpec, $path, $targets, '_' );
                }
        }
 
@@ -179,7 +223,7 @@
                return $sorted;
        }
 
-       public function export( $formatSpec, $filename, array $targets, $sourceLanguage ) {
+       public function export( $formatSpec, $path, array $targets, $sourceLanguage ) {
                $data = null;
 
                list( $format, $type ) = $formatSpec;
@@ -193,8 +237,9 @@
                }
 
                if ( $data ) {
-                       file_put_contents( $filename, $data );
-                       $this->output( "$filename\n" );
+                       $sink = $this->getSink( $path );
+                       $sink->write( $data );
+                       $this->output( $sink->getFilenames() . "\n" );
                }
        }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/288595
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ib5488a37f9a8bd235b84db65a96ae8c534a6c6b5
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/extensions/ContentTranslation
Gerrit-Branch: master
Gerrit-Owner: Nikerabbit <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: KartikMistry <[email protected]>
Gerrit-Reviewer: Santhosh <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to