jenkins-bot has submitted this change and it was merged. Change subject: Update groupStatistics.php to use Maintenance class ......................................................................
Update groupStatistics.php to use Maintenance class Change-Id: I0298d797eecdab6d49f94a5b4a8859d69e008f9f --- M scripts/groupStatistics.php 1 file changed, 659 insertions(+), 628 deletions(-) Approvals: Nikerabbit: Looks good to me, approved jenkins-bot: Verified diff --git a/scripts/groupStatistics.php b/scripts/groupStatistics.php index 5f5d5f9..3bc225b 100644 --- a/scripts/groupStatistics.php +++ b/scripts/groupStatistics.php @@ -1,7 +1,7 @@ <?php /** * Commandline script to general statistics about the localisation level of - * localisation for one or more message groups. + * one or more message groups. * * @file * @ingroup Script Stats @@ -11,176 +11,667 @@ * @license GPL-2.0+ */ -/** - * Array of the most spoken languages in the world. - * Source: http://stats.wikimedia.org/EN/Sitemap.htm. - * - * Key value pairs of: - * [MediaWiki localisation code] => array( - * [position in top 50], - * [speakers in millions], - * [continent where localisation is spoken] - * ) - * - */ -$mostSpokenLanguages = array( - 'en' => array( 1, 1500, 'multiple' ), - 'zh-hans' => array( 2, 1300, 'asia' ), - 'zh-hant' => array( 2, 1300, 'asia' ), - 'hi' => array( 3, 550, 'asia' ), - 'ar' => array( 4, 530, 'multiple' ), - 'es' => array( 5, 500, 'multiple' ), - 'ms' => array( 6, 300, 'asia' ), - 'pt' => array( 7, 290, 'multiple' ), - 'pt-br' => array( 7, 290, 'america' ), - 'ru' => array( 8, 278, 'multiple' ), - 'id' => array( 9, 250, 'asia' ), - 'bn' => array( 10, 230, 'asia' ), - 'fr' => array( 11, 200, 'multiple' ), - 'de' => array( 12, 185, 'europe' ), - 'ja' => array( 13, 132, 'asia' ), - 'fa' => array( 14, 107, 'asia' ), - 'pnb' => array( 15, 104, 'asia' ), // Most spoken variant - 'tl' => array( 16, 90, 'asia' ), - 'mr' => array( 17, 90, 'asia' ), - 'vi' => array( 18, 80, 'asia' ), - 'jv' => array( 19, 80, 'asia' ), - 'te' => array( 20, 80, 'asia' ), - 'ko' => array( 21, 78, 'asia' ), - 'wuu' => array( 22, 77, 'asia' ), - 'arz' => array( 23, 76, 'africa' ), - 'th' => array( 24, 73, 'asia' ), - 'yue' => array( 25, 71, 'asia' ), - 'tr' => array( 26, 70, 'multiple' ), - 'it' => array( 27, 70, 'europe' ), - 'ta' => array( 28, 66, 'asia' ), - 'ur' => array( 29, 60, 'asia' ), - 'my' => array( 30, 52, 'asia' ), - 'sw' => array( 31, 50, 'africa' ), - 'nan' => array( 32, 49, 'asia' ), - 'kn' => array( 33, 47, 'asia' ), - 'gu' => array( 34, 46, 'asia' ), - 'uk' => array( 35, 45, 'europe' ), - 'pl' => array( 36, 43, 'europe' ), - 'sd' => array( 37, 41, 'asia' ), - 'ha' => array( 38, 39, 'africa' ), - 'ml' => array( 39, 37, 'asia' ), - 'gan-hans' => array( 40, 35, 'asia' ), - 'gan-hant' => array( 40, 35, 'asia' ), - 'hak' => array( 41, 34, 'asia' ), - 'or' => array( 42, 31, 'asia' ), - 'ne' => array( 43, 30, 'asia' ), - 'ro' => array( 44, 28, 'europe' ), - 'su' => array( 45, 27, 'asia' ), - 'az' => array( 46, 27, 'asia' ), - 'nl' => array( 47, 27, 'europe' ), - 'zu' => array( 48, 26, 'africa' ), - 'ps' => array( 49, 26, 'asia' ), - 'ckb' => array( 50, 26, 'asia' ), - 'ku-latn' => array( 50, 26, 'asia' ), -); +// Standard boilerplate to define $IP +if ( getenv( 'MW_INSTALL_PATH' ) !== false ) { + $IP = getenv( 'MW_INSTALL_PATH' ); +} else { + $dir = __DIR__; + $IP = "$dir/../../.."; +} +require_once "$IP/maintenance/Maintenance.php"; -/** - * Variable with key-value pairs with a named index and an array of key-value - * pairs where the key is a MessageGroup ID and the value is a weight of the - * group in the sum of the values for all the groups in the array. - * - * Definitions in this variable can be used to report weighted meta localisation - * scores for the 50 most spoken languages. - * - * @todo Allow weighted reporting for all available languges. - */ -$localisedWeights = array( - 'wikimedia' => array( - 'core-0-mostused' => 40, - 'core' => 30, - 'ext-0-wikimedia' => 30 - ), - 'fundraiser' => array( - 'ext-di-di' => 16, - 'ext-di-pfpg' => 84, - ), - 'mediawiki' => array( - 'core-0-mostused' => 30, - 'core' => 30, - 'ext-0-wikimedia' => 20, - 'ext-0-all' => 20 - ) -); +class GroupStatictics extends Maintenance { + /** + * Array of the most spoken languages in the world. + * Source: http://stats.wikimedia.org/EN/Sitemap.htm. + * + * Key value pairs of: + * [MediaWiki localisation code] => array( + * [position in top 50], + * [speakers in millions], + * [continent where localisation is spoken] + * ) + */ + public $mostSpokenLanguages = array( + 'en' => array( 1, 1500, 'multiple' ), + 'zh-hans' => array( 2, 1300, 'asia' ), + 'zh-hant' => array( 2, 1300, 'asia' ), + 'hi' => array( 3, 550, 'asia' ), + 'ar' => array( 4, 530, 'multiple' ), + 'es' => array( 5, 500, 'multiple' ), + 'ms' => array( 6, 300, 'asia' ), + 'pt' => array( 7, 290, 'multiple' ), + 'pt-br' => array( 7, 290, 'america' ), + 'ru' => array( 8, 278, 'multiple' ), + 'id' => array( 9, 250, 'asia' ), + 'bn' => array( 10, 230, 'asia' ), + 'fr' => array( 11, 200, 'multiple' ), + 'de' => array( 12, 185, 'europe' ), + 'ja' => array( 13, 132, 'asia' ), + 'fa' => array( 14, 107, 'asia' ), + 'pnb' => array( 15, 104, 'asia' ), // Most spoken variant + 'tl' => array( 16, 90, 'asia' ), + 'mr' => array( 17, 90, 'asia' ), + 'vi' => array( 18, 80, 'asia' ), + 'jv' => array( 19, 80, 'asia' ), + 'te' => array( 20, 80, 'asia' ), + 'ko' => array( 21, 78, 'asia' ), + 'wuu' => array( 22, 77, 'asia' ), + 'arz' => array( 23, 76, 'africa' ), + 'th' => array( 24, 73, 'asia' ), + 'yue' => array( 25, 71, 'asia' ), + 'tr' => array( 26, 70, 'multiple' ), + 'it' => array( 27, 70, 'europe' ), + 'ta' => array( 28, 66, 'asia' ), + 'ur' => array( 29, 60, 'asia' ), + 'my' => array( 30, 52, 'asia' ), + 'sw' => array( 31, 50, 'africa' ), + 'nan' => array( 32, 49, 'asia' ), + 'kn' => array( 33, 47, 'asia' ), + 'gu' => array( 34, 46, 'asia' ), + 'uk' => array( 35, 45, 'europe' ), + 'pl' => array( 36, 43, 'europe' ), + 'sd' => array( 37, 41, 'asia' ), + 'ha' => array( 38, 39, 'africa' ), + 'ml' => array( 39, 37, 'asia' ), + 'gan-hans' => array( 40, 35, 'asia' ), + 'gan-hant' => array( 40, 35, 'asia' ), + 'hak' => array( 41, 34, 'asia' ), + 'or' => array( 42, 31, 'asia' ), + 'ne' => array( 43, 30, 'asia' ), + 'ro' => array( 44, 28, 'europe' ), + 'su' => array( 45, 27, 'asia' ), + 'az' => array( 46, 27, 'asia' ), + 'nl' => array( 47, 27, 'europe' ), + 'zu' => array( 48, 26, 'africa' ), + 'ps' => array( 49, 26, 'asia' ), + 'ckb' => array( 50, 26, 'asia' ), + 'ku-latn' => array( 50, 26, 'asia' ), + ); -/** - * Code map to map localisation codes to Wikimedia project codes. Only - * exclusion and remapping is defined here. It is assumed that the first part - * of the localisation code is the WMF project name otherwise (zh-hans -> zh). - */ -$wikimediaCodeMap = array( - // Codes containing a dash - 'bat-smg' => 'bat-smg', - 'cbk-zam' => 'cbk-zam', - 'map-bms' => 'map-bms', - 'nds-nl' => 'nds-nl', - 'roa-rup' => 'roa-rup', - 'roa-tara' => 'roa-tara', + /** + * Variable with key-value pairs with a named index and an array of key-value + * pairs where the key is a MessageGroup ID and the value is a weight of the + * group in the sum of the values for all the groups in the array. + * + * Definitions in this variable can be used to report weighted meta localisation + * scores for the 50 most spoken languages. + * + * @todo Allow weighted reporting for all available languges. + */ + public $localisedWeights = array( + 'wikimedia' => array( + 'core-0-mostused' => 40, + 'core' => 30, + 'ext-0-wikimedia' => 30 + ), + 'fundraiser' => array( + 'ext-di-di' => 16, + 'ext-di-pfpg' => 84, + ), + 'mediawiki' => array( + 'core-0-mostused' => 30, + 'core' => 30, + 'ext-0-wikimedia' => 20, + 'ext-0-all' => 20 + ) + ); - // Remaps - 'be-tarask' => 'be-x-old', - 'gsw' => 'als', - 'ike-cans' => 'iu', - 'ike-latn' => 'iu', - 'lzh' => 'zh-classical', - 'nan' => 'zh-min-nan', - 'vro' => 'fiu-vro', - 'yue' => 'zh-yue', + /** + * Code map to map localisation codes to Wikimedia project codes. Only + * exclusion and remapping is defined here. It is assumed that the first part + * of the localisation code is the WMF project name otherwise (zh-hans -> zh). + */ + public $wikimediaCodeMap = array( + // Codes containing a dash + 'bat-smg' => 'bat-smg', + 'cbk-zam' => 'cbk-zam', + 'map-bms' => 'map-bms', + 'nds-nl' => 'nds-nl', + 'roa-rup' => 'roa-rup', + 'roa-tara' => 'roa-tara', - // Ignored language codes. See reason. - 'als' => '', // gsw - 'be-x-old' => '', // be-tarask - 'crh' => '', // crh-* - 'de-at' => '', // de - 'de-ch' => '', // de - 'de-formal' => '', // de, not reporting formal form - 'dk' => '', // da - 'en-au' => '', // en - 'en-ca' => '', // no MW code - 'en-gb' => '', // no MW code - 'es-419' => '', // no MW code - 'fiu-vro' => '', // vro - 'gan' => '', // gan-* - 'got' => '', // extinct. not reporting formal form - 'hif' => '', // hif-* - 'hu-formal' => '', // not reporting - 'iu' => '', // ike-* - 'kk' => '', // kk-* - 'kk-cn' => '', // kk-arab - 'kk-kz' => '', // kk-cyrl - 'kk-tr' => '', // kk-latn - 'ko-kp' => '', // ko - 'ku' => '', // ku-* - 'ku-arab' => '', // ckb - 'nb' => '', // no - 'nl-be' => '', // no MW code - 'nl-informal' => '', // nl, not reporting informal form - 'ruq' => '', // ruq-* - 'simple' => '', // en - 'sr' => '', // sr-* - 'tg' => '', // tg-* - 'tp' => '', // tokipona - 'tt' => '', // tt-* - 'ug' => '', // ug-* - 'zh' => '', // zh-* - 'zh-classical' => '', // lzh - 'zh-cn' => '', // zh - 'zh-sg' => '', // zh - 'zh-hk' => '', // zh - 'zh-min-nan' => '', // - 'zh-mo' => '', // zh - 'zh-my' => '', // zh - 'zh-tw' => '', // zh - 'zh-yue' => '', // yue -); + // Remaps + 'be-tarask' => 'be-x-old', + 'gsw' => 'als', + 'ike-cans' => 'iu', + 'ike-latn' => 'iu', + 'lzh' => 'zh-classical', + 'nan' => 'zh-min-nan', + 'vro' => 'fiu-vro', + 'yue' => 'zh-yue', -$optionsWithArgs = array( 'groups', 'output', 'skiplanguages', 'legenddetail', 'legendsummary' ); -require __DIR__ . '/cli.inc'; + // Ignored language codes. See reason. + 'als' => '', // gsw + 'be-x-old' => '', // be-tarask + 'crh' => '', // crh-* + 'de-at' => '', // de + 'de-ch' => '', // de + 'de-formal' => '', // de, not reporting formal form + 'dk' => '', // da + 'en-au' => '', // en + 'en-ca' => '', // no MW code + 'en-gb' => '', // no MW code + 'es-419' => '', // no MW code + 'fiu-vro' => '', // vro + 'gan' => '', // gan-* + 'got' => '', // extinct. not reporting formal form + 'hif' => '', // hif-* + 'hu-formal' => '', // not reporting + 'iu' => '', // ike-* + 'kk' => '', // kk-* + 'kk-cn' => '', // kk-arab + 'kk-kz' => '', // kk-cyrl + 'kk-tr' => '', // kk-latn + 'ko-kp' => '', // ko + 'ku' => '', // ku-* + 'ku-arab' => '', // ckb + 'nb' => '', // no + 'nl-be' => '', // no MW code + 'nl-informal' => '', // nl, not reporting informal form + 'ruq' => '', // ruq-* + 'simple' => '', // en + 'sr' => '', // sr-* + 'tg' => '', // tg-* + 'tp' => '', // tokipona + 'tt' => '', // tt-* + 'ug' => '', // ug-* + 'zh' => '', // zh-* + 'zh-classical' => '', // lzh + 'zh-cn' => '', // zh + 'zh-sg' => '', // zh + 'zh-hk' => '', // zh + 'zh-min-nan' => '', // + 'zh-mo' => '', // zh + 'zh-my' => '', // zh + 'zh-tw' => '', // zh + 'zh-yue' => '', // yue + ); + + public function __construct() { + parent::__construct(); + $this->mDescription = 'Script to generate statistics about the localisation ' . + 'level of one or more message groups'; + $this->addOption( + 'groups', + '(optional) Comma separated list of groups', + false, /*required*/ + true /*has arg*/ + ); + $this->addOption( + 'output', + '(optional) csv: Comma Separated Values, wiki: MediaWiki syntax, ' . + 'text: Text with tabs. Default: default', + false, /*required*/ + true /*has arg*/ + ); + $this->addOption( + 'skiplanguages', + '(optional) Comma separated list of languages to be skipped', + false, /*required*/ + true /*has arg*/ + ); + $this->addOption( + 'skipzero', + '(optional) Skip languages that do not have any localisation at all' + ); + $this->addOption( + 'legenddetail', + '(optional) Page name for legend to be transcluded at the top of the details table', + false, /*required*/ + true /*has arg*/ + ); + $this->addOption( + 'legendsummary', + '(optional) Page name for legend to be transcluded at the top of the summary table', + false, /*required*/ + true /*has arg*/ + ); + $this->addOption( + 'fuzzy', + '(optional) Add column for fuzzy counts' + ); + $this->addOption( + 'speakers', + '(optional) Add column for number of speakers (est.). ' . + 'Only valid when combined with "most"' + ); + $this->addOption( + 'nol10n', + '(optional) Do not add localised language name if I18ntags is installed' + ); + $this->addOption( + 'continent', + '(optional) Add a continent column. Only available when output is ' . + '"wiki" or not specified.' + ); + $this->addOption( + 'summary', + '(optional) Add a summary with counts and scores per continent category ' . + 'and totals. Only available for a valid "most" value.', + false, /*required*/ + true /*has arg*/ + ); + $this->addOption( + 'wmfscore', + 'Only output WMF language code and weighted score for all ' . + 'language codes for weighing group "wikimedia" in CSV. This ' . + 'report must keep a stable layout as it is used/will be ' . + 'used in the Wikimedia statistics.' + ); + $this->addOption( + 'most', + '(optional) "mediawiki" or "wikimedia". Report on the 50 most ' . + 'spoken languages. Skipzero is ignored. If a valid scope is ' . + 'defined, the group list and fuzzy are ignored and the ' . + 'localisation levels are weighted and reported.', + false, /*required*/ + true /*has arg*/ + ); + } + + public function execute() { + $output = $this->getOption( 'output', 'default' ); + + // Select an output engine + switch ( $output ) { + case 'wiki': + $out = new wikiStatsOutput(); + break; + case 'text': + $out = new textStatsOutput(); + break; + case 'csv': + $out = new csvStatsOutput(); + break; + default: + $out = new TranslateStatsOutput(); + } + + $skipLanguages = array(); + if ( $this->hasOption( 'skiplanguages' ) ) { + $skipLanguages = array_map( + 'trim', + explode( ',', $this->getOption( 'skiplanguages' ) ) + ); + } + + $reportScore = false; + // Check if score should be reported and prepare weights + $most = $this->getOption( 'most' ); + $weights = array(); + if ( $most && isset( $localisedWeights[$most] ) ) { + $reportScore = true; + + foreach ( $localisedWeights[$most] as $weight ) { + $weights[] = $weight; + } + } + + // check if l10n should be done + $l10n = false; + if ( ( $output === 'wiki' || $output === 'default' ) && + !$this->hasOption( 'nol10n' ) + ) { + $l10n = true; + } + + $wmfscore = $this->hasOption( 'wmfscore ' ); + + // Get groups from input + $groups = array(); + if ( $reportScore ) { + $reqGroups = array_keys( $this->localisedWeights[$most] ); + } elseif ( !$wmfscore ) { + $reqGroups = array_map( 'trim', explode( ',', $this->getOption( 'groups' ) ) ); + } else { + $reqGroups = array_keys( $this->localisedWeights['wikimedia'] ); + } + + // List of all groups + $allGroups = MessageGroups::singleton()->getGroups(); + + // Get list of valid groups + foreach ( $reqGroups as $id ) { + // Page translation group ids use spaces which are not nice on command line + $id = str_replace( '_', ' ', $id ); + if ( isset( $allGroups[$id] ) ) { + $groups[$id] = $allGroups[$id]; + } else { + $this->output( "Unknown group: $id" ); + } + } + + if ( $wmfscore ) { + // Override/set parameters + $out = new csvStatsOutput(); + $reportScore = true; + + $weights = array(); + foreach ( $this->localisedWeights['wikimedia'] as $weight ) { + $weights[] = $weight; + } + $wmfscores = array(); + } + + if ( !count( $groups ) ) { + showUsage(); + } + + // List of all languages. + $languages = Language::fetchLanguageNames( false ); + // Default sorting order by language code, users can sort wiki output. + ksort( $languages ); + + if ( $this->hasOption( 'legenddetail' ) ) { + $out->addFreeText( "{{" . $this->getOption( 'legenddetail' ) . "}}\n" ); + } + + $totalWeight = 0; + if ( $reportScore ) { + if ( $wmfscore ) { + foreach ( $this->localisedWeights['wikimedia'] as $weight ) { + $totalWeight += $weight; + } + } else { + foreach ( $this->localisedWeights[$most] as $weight ) { + $totalWeight += $weight; + } + } + } + + $showContinent = $this->getOption( 'continent' ); + if ( !$wmfscore ) { + // Output headers + $out->heading(); + + $out->blockstart(); + + if ( $most ) { + $out->element( ( $l10n ? "{{int:translate-gs-pos}}" : 'Pos.' ), true ); + } + + $out->element( ( $l10n ? "{{int:translate-gs-code}}" : 'Code' ), true ); + $out->element( ( $l10n ? "{{int:translate-page-language}}" : 'Language' ), true ); + if ( $showContinent ) { + $out->element( ( $l10n ? "{{int:translate-gs-continent}}" : 'Continent' ), true ); + } + + if ( $most && $this->hasOption( 'speakers' ) ) { + $out->element( ( $l10n ? "{{int:translate-gs-speakers}}" : 'Speakers' ), true ); + } + + if ( $reportScore ) { + $out->element( + ( $l10n ? "{{int:translate-gs-score}}" : 'Score' ) . ' (' . $totalWeight . ')', + true + ); + } + + /** + * @var $g MessageGroup + */ + foreach ( $groups as $g ) { + // Add unprocessed description of group as heading + if ( $reportScore ) { + $gid = $g->getId(); + $heading = $g->getLabel() . " (" . $this->localisedWeights[$most][$gid] . ")"; + } else { + $heading = $g->getLabel(); + } + $out->element( $heading, true ); + if ( !$reportScore && $this->hasOption( 'fuzzy' ) ) { + $out->element( ( $l10n ? "{{int:translate-percentage-fuzzy}}" : 'Fuzzy' ), true ); + } + } + + $out->blockend(); + } + + $rows = array(); + foreach ( $languages as $code => $name ) { + // Skip list + if ( in_array( $code, $skipLanguages ) ) { + continue; + } + $rows[$code] = array(); + } + + foreach ( $groups as $groupName => $g ) { + $stats = MessageGroupStats::forGroup( $groupName ); + + // Perform the statistic calculations on every language + foreach ( $languages as $code => $name ) { + // Skip list + if ( !$most && in_array( $code, $skipLanguages ) ) { + continue; + } + + // Do not calculate if we do not need it for anything. + if ( $wmfscore && isset( $wikimediaCodeMap[$code] ) && $wikimediaCodeMap[$code] == '' ) { + continue; + } + + // If --most is set, skip all other + if ( $most && !isset( $mostSpokenLanguages[$code] ) ) { + continue; + } + + $total = $stats[$code][MessageGroupStats::TOTAL]; + $translated = $stats[$code][MessageGroupStats::TRANSLATED]; + $fuzzy = $stats[$code][MessageGroupStats::FUZZY]; + + $rows[$code][] = array( false, $translated, $total ); + + if ( $this->hasOption( 'fuzzy' ) ) { + $rows[$code][] = array( true, $fuzzy, $total ); + } + } + + unset( $collection ); + } + + // init summary array + $summarise = false; + if ( $this->hasOption( 'summary' ) ) { + $summarise = true; + $summary = array(); + } + + foreach ( $languages as $code => $name ) { + // Skip list + if ( !$most && in_array( $code, $skipLanguages ) ) { + continue; + } + + // Skip unneeded + if ( $wmfscore && isset( $wikimediaCodeMap[$code] ) && $wikimediaCodeMap[$code] == '' ) { + continue; + } + + // If --most is set, skip all other + if ( $most && !isset( $mostSpokenLanguages[$code] ) ) { + continue; + } + + $columns = $rows[$code]; + + $allZero = true; + foreach ( $columns as $fields ) { + if ( intval( $fields[1] ) !== 0 ) { + $allZero = false; + } + } + + // Skip dummy languages if requested + if ( $allZero && $this->hasOption( 'skipzero' ) ) { + continue; + } + + // Output the the row + if ( !$wmfscore ) { + $out->blockstart(); + } + + // Fill language position field + if ( $most ) { + $out->element( $this->mostSpokenLanguages[$code][0] ); + } + + // Fill language name field + if ( !$wmfscore ) { + // Fill language code field + $out->element( $code ); + + if ( $l10n && function_exists( 'efI18nTagsInit' ) ) { + $out->element( "{{#languagename:" . $code . "}}" ); + } else { + $out->element( $name ); + } + } + + // Fill continent field + if ( $showContinent ) { + if ( $this->mostSpokenLanguages[$code][2] == 'multiple' ) { + $continent = ( $l10n ? "{{int:translate-gs-multiple}}" : 'Multiple' ); + } else { + $continent = $l10n ? + "{{int:timezoneregion-" . $this->mostSpokenLanguages[$code][2] . "}}" : + ucfirst( $this->mostSpokenLanguages[$code][2] ); + } + + $out->element( $continent ); + } + + // Fill speakers field + if ( $most && $this->hasOption( 'speakers' ) ) { + $out->element( number_format( $this->mostSpokenLanguages[$code][1] ) ); + } + + // Fill the score field + if ( $reportScore ) { + // Keep count + $i = 0; + // Start with 0 points + $score = 0; + + foreach ( $columns as $fields ) { + list( , $upper, $total ) = $fields; + // Weigh the score and add it to the current score + $score += ( $weights[$i] * $upper ) / $total; + $i++; + } + + // Report a round numbers + $score = number_format( $score, 0 ); + + if ( $summarise ) { + $continent = $this->mostSpokenLanguages[$code][2]; + if ( isset( $summary[$continent] ) ) { + $newcount = $summary[$continent][0] + 1; + $newscore = $summary[$continent][1] + (int)$score; + } else { + $newcount = 1; + $newscore = $score; + } + + $summary[$continent] = array( $newcount, $newscore ); + } + + if ( $wmfscore ) { + // Multiple variants can be used for the same wiki. + // Store the scores in an array and output them later + // when they can be averaged. + if ( isset( $wikimediaCodeMap[$code] ) ) { + $wmfcode = $wikimediaCodeMap[$code]; + } else { + $codeparts = explode( '-', $code ); + $wmfcode = $codeparts[0]; + } + + if ( isset( $wmfscores[$wmfcode] ) ) { + $count = $wmfscores[$wmfcode]['count'] + 1; + $tmpWmfScore = (int)$wmfscores[$wmfcode]['score']; + $tmpWmfCount = (int)$wmfscores[$wmfcode]['count']; + $score = ( ( $tmpWmfCount * $tmpWmfScore ) + (int)$score ) / $count; + $wmfscores[$wmfcode] = array( 'score' => $score, 'count' => $count ); + } else { + $wmfscores[$wmfcode] = array( 'score' => $score, 'count' => 1 ); + } + } else { + $out->element( $score ); + } + } + + // Fill fields for groups + if ( !$wmfscore ) { + foreach ( $columns as $fields ) { + list( $invert, $upper, $total ) = $fields; + $c = $out->formatPercent( $upper, $total, $invert ); + $out->element( $c ); + } + + $out->blockend(); + } + } + + $out->footer(); + + if ( $reportScore && $this->hasOption( 'summary' ) ) { + if ( $reportScore && $this->hasOption( 'legendsummary' ) ) { + $out->addFreeText( "{{" . $this->getOption( 'legendsummary' ) . "}}\n" ); + } + + $out->summaryheading(); + + $out->blockstart(); + + $out->element( $l10n ? "{{int:translate-gs-continent}}" : 'Continent', true ); + $out->element( $l10n ? "{{int:translate-gs-count}}" : 'Count', true ); + $out->element( $l10n ? "{{int:translate-gs-avgscore}}" : 'Avg. score', true ); + + $out->blockend(); + + ksort( $summary ); + + $totals = array( 0, 0 ); + + foreach ( $summary as $key => $values ) { + $out->blockstart(); + + if ( $key == 'multiple' ) { + $out->element( $l10n ? "{{int:translate-gs-multiple}}" : 'Multiple' ); + } else { + $out->element( $l10n ? "{{int:timezoneregion-" . $key . "}}" : ucfirst( $key ) ); + } + $out->element( $values[0] ); + $out->element( number_format( $values[1] / $values[0] ) ); + + $out->blockend(); + + $totals[0] += $values[0]; + $totals[1] += $values[1]; + } + + $out->blockstart(); + $out->element( $l10n ? "{{int:translate-gs-total}}" : 'Total' ); + $out->element( $totals[0] ); + $out->element( number_format( $totals[1] / $totals[0] ) ); + $out->blockend(); + + $out->footer(); + } + + // Custom output + if ( $wmfscore ) { + ksort( $wmfscores ); + + foreach ( $wmfscores as $code => $stats ) { + echo $code . ';' . number_format( $stats['score'] ) . ";\n"; + } + } + } +} /** * Provides heading, summaryheading and free text addition for stats output in @@ -206,465 +697,5 @@ } } -if ( isset( $options['help'] ) ) { - showUsage(); -} - -// Show help and exit if '--most' does not have a valid value and no groups set -if ( isset( $options['most'] ) && - !isset( $localisedWeights[$options['most']] ) && - !isset( $options['groups'] ) -) { - showUsage(); -} - -if ( !isset( $options['output'] ) ) { - $options['output'] = 'default'; -} - -# Select an output engine -switch ( $options['output'] ) { - case 'wiki': - $out = new wikiStatsOutput(); - break; - case 'text': - $out = new textStatsOutput(); - break; - case 'csv': - $out = new csvStatsOutput(); - break; - case 'default': - $out = new TranslateStatsOutput(); - break; - default: - showUsage(); -} - -$skipLanguages = array(); -if ( isset( $options['skiplanguages'] ) ) { - $skipLanguages = array_map( 'trim', explode( ',', $options['skiplanguages'] ) ); -} - -$reportScore = false; -// Check if score should be reported and prepare weights -if ( isset( $options['most'] ) && isset( $localisedWeights[$options['most']] ) ) { - $reportScore = true; - $weights = array(); - - foreach ( $localisedWeights[$options['most']] as $weight ) { - $weights[] = $weight; - } -} - -// check if l10n should be done -$l10n = false; -if ( ( $options['output'] == 'wiki' || $options['output'] == 'default' ) && - !isset( $options['nol10n'] ) -) { - $l10n = true; -} - -$wmfscore = false; -if ( isset( $options['wmfscore'] ) ) { - $wmfscore = true; -} - -// Get groups from input -$groups = array(); -if ( $reportScore ) { - $reqGroups = array_keys( $localisedWeights[$options['most']] ); -} elseif ( !$wmfscore ) { - $reqGroups = array_map( 'trim', explode( ',', $options['groups'] ) ); -} else { - $reqGroups = array_keys( $localisedWeights['wikimedia'] ); -} - -// List of all groups -$allGroups = MessageGroups::singleton()->getGroups(); - -// Get list of valid groups -foreach ( $reqGroups as $id ) { - // Page translation group ids use spaces which are not nice on command line - $id = str_replace( '_', ' ', $id ); - if ( isset( $allGroups[$id] ) ) { - $groups[$id] = $allGroups[$id]; - } else { - STDERR( "Unknown group: $id" ); - } -} - -if ( $wmfscore ) { - // Override/set parameters - $out = new csvStatsOutput(); - $reportScore = true; - - $weights = array(); - foreach ( $localisedWeights['wikimedia'] as $weight ) { - $weights[] = $weight; - } - $wmfscores = array(); -} - -if ( !count( $groups ) ) { - showUsage(); -} - -// List of all languages. -$languages = Language::getLanguageNames( false ); -// Default sorting order by language code, users can sort wiki output. -ksort( $languages ); - -if ( isset( $options['legenddetail'] ) ) { - $out->addFreeText( "{{" . $options['legenddetail'] . "}}\n" ); -} - -$totalWeight = 0; -if ( $reportScore ) { - if ( $wmfscore ) { - foreach ( $localisedWeights['wikimedia'] as $weight ) { - $totalWeight += $weight; - } - } else { - foreach ( $localisedWeights[$options['most']] as $weight ) { - $totalWeight += $weight; - } - } -} - -if ( !$wmfscore ) { - // Output headers - $out->heading(); - - $out->blockstart(); - - if ( isset( $options['most'] ) ) { - $out->element( ( $l10n ? "{{int:translate-gs-pos}}" : 'Pos.' ), true ); - } - - $out->element( ( $l10n ? "{{int:translate-gs-code}}" : 'Code' ), true ); - $out->element( ( $l10n ? "{{int:translate-page-language}}" : 'Language' ), true ); - if ( isset( $options['continent'] ) ) { - $out->element( ( $l10n ? "{{int:translate-gs-continent}}" : 'Continent' ), true ); - } - - if ( isset( $options['most'] ) && isset( $options['speakers'] ) ) { - $out->element( ( $l10n ? "{{int:translate-gs-speakers}}" : 'Speakers' ), true ); - } - - if ( $reportScore ) { - $out->element( - ( $l10n ? "{{int:translate-gs-score}}" : 'Score' ) . ' (' . $totalWeight . ')', - true - ); - } - - /** - * @var $g MessageGroup - */ - foreach ( $groups as $g ) { - // Add unprocessed description of group as heading - if ( $reportScore ) { - $gid = $g->getId(); - $heading = $g->getLabel() . " (" . $localisedWeights[$options['most']][$gid] . ")"; - } else { - $heading = $g->getLabel(); - } - $out->element( $heading, true ); - if ( !$reportScore && isset( $options['fuzzy'] ) ) { - $out->element( ( $l10n ? "{{int:translate-percentage-fuzzy}}" : 'Fuzzy' ), true ); - } - } - - $out->blockend(); -} - -$rows = array(); -foreach ( $languages as $code => $name ) { - // Skip list - if ( in_array( $code, $skipLanguages ) ) { - continue; - } - $rows[$code] = array(); -} - -foreach ( $groups as $groupName => $g ) { - $stats = MessageGroupStats::forGroup( $groupName ); - - // Perform the statistic calculations on every language - foreach ( $languages as $code => $name ) { - // Skip list - if ( !isset( $options['most'] ) && in_array( $code, $skipLanguages ) ) { - continue; - } - - // Do not calculate if we do not need it for anything. - if ( $wmfscore && isset( $wikimediaCodeMap[$code] ) && $wikimediaCodeMap[$code] == '' ) { - continue; - } - - // If --most is set, skip all other - if ( isset( $options['most'] ) && !isset( $mostSpokenLanguages[$code] ) ) { - continue; - } - - $total = $stats[$code][MessageGroupStats::TOTAL]; - $translated = $stats[$code][MessageGroupStats::TRANSLATED]; - $fuzzy = $stats[$code][MessageGroupStats::FUZZY]; - - $rows[$code][] = array( false, $translated, $total ); - - if ( isset( $options['fuzzy'] ) ) { - $rows[$code][] = array( true, $fuzzy, $total ); - } - } - - unset( $collection ); -} - -// init summary array -$summarise = false; -if ( isset( $options['summary'] ) ) { - $summarise = true; - $summary = array(); -} - -foreach ( $languages as $code => $name ) { - // Skip list - if ( !isset( $options['most'] ) && in_array( $code, $skipLanguages ) ) { - continue; - } - - // Skip unneeded - if ( $wmfscore && isset( $wikimediaCodeMap[$code] ) && $wikimediaCodeMap[$code] == '' ) { - continue; - } - - // If --most is set, skip all other - if ( isset( $options['most'] ) && !isset( $mostSpokenLanguages[$code] ) ) { - continue; - } - - $columns = $rows[$code]; - - $allZero = true; - foreach ( $columns as $fields ) { - if ( intval( $fields[1] ) !== 0 ) { - $allZero = false; - } - } - - // Skip dummy languages if requested - if ( $allZero && isset( $options['skipzero'] ) ) { - continue; - } - - // Output the the row - if ( !$wmfscore ) { - $out->blockstart(); - } - - // Fill language position field - if ( isset( $options['most'] ) ) { - $out->element( $mostSpokenLanguages[$code][0] ); - } - - // Fill language name field - if ( !$wmfscore ) { - // Fill language code field - $out->element( $code ); - - if ( $l10n && function_exists( 'efI18nTagsInit' ) ) { - $out->element( "{{#languagename:" . $code . "}}" ); - } else { - $out->element( $name ); - } - } - - // Fill continent field - if ( isset( $options['continent'] ) ) { - if ( $mostSpokenLanguages[$code][2] == 'multiple' ) { - $continent = ( $l10n ? "{{int:translate-gs-multiple}}" : 'Multiple' ); - } else { - $continent = $l10n ? - "{{int:timezoneregion-" . $mostSpokenLanguages[$code][2] . "}}" : - ucfirst( $mostSpokenLanguages[$code][2] ); - } - - $out->element( $continent ); - } - - // Fill speakers field - if ( isset( $options['most'] ) && isset( $options['speakers'] ) ) { - $out->element( number_format( $mostSpokenLanguages[$code][1] ) ); - } - - // Fill the score field - if ( $reportScore ) { - // Keep count - $i = 0; - // Start with 0 points - $score = 0; - - foreach ( $columns as $fields ) { - list( $invert, $upper, $total ) = $fields; - // Weigh the score and add it to the current score - $score += ( $weights[$i] * $upper ) / $total; - $i++; - } - - // Report a round numbers - $score = number_format( $score, 0 ); - - if ( $summarise ) { - $continent = $mostSpokenLanguages[$code][2]; - if ( isset( $summary[$continent] ) ) { - $newcount = $summary[$continent][0] + 1; - $newscore = $summary[$continent][1] + $score; - } else { - $newcount = 1; - $newscore = $score; - } - - $summary[$continent] = array( $newcount, $newscore ); - } - - if ( $wmfscore ) { - // Multiple variants can be used for the same wiki. - // Store the scores in an array and output them later - // when they can be averaged. - if ( isset( $wikimediaCodeMap[$code] ) ) { - $wmfcode = $wikimediaCodeMap[$code]; - } else { - $codeparts = explode( '-', $code ); - $wmfcode = $codeparts[0]; - } - - if ( isset( $wmfscores[$wmfcode] ) ) { - $count = $wmfscores[$wmfcode]['count'] + 1; - $tmpWmfScore = (int) $wmfscores[$wmfcode]['score']; - $tmpWmfCount = (int) $wmfscores[$wmfcode]['count']; - $score = ( ( $tmpWmfCount * $tmpWmfScore ) + $score ) / $count; - $wmfscores[$wmfcode] = array( 'score' => $score, 'count' => $count ); - } else { - $wmfscores[$wmfcode] = array( 'score' => $score, 'count' => 1 ); - } - } else { - $out->element( $score ); - } - } - - // Fill fields for groups - if ( !$wmfscore ) { - foreach ( $columns as $fields ) { - list( $invert, $upper, $total ) = $fields; - $c = $out->formatPercent( $upper, $total, $invert ); - $out->element( $c ); - } - - $out->blockend(); - } -} - -$out->footer(); - -if ( $reportScore && isset( $options['summary'] ) ) { - if ( $reportScore && isset( $options['legendsummary'] ) ) { - $out->addFreeText( "{{" . $options['legendsummary'] . "}}\n" ); - } - - $out->summaryheading(); - - $out->blockstart(); - - $out->element( $l10n ? "{{int:translate-gs-continent}}" : 'Continent', true ); - $out->element( $l10n ? "{{int:translate-gs-count}}" : 'Count', true ); - $out->element( $l10n ? "{{int:translate-gs-avgscore}}" : 'Avg. score', true ); - - $out->blockend(); - - ksort( $summary ); - - $totals = array( 0, 0 ); - - foreach ( $summary as $key => $values ) { - $out->blockstart(); - - if ( $key == 'multiple' ) { - $out->element( $l10n ? "{{int:translate-gs-multiple}}" : 'Multiple' ); - } else { - $out->element( $l10n ? "{{int:timezoneregion-" . $key . "}}" : ucfirst( $key ) ); - } - $out->element( $values[0] ); - $out->element( number_format( $values[1] / $values[0] ) ); - - $out->blockend(); - - $totals[0] += $values[0]; - $totals[1] += $values[1]; - } - - $out->blockstart(); - $out->element( $l10n ? "{{int:translate-gs-total}}" : 'Total' ); - $out->element( $totals[0] ); - $out->element( number_format( $totals[1] / $totals[0] ) ); - $out->blockend(); - - $out->footer(); -} - -// Custom output -if ( $wmfscore ) { - ksort( $wmfscores ); - - foreach ( $wmfscores as $code => $stats ) { - echo $code . ';' . number_format( $stats['score'] ) . ";\n"; - } -} - -/** - * Print a usage message. - */ -function showUsage() { - $msg = <<<PHP - --help : this help message - --groups LIST: comma separated list of groups - --skiplanguages LIST: comma separated list of skipped languages - --skipzero : skip languages that do not have any localisation at all - --fuzzy : add column for fuzzy counts - --output TYPE: select another output engine - * 'csv' : Comma Separated Values. - * 'wiki' : MediaWiki syntax. - * 'text' : Text with tabs. - --most : [SCOPE]: report on the 50 most spoken languages. Skipzero is - ignored. If a valid scope is defined, the group list - and fuzzy are ignored and the localisation levels are - weighted and reported. - * mediawiki: - core-0-mostused (30%) - core (30%) - ext-0-wikimedia (20%) - ext-0-all (20%) - * wikimedia: - core-0-mostused (40%) - core (30%) - ext-0-wikimedia (30%) - --speakers : add column for number of speakers (est.). Only valid when - combined with --most. - --nol10n : do not add localised language name if I18ntags is installed. - --continent : add a continent column. Only available when output is - 'wiki' or not specified. - --summary : add a summary with counts and scores per continent category - and totals. Only available for a valid 'most' value. - --legenddetail : Page name for legend to be transcluded at the top of - the details table - --legendsummary : Page name for legend to be transcluded at the top of - the summary table - --wmfscore : Only output WMF language code and weighted score for all - language codes for weighing group 'wikimedia' in CSV. This - report must keep a stable layout as it is used/will be - used in the Wikimedia statistics. - -PHP; - STDERR( $msg ); - exit( 1 ); -} +$maintClass = 'GroupStatictics'; +require_once RUN_MAINTENANCE_IF_MAIN; -- To view, visit https://gerrit.wikimedia.org/r/89114 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I0298d797eecdab6d49f94a5b4a8859d69e008f9f Gerrit-PatchSet: 3 Gerrit-Project: mediawiki/extensions/Translate Gerrit-Branch: master Gerrit-Owner: Siebrand <siebr...@wikimedia.org> Gerrit-Reviewer: Nikerabbit <niklas.laxst...@gmail.com> Gerrit-Reviewer: Siebrand <siebr...@wikimedia.org> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits