Gergő Tisza has uploaded a new change for review. https://gerrit.wikimedia.org/r/160580
Change subject: [WIP] Add tracking categories for files with attribution problems ...................................................................... [WIP] Add tracking categories for files with attribution problems Horribly broken at the moment because the file page is parsed in little bits and pieces so ParserAfterTidy is called with small fragments of texts instead of a single big one. Change-Id: I43ed79b6a54cd31820ecae8139e29c5880f5dd1b Mingle: https://wikimedia.mingle.thoughtworks.com/projects/multimedia/cards/859 --- M CommonsMetadata.php M DataCollector.php M HookHandler.php M i18n/en.json M i18n/qqq.json 5 files changed, 104 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CommonsMetadata refs/changes/80/160580/1 diff --git a/CommonsMetadata.php b/CommonsMetadata.php index 5ffa245..f947aec 100755 --- a/CommonsMetadata.php +++ b/CommonsMetadata.php @@ -36,4 +36,12 @@ $wgHooks['GetExtendedMetadata'][] = 'CommonsMetadata\HookHandler::onGetExtendedMetadata'; $wgHooks['ValidateExtendedMetadataCache'][] = 'CommonsMetadata\HookHandler::onValidateExtendedMetadataCache'; +$wgHooks['ParserAfterTidy'][] = 'CommonsMetadata\HookHandler::onParserAfterTidy'; $wgHooks['UnitTestsList'][] = 'CommonsMetadata\HookHandler::onUnitTestsList'; + +array_merge( $wgTrackingCategories, array( + 'commonsmetadata-trackingcategory-no-license', + 'commonsmetadata-trackingcategory-no-description', + 'commonsmetadata-trackingcategory-no-author', + 'commonsmetadata-trackingcategory-no-source', +) ); diff --git a/DataCollector.php b/DataCollector.php index 373d38d..b8d7e0d 100755 --- a/DataCollector.php +++ b/DataCollector.php @@ -99,6 +99,38 @@ } /** + * Checks for the presence of metadata needed for attributing the file (author, source, license) + * and returns a list of keys corresponding to problems. 
+ * @param string $descriptionText HTML code of the file description + * @return array one or more of the following keys: + * - no-license - failed to detect a license + * - no-description - failed to detect any image description + * - no-author - failed to detect author name or a custom attribution text + * - no-source - failed to detect the source of the image or a custom attribution text + */ + public function verifyAttributionMetadata( $descriptionText ) { + $problems = array(); + $templateData = $this->templateParser->parsePage( $descriptionText ); + $licenseData = $this->selectLicense( $templateData[TemplateParser::LICENSES_KEY] ); + $informationData = $this->selectInformationTemplate( $templateData[TemplateParser::INFORMATION_FIELDS_KEY] ); + + if ( !$licenseData || empty( $licenseData['LicenseShortName'] ) ) { + $problems[] = 'no-license'; + } + if ( !$informationData || empty( $informationData['ImageDescription'] ) ) { + $problems[] = 'no-description'; + } + if ( !$informationData || empty( $informationData['Artist'] ) && empty( $informationData['Attribution'] ) ) { + $problems[] = 'no-author'; + } + if ( !$informationData || empty( $informationData['Credit'] ) && empty( $informationData['Attribution'] ) ) { + $problems[] = 'no-source'; + } + + return $problems; + } + + /** * @param array $categories * @return array */ diff --git a/HookHandler.php b/HookHandler.php index 75c323e..c569e83 100755 --- a/HookHandler.php +++ b/HookHandler.php @@ -84,6 +84,32 @@ } /** + * Check HTML output of a file page to see if it has all the basic metadata, and add tracking categories + * if it does not. + * @param \Parser $parser + * @param string $text HTML code of the page after parsing and tidying + * @return bool this hook handler always returns true. 
+ */ + public static function onParserAfterTidy( &$parser, &$text ) { + if ( + !$parser->getTitle()->inNamespace( NS_FILE ) + || $parser->getOptions()->getIsSectionPreview() + ) { + return true; + } + + $lang = $parser->getTargetLanguage(); + $dataCollector = self::getDataCollector( $lang, true ); + + $categoryKeys = $dataCollector->verifyAttributionMetadata( $text ); + foreach ( $categoryKeys as $key ) { + $parser->addTrackingCategory( 'commonsmetadata-trackingcategory-' . $key ); + } + + return true; + } + + /** * Hook to add unit tests * @param array $files * @return bool @@ -93,4 +119,24 @@ $files = array_merge( $files, glob( $testDir . DIRECTORY_SEPARATOR . '*Test.php' ) ); return true; } + + /** + * @param Language $lang + * @param bool $singleLang + */ + private static function getDataCollector( Language $lang, $singleLang ) { + $templateParser = new TemplateParser(); + $templateParser->setMultiLanguage( !$singleLang ); + $fallbacks = Language::getFallbacksFor( $lang->getCode() ); + array_unshift( $fallbacks, $lang->getCode() ); + $templateParser->setPriorityLanguages( $fallbacks ); + + $dataCollector = new DataCollector(); + $dataCollector->setLanguage( $lang ); + $dataCollector->setMultiLang( !$singleLang ); + $dataCollector->setTemplateParser( $templateParser ); + $dataCollector->setLicenseParser( new LicenseParser() ); + + return $dataCollector; + } } diff --git a/i18n/en.json b/i18n/en.json index 1b9c42c..b2c835d 100644 --- a/i18n/en.json +++ b/i18n/en.json @@ -2,5 +2,13 @@ "@metadata": { "authors": [] }, - "commonsmetadata-desc": "Extends the \"extmetadata\" property of the image information API module to include information stored in image description pages that use the templates commonly used on Wikimedia Commons" + "commonsmetadata-desc": "Extends the \"extmetadata\" property of the image information API module to include information stored in image description pages that use the templates commonly used on Wikimedia Commons", + 
"commonsmetadata-trackingcategory-no-license": "Files with no machine-readable license", + "commonsmetadata-trackingcategory-no-license-desc": "The file does not have any [https://commons.wikimedia.org/wiki/Commons:Machine-readable_data machine-readable] license template", + "commonsmetadata-trackingcategory-no-description": "Files with no machine-readable description", + "commonsmetadata-trackingcategory-no-description-desc": "The file does not have a [https://commons.wikimedia.org/wiki/Commons:Machine-readable_data machine-readable] information template, or its description field is not filled out", + "commonsmetadata-trackingcategory-no-author": "Files with no machine-readable author", + "commonsmetadata-trackingcategory-no-author-desc": "The file does not have a [https://commons.wikimedia.org/wiki/Commons:Machine-readable_data machine-readable] information template, or its author field is not filled out", + "commonsmetadata-trackingcategory-no-source": "Files with no machine-readable source", + "commonsmetadata-trackingcategory-no-source-desc": "The file does not have a [https://commons.wikimedia.org/wiki/Commons:Machine-readable_data machine-readable] information template, or its source field is not filled out" } \ No newline at end of file diff --git a/i18n/qqq.json b/i18n/qqq.json index d09eaab..3f93512 100644 --- a/i18n/qqq.json +++ b/i18n/qqq.json @@ -4,5 +4,13 @@ "Bawolff" ] }, - "commonsmetadata-desc": "{{desc}}" + "commonsmetadata-desc": "{{desc}}", + "commonsmetadata-trackingcategory-no-license": "Name of the tracking category for files with no machine-readable license", + "commonsmetadata-trackingcategory-no-license-desc": "Description of the inclusion criteria for the tracking category for files with no machine-readable license", + "commonsmetadata-trackingcategory-no-description": "Name of the tracking category for files with no machine-readable description", + "commonsmetadata-trackingcategory-no-description-desc": "Description of the inclusion criteria 
for the tracking category for files with no machine-readable description", + "commonsmetadata-trackingcategory-no-author": "Name of the tracking category for files with no machine-readable author", + "commonsmetadata-trackingcategory-no-author-desc": "Description of the inclusion criteria for the tracking category for files with no machine-readable author", + "commonsmetadata-trackingcategory-no-source": "Name of the tracking category for files with no machine-readable source", + "commonsmetadata-trackingcategory-no-source-desc": "Description of the inclusion criteria for the tracking category for files with no machine-readable source" } -- To view, visit https://gerrit.wikimedia.org/r/160580 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I43ed79b6a54cd31820ecae8139e29c5880f5dd1b Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CommonsMetadata Gerrit-Branch: master Gerrit-Owner: Gergő Tisza <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
