Gergő Tisza has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/160580

Change subject: [WIP] Add tracking categories for files with attribution 
problems
......................................................................

[WIP] Add tracking categories for files with attribution problems

Horribly broken at the moment because the file page is parsed in
little bits and pieces, so ParserAfterTidy is called with small
fragments of text instead of a single big one.

Change-Id: I43ed79b6a54cd31820ecae8139e29c5880f5dd1b
Mingle: https://wikimedia.mingle.thoughtworks.com/projects/multimedia/cards/859
---
M CommonsMetadata.php
M DataCollector.php
M HookHandler.php
M i18n/en.json
M i18n/qqq.json
5 files changed, 104 insertions(+), 2 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CommonsMetadata 
refs/changes/80/160580/1

diff --git a/CommonsMetadata.php b/CommonsMetadata.php
index 5ffa245..f947aec 100755
--- a/CommonsMetadata.php
+++ b/CommonsMetadata.php
@@ -36,4 +36,12 @@
 
 $wgHooks['GetExtendedMetadata'][] = 
'CommonsMetadata\HookHandler::onGetExtendedMetadata';
 $wgHooks['ValidateExtendedMetadataCache'][] = 
'CommonsMetadata\HookHandler::onValidateExtendedMetadataCache';
+$wgHooks['ParserAfterTidy'][] = 
'CommonsMetadata\HookHandler::onParserAfterTidy';
 $wgHooks['UnitTestsList'][] = 'CommonsMetadata\HookHandler::onUnitTestsList';
+
+// array_merge() returns the merged array and leaves its arguments untouched,
+// so the result must be assigned back for the categories to be registered.
+$wgTrackingCategories = array_merge( $wgTrackingCategories, array(
+       'commonsmetadata-trackingcategory-no-license',
+       'commonsmetadata-trackingcategory-no-description',
+       'commonsmetadata-trackingcategory-no-author',
+       'commonsmetadata-trackingcategory-no-source',
+) );
diff --git a/DataCollector.php b/DataCollector.php
index 373d38d..b8d7e0d 100755
--- a/DataCollector.php
+++ b/DataCollector.php
@@ -99,6 +99,38 @@
        }
 
        /**
+        * Checks for the presence of metadata needed for attributing the file (author, source, license)
+        * and returns a list of keys corresponding to problems.
+        *
+        * The description text is run through the template parser; one license entry and one
+        * information template entry are then picked via selectLicense() and
+        * selectInformationTemplate() and checked field by field. In the author and source checks
+        * && binds tighter than ||, so a problem is reported when there is no information template
+        * at all, or when both the specific field and the Attribution field are empty.
+        * @param string $descriptionText HTML code of the file description
+        * @return array zero or more of the following keys (empty when no problem was found):
+        *  - no-license - failed to detect a license
+        *  - no-description - failed to detect any image description
+        *  - no-author - failed to detect author name or a custom attribution text
+        *  - no-source - failed to detect the source of the image or a custom attribution text
+        */
+       public function verifyAttributionMetadata( $descriptionText ) {
+               $problems = array();
+               $templateData = $this->templateParser->parsePage( 
$descriptionText );
+               $licenseData = $this->selectLicense( 
$templateData[TemplateParser::LICENSES_KEY] );
+               $informationData = $this->selectInformationTemplate( 
$templateData[TemplateParser::INFORMATION_FIELDS_KEY] );
+
+               if ( !$licenseData || empty( $licenseData['LicenseShortName'] ) 
) {
+                       $problems[] = 'no-license';
+               }
+               if ( !$informationData || empty( 
$informationData['ImageDescription'] ) ) {
+                       $problems[] = 'no-description';
+               }
+               if ( !$informationData || empty( $informationData['Artist'] ) 
&& empty( $informationData['Attribution'] ) ) {
+                       $problems[] = 'no-author';
+               }
+               if ( !$informationData || empty( $informationData['Credit'] ) 
&& empty( $informationData['Attribution'] ) ) {
+                       $problems[] = 'no-source';
+               }
+
+               return $problems;
+       }
+
+       /**
         * @param array $categories
         * @return array
         */
diff --git a/HookHandler.php b/HookHandler.php
index 75c323e..c569e83 100755
--- a/HookHandler.php
+++ b/HookHandler.php
@@ -84,6 +84,32 @@
        }
 
        /**
+        * Check HTML output of a file page to see if it has all the basic metadata, and add tracking categories
+        * if it does not.
+        *
+        * Returns early without checking anything when the page is not in the File namespace or when
+        * the parser options indicate a section preview. Each problem key reported by the data
+        * collector is prefixed with 'commonsmetadata-trackingcategory-' to form the category message key.
+        *
+        * Known limitation (WIP): the file page is parsed in several pieces, so this hook may be
+        * called with small fragments of text instead of the whole page.
+        * @param \Parser $parser
+        * @param string $text HTML code of the page after parsing and tidying
+        * @return bool this hook handler always returns true.
+        */
+       public static function onParserAfterTidy( &$parser, &$text ) {
+               if (
+                       !$parser->getTitle()->inNamespace( NS_FILE )
+                       || $parser->getOptions()->getIsSectionPreview()
+               ) {
+                       return true;
+               }
+
+               $lang = $parser->getTargetLanguage();
+               $dataCollector = self::getDataCollector( $lang, true );
+
+               $categoryKeys = $dataCollector->verifyAttributionMetadata( 
$text );
+               foreach ( $categoryKeys as $key ) {
+                       $parser->addTrackingCategory( 
'commonsmetadata-trackingcategory-' . $key );
+               }
+
+               return true;
+       }
+
+       /**
         * Hook to add unit tests
         * @param array $files
         * @return bool
@@ -93,4 +119,24 @@
                $files = array_merge( $files, glob( $testDir . 
DIRECTORY_SEPARATOR . '*Test.php' ) );
                return true;
        }
+
+       /**
+        * Builds a DataCollector wired up with a fresh TemplateParser and LicenseParser.
+        *
+        * The template parser's priority languages are the code of $lang followed by the
+        * fallback codes returned by Language::getFallbacksFor().
+        * @param Language $lang language set on the collector; its code heads the priority language list
+        * @param bool $singleLang negated and passed to setMultiLanguage() on the template parser
+        *  and to setMultiLang() on the collector
+        * @return DataCollector
+        */
+       private static function getDataCollector( Language $lang, $singleLang ) 
{
+               $templateParser = new TemplateParser();
+               $templateParser->setMultiLanguage( !$singleLang );
+               $fallbacks = Language::getFallbacksFor( $lang->getCode() );
+               array_unshift( $fallbacks, $lang->getCode() );
+               $templateParser->setPriorityLanguages( $fallbacks );
+
+               $dataCollector = new DataCollector();
+               $dataCollector->setLanguage( $lang );
+               $dataCollector->setMultiLang( !$singleLang );
+               $dataCollector->setTemplateParser( $templateParser );
+               $dataCollector->setLicenseParser( new LicenseParser() );
+
+               return $dataCollector;
+       }
 }
diff --git a/i18n/en.json b/i18n/en.json
index 1b9c42c..b2c835d 100644
--- a/i18n/en.json
+++ b/i18n/en.json
@@ -2,5 +2,13 @@
     "@metadata": {
         "authors": []
     },
-    "commonsmetadata-desc": "Extends the \"extmetadata\" property of the image 
information API module to include information stored in image description pages 
that use the templates commonly used on Wikimedia Commons"
+    "commonsmetadata-desc": "Extends the \"extmetadata\" property of the image 
information API module to include information stored in image description pages 
that use the templates commonly used on Wikimedia Commons",
+    "commonsmetadata-trackingcategory-no-license": "Files with no 
machine-readable license",
+       "commonsmetadata-trackingcategory-no-license-desc": "The file does not 
have any [https://commons.wikimedia.org/wiki/Commons:Machine-readable_data 
machine-readable] license template",
+       "commonsmetadata-trackingcategory-no-description": "Files with no 
machine-readable description",
+       "commonsmetadata-trackingcategory-no-description-desc": "The file does 
not have a [https://commons.wikimedia.org/wiki/Commons:Machine-readable_data 
machine-readable] information template, or its description field is not filled 
out",
+       "commonsmetadata-trackingcategory-no-author": "Files with no 
machine-readable author",
+       "commonsmetadata-trackingcategory-no-author-desc": "The file does not 
have a [https://commons.wikimedia.org/wiki/Commons:Machine-readable_data 
machine-readable] information template, or its author field is not filled out",
+       "commonsmetadata-trackingcategory-no-source": "Files with no 
machine-readable source",
+       "commonsmetadata-trackingcategory-no-source-desc": "The file does not 
have a [https://commons.wikimedia.org/wiki/Commons:Machine-readable_data 
machine-readable] information template, or its source field is not filled out"
 }
\ No newline at end of file
diff --git a/i18n/qqq.json b/i18n/qqq.json
index d09eaab..3f93512 100644
--- a/i18n/qqq.json
+++ b/i18n/qqq.json
@@ -4,5 +4,13 @@
                        "Bawolff"
                ]
        },
-       "commonsmetadata-desc": "{{desc}}"
+       "commonsmetadata-desc": "{{desc}}",
+       "commonsmetadata-trackingcategory-no-license": "Name of the tracking 
category for files with no machine-readable license",
+       "commonsmetadata-trackingcategory-no-license-desc": "Description of the 
inclusion criteria for the tracking category for files with no machine-readable 
license",
+       "commonsmetadata-trackingcategory-no-description": "Name of the 
tracking category for files with no machine-readable description",
+       "commonsmetadata-trackingcategory-no-description-desc": "Description of 
the inclusion criteria for the tracking category for files with no 
machine-readable description",
+       "commonsmetadata-trackingcategory-no-author": "Name of the tracking 
category for files with no machine-readable author",
+       "commonsmetadata-trackingcategory-no-author-desc": "Description of the 
inclusion criteria for the tracking category for files with no machine-readable 
author",
+       "commonsmetadata-trackingcategory-no-source": "Name of the tracking 
category for files with no machine-readable source",
+       "commonsmetadata-trackingcategory-no-source-desc": "Description of the 
inclusion criteria for the tracking category for files with no machine-readable 
source"
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/160580
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I43ed79b6a54cd31820ecae8139e29c5880f5dd1b
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CommonsMetadata
Gerrit-Branch: master
Gerrit-Owner: Gergő Tisza <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to