jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/370664 )
Change subject: [IMPR] Share claimit.py logic with harvest_template.py ...................................................................... [IMPR] Share claimit.py logic with harvest_template.py It is now possible to import another value for the same property that the item already has, using -exists:p. Each imported claim can have its 'exists' setting, as well as all claims can share the same (global) one. Additionally, because presence of a property is no longer a reason to skip adding a value, as well as because of performance, I'm dropping the safeguard that skips the page when the item already has all the properties to be harvested. Bug: T72702 Bug: T76391 Change-Id: I4b4bb9cd2f7427b2226c05212b27d0113163464f --- M pywikibot/bot.py M scripts/claimit.py M scripts/harvest_template.py 3 files changed, 151 insertions(+), 134 deletions(-) Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved Ejegg: Looks good to me, but someone else must approve diff --git a/pywikibot/bot.py b/pywikibot/bot.py index 1d95652..3444389 100644 --- a/pywikibot/bot.py +++ b/pywikibot/bot.py @@ -2000,6 +2000,69 @@ source.setTarget(item) return source + def user_add_claim_unless_exists( + self, item, claim, exists_arg='', source=None, + logger_callback=log, **kwargs): + """ + Decorator of L{user_add_claim}. + + Before adding a new claim, it checks if we can add it, using provided + filters. + + @see: documentation of L{claimit.py<scripts.claimit>} + @param exists_arg: pattern for merging existing claims with new ones + @type exists_arg: str + @param logger_callback: function logging the output of the method + @type logger_callback: callable + @return: whether the claim could be added + @rtype: bool + """ + # Existing claims on page of same property + for existing in item.get().get('claims').get(claim.getID(), []): + # If claim with same property already exists... + if 'p' not in exists_arg: + logger_callback( + 'Skipping %s because claim with same property already exists' + % (claim.getID(),)) + log('Use -exists:p option to override this behavior') + return False + if not existing.target_equals(claim.getTarget()): + continue + # If some attribute of the claim being added + # matches some attribute in an existing claim of + # the same property, skip the claim, unless the + # 'exists' argument overrides it. + if 't' not in exists_arg: + logger_callback( + 'Skipping %s because claim with same target already exists' + % (claim.getID(),)) + log("Append 't' to -exists argument to override this behavior") + return False + if 'q' not in exists_arg and not existing.qualifiers: + logger_callback( + 'Skipping %s because claim without qualifiers already exists' + % (claim.getID(),)) + log("Append 'q' to -exists argument to override this behavior") + return False + if ('s' not in exists_arg or not source) and not existing.sources: + logger_callback( + 'Skipping %s because claim without source already exists' + % (claim.getID(),)) + log("Append 's' to -exists argument to override this behavior") + return False + if ('s' not in exists_arg and source and + any(source.getID() in ref and + all(snak.target_equals(source.getTarget()) + for snak in ref[source.getID()]) + for ref in existing.sources)): + logger_callback( + 'Skipping %s because claim with the same source already exists' + % (claim.getID(),)) + log("Append 's' to -exists argument to override this behavior") + return False + + return self.user_add_claim(item, claim, source, **kwargs) + def create_item_for_page(self, page, data=None, summary=None, **kwargs): """ Create an ItemPage with the provided page as the sitelink. diff --git a/scripts/claimit.py b/scripts/claimit.py index 1f213d0..caf75ec 100755 --- a/scripts/claimit.py +++ b/scripts/claimit.py @@ -92,60 +92,10 @@ def treat_page_and_item(self, page, item): """Treat each page.""" - # The generator might yield pages from multiple sites - source = self.getSource(page.site) - for claim in self.claims: - # Existing claims on page of same property - for existing in item.claims.get(claim.getID(), []): - # If claim with same property already exists... - if 'p' not in self.exists_arg: - pywikibot.log( - 'Skipping %s because claim with same property already exists' - % (claim.getID(),)) - pywikibot.log( - 'Use -exists:p option to override this behavior') - break - if not existing.target_equals(claim.getTarget()): - continue - # If some attribute of the claim being added - # matches some attribute in an existing claim of - # the same property, skip the claim, unless the - # 'exists' argument overrides it. - if 't' not in self.exists_arg: - pywikibot.log( - 'Skipping %s because claim with same target already exists' - % (claim.getID(),)) - pywikibot.log( - "Append 't' to -exists argument to override this behavior") - break - if 'q' not in self.exists_arg and not existing.qualifiers: - pywikibot.log( - 'Skipping %s because claim without qualifiers already exists' - % (claim.getID(),)) - pywikibot.log( - "Append 'q' to -exists argument to override this behavior") - break - if ('s' not in self.exists_arg or not source) and not existing.sources: - pywikibot.log( - 'Skipping %s because claim without source already exists' - % (claim.getID(),)) - pywikibot.log( - "Append 's' to -exists argument to override this behavior") - break - if ('s' not in self.exists_arg and source and - any(source.getID() in ref and - all(snak.target_equals(source.getTarget()) - for snak in ref[source.getID()]) - for ref in existing.sources)): - pywikibot.log( - 'Skipping %s because claim with the same source already exists' - % (claim.getID(),)) - pywikibot.log( - "Append 's' to -exists argument to override this behavior") - break - else: - self.user_add_claim(item, claim, page.site) + # The generator might yield pages from multiple sites + self.user_add_claim_unless_exists( + item, claim, self.exists_arg, page.site) def main(*args): diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py index 7303524..9d76d23 100755 --- a/scripts/harvest_template.py +++ b/scripts/harvest_template.py @@ -34,6 +34,12 @@ -islink Treat plain text values as links ("text" -> "[[text]]"). +-exists If set to 'p', add a new value, even if the item already + has the imported property but not the imported value. + If set to 'pt', add a new value, even if the item already + has the imported property with the imported value and + some qualifiers. + Examples: python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \ @@ -60,6 +66,14 @@ -template:"Infobox person" birth_place P19 -islink death_place P20 will do the same but only "birth_place" can be imported without a link. + + python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \ + -template:"Infobox person" occupation P106 -exists:p + + will import an occupation from "occupation" parameter of "Infobox + person" on English Wikipedia as Wikidata property "P106" (occupation). The + page won't be skipped if the item already has that property but there is + not the new value. """ # @@ -100,6 +114,7 @@ availableOptions = { 'islink': False, + 'exists': '', } @@ -121,10 +136,14 @@ @type islink: bool @keyword create: Whether to create a new item if it's missing @type create: bool + @keyword exists: pattern for merging existing claims with harvested + values + @type exists: str """ self.availableOptions.update({ 'always': True, 'create': False, + 'exists': '', 'islink': False, }) super(HarvestRobot, self).__init__(**kwargs) @@ -197,25 +216,19 @@ """ Compare bot's (global) and provided (local) options. - @see: L{pywikibot.bot.OptionHandler.getOption} - - @rtype: bool + @see: L{OptionHandler.getOption} """ - # TODO: only works with booleans default = self.getOption(option) local = handler.getOption(option) - return default is not local + if isinstance(default, bool) and isinstance(local, bool): + return default is not local + else: + return local or default def treat_page_and_item(self, page, item): """Process a single page/item.""" if willstop: raise KeyboardInterrupt - item.get() - if set(val[0] for val in self.fields.values()) <= set( - item.claims.keys()): - pywikibot.output('%s item %s has claims for all properties. ' - 'Skipping.' % (page, item.title())) - return templates = page.raw_extracted_templates for (template, fielddict) in templates: @@ -228,80 +241,71 @@ "Failed parsing template; '%s' should be the template name." % template) continue + + if template not in self.templateTitles: + continue # We found the template we were looking for - if template in self.templateTitles: - for field, value in fielddict.items(): - field = field.strip() - value = value.strip() - if not field or not value: + for field, value in fielddict.items(): + field = field.strip() + value = value.strip() + if not field or not value: + continue + + if field not in self.fields: + continue + + # This field contains something useful for us + prop, options = self.fields[field] + claim = pywikibot.Claim(self.repo, prop) + if claim.type == 'wikibase-item': + # Try to extract a valid page + match = pywikibot.link_regex.search(value) + if match: + link_text = match.group(1) + else: + if self._get_option_with_fallback(options, 'islink'): + link_text = value + else: + pywikibot.output( + '%s field %s value %s is not a wikilink. ' + 'Skipping.' % (claim.getID(), field, value)) + continue + + linked_item = self._template_link_target(item, link_text) + if not linked_item: continue - # This field contains something useful for us - if field in self.fields: - prop, options = self.fields[field] - # Check if the property isn't already set - claim = pywikibot.Claim(self.repo, prop) - if claim.getID() in item.get().get('claims'): - pywikibot.output( - 'A claim for %s already exists. Skipping.' - % claim.getID()) - # TODO: Implement smarter approach to merging - # harvested values with existing claims esp. - # without overwriting humans unintentionally. - else: - if claim.type == 'wikibase-item': - # Try to extract a valid page - match = pywikibot.link_regex.search(value) - if match: - link_text = match.group(1) - else: - if self._get_option_with_fallback( - options, 'islink'): - link_text = value - else: - pywikibot.output( - '%s field %s value %s is not a ' - 'wikilink. Skipping.' - % (claim.getID(), field, value)) - continue + claim.setTarget(linked_item) + elif claim.type in ('string', 'external-id'): + claim.setTarget(value.strip()) + elif claim.type == 'url': + match = self.linkR.search(value) + if not match: + continue + claim.setTarget(match.group('url')) + elif claim.type == 'commonsMedia': + commonssite = pywikibot.Site('commons', 'commons') + imagelink = pywikibot.Link( + value, source=commonssite, defaultNamespace=6) + image = pywikibot.FilePage(imagelink) + if image.isRedirectPage(): + image = pywikibot.FilePage(image.getRedirectTarget()) + if not image.exists(): + pywikibot.output( + "{0} doesn't exist. I can't link to it" + ''.format(image.title(asLink=True))) + continue + claim.setTarget(image) + else: + pywikibot.output('%s is not a supported datatype.' + % claim.type) + continue - linked_item = self._template_link_target( - item, link_text) - if not linked_item: - continue - - claim.setTarget(linked_item) - elif claim.type in ('string', 'external-id'): - claim.setTarget(value.strip()) - elif claim.type == 'url': - match = self.linkR.search(value) - if not match: - continue - claim.setTarget(match.group('url')) - elif claim.type == 'commonsMedia': - commonssite = pywikibot.Site('commons', - 'commons') - imagelink = pywikibot.Link(value, - source=commonssite, - defaultNamespace=6) - image = pywikibot.FilePage(imagelink) - if image.isRedirectPage(): - image = pywikibot.FilePage( - image.getRedirectTarget()) - if not image.exists(): - pywikibot.output( - "{0} doesn't exist. I can't link to it" - ''.format(image.title(asLink=True))) - continue - claim.setTarget(image) - else: - pywikibot.output( - '%s is not a supported datatype.' - % claim.type) - continue - - # A generator might yield pages from multiple sites - self.user_add_claim(item, claim, page.site) + # A generator might yield pages from multiple sites + self.user_add_claim_unless_exists( + item, claim, self._get_option_with_fallback( + options, 'exists'), + page.site, pywikibot.output) def main(*args): -- To view, visit https://gerrit.wikimedia.org/r/370664 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I4b4bb9cd2f7427b2226c05212b27d0113163464f Gerrit-PatchSet: 10 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Matěj Suchánek <matejsuchane...@gmail.com> Gerrit-Reviewer: Dalba <dalba.w...@gmail.com> Gerrit-Reviewer: Ejegg <ej...@ejegg.com> Gerrit-Reviewer: JAn Dudík <jan.du...@gmail.com> Gerrit-Reviewer: John Vandenberg <jay...@gmail.com> Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com> Gerrit-Reviewer: Magul <tomasz.magul...@gmail.com> Gerrit-Reviewer: Matěj Suchánek <matejsuchane...@gmail.com> Gerrit-Reviewer: Mpaa <mpaa.w...@gmail.com> Gerrit-Reviewer: XXN <dan10r...@gmail.com> Gerrit-Reviewer: Xqt <i...@gno.de> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits