Matěj Suchánek has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/370664 )

Change subject: [IMPR] Share claimit.py logic with harvest_template.py
......................................................................

[IMPR] Share claimit.py logic with harvest_template.py

It is now possible to import another value for the same property
that the item already has, using -exists:p. Each imported claim
can have its 'exists' setting, as well as all claims can share
the same (global) one.

Bug: T72702
Change-Id: I4b4bb9cd2f7427b2226c05212b27d0113163464f
---
M pywikibot/bot.py
M scripts/claimit.py
M scripts/harvest_template.py
3 files changed, 147 insertions(+), 124 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/64/370664/1

diff --git a/pywikibot/bot.py b/pywikibot/bot.py
index c2ba886..f8f2ede 100644
--- a/pywikibot/bot.py
+++ b/pywikibot/bot.py
@@ -1989,6 +1989,71 @@
             source.setTarget(item)
         return source
 
+    def user_add_claim_unless_exists(
+            self, item, claim, exists_arg='', source=None,
+            logger_callback=pywikibot.log, **kwargs):
+        """
+        Like L{user_add_claim} but check first if we can add it, based on
+        provided filters.
+
+        @see: documentation of L{claimit.py<scripts.claimit>}
+        @param exists_arg: which claim attributes should be compared
+        @type exists_arg: str
+        @param logger_callback:
+        @type logger_callback: callable
+        @return: whether the claim could be added
+        @rtype: bool
+        """
+        # Existing claims on page of same property
+        for existing in item.get().get('claims').get(claim.getID(), []):
+            # If claim with same property already exists...
+            if 'p' not in exists_arg:
+                logger_callback(
+                    'Skipping %s because claim with same property already 
exists'
+                    % (claim.getID(),))
+                pywikibot.log('Use -exists:p option to override this behavior')
+                return False
+            if not existing.target_equals(claim.getTarget()):
+                continue
+            # If some attribute of the claim being added
+            # matches some attribute in an existing claim of
+            # the same property, skip the claim, unless the
+            # 'exists' argument overrides it.
+            if 't' not in exists_arg:
+                logger_callback(
+                    'Skipping %s because claim with same target already exists'
+                    % (claim.getID(),))
+                pywikibot.log(
+                    "Append 't' to -exists argument to override this behavior")
+                return False
+            if 'q' not in exists_arg and not existing.qualifiers:
+                logger_callback(
+                    'Skipping %s because claim without qualifiers already 
exists'
+                    % (claim.getID(),))
+                pywikibot.log(
+                    "Append 'q' to -exists argument to override this behavior")
+                return False
+            if ('s' not in exists_arg or not source) and not existing.sources:
+                logger_callback(
+                    'Skipping %s because claim without source already exists'
+                    % (claim.getID(),))
+                pywikibot.log(
+                    "Append 's' to -exists argument to override this behavior")
+                return False
+            if ('s' not in exists_arg and source
+                    and any(source.getID() in ref
+                            and all(snak.target_equals(source.getTarget())
+                                    for snak in ref[source.getID()])
+                            for ref in existing.sources)):
+                logger_callback(
+                    'Skipping %s because claim with the same source already 
exists'
+                    % (claim.getID(),))
+                pywikibot.log(
+                    "Append 's' to -exists argument to override this behavior")
+                return False
+
+        return self.user_add_claim(item, claim, source, **kwargs)
+
     def run(self):
         """Process all pages in generator."""
         if not hasattr(self, 'generator'):
diff --git a/scripts/claimit.py b/scripts/claimit.py
index cdf8256..9e0693e 100755
--- a/scripts/claimit.py
+++ b/scripts/claimit.py
@@ -97,56 +97,8 @@
         source = self.getSource(page.site)
 
         for claim in self.claims:
-            # Existing claims on page of same property
-            for existing in item.claims.get(claim.getID(), []):
-                # If claim with same property already exists...
-                if 'p' not in self.exists_arg:
-                    pywikibot.log(
-                        'Skipping %s because claim with same property already 
exists'
-                        % (claim.getID(),))
-                    pywikibot.log(
-                        'Use -exists:p option to override this behavior')
-                    break
-                if not existing.target_equals(claim.getTarget()):
-                    continue
-                # If some attribute of the claim being added
-                # matches some attribute in an existing claim of
-                # the same property, skip the claim, unless the
-                # 'exists' argument overrides it.
-                if 't' not in self.exists_arg:
-                    pywikibot.log(
-                        'Skipping %s because claim with same target already 
exists'
-                        % (claim.getID(),))
-                    pywikibot.log(
-                        "Append 't' to -exists argument to override this 
behavior")
-                    break
-                if 'q' not in self.exists_arg and not existing.qualifiers:
-                    pywikibot.log(
-                        'Skipping %s because claim without qualifiers already 
exists'
-                        % (claim.getID(),))
-                    pywikibot.log(
-                        "Append 'q' to -exists argument to override this 
behavior")
-                    break
-                if ('s' not in self.exists_arg or not source) and not 
existing.sources:
-                    pywikibot.log(
-                        'Skipping %s because claim without source already 
exists'
-                        % (claim.getID(),))
-                    pywikibot.log(
-                        "Append 's' to -exists argument to override this 
behavior")
-                    break
-                if ('s' not in self.exists_arg and source and
-                        any(source.getID() in ref and
-                            all(snak.target_equals(source.getTarget())
-                                for snak in ref[source.getID()])
-                            for ref in existing.sources)):
-                    pywikibot.log(
-                        'Skipping %s because claim with the same source 
already exists'
-                        % (claim.getID(),))
-                    pywikibot.log(
-                        "Append 's' to -exists argument to override this 
behavior")
-                    break
-            else:
-                self.user_add_claim(item, claim, page.site)
+            self.user_add_claim_unless_exists(
+                item, claim, self.exists_arg, page.site)
 
 
 def main(*args):
diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 775bf17..a3fe049 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -30,6 +30,12 @@
 
 -islink           Treat plain text values as links ("text" -> "[[text]]").
 
+-exists           If set to 'p', add a new value, even if the item already
+                  has the imported property but not the imported value.
+                  If set to 'pt', add a new value, even if the item already
+                  has the imported property, the imported value and claim has
+                  some qualifiers.
+
 Examples:
 
     python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \
@@ -56,6 +62,14 @@
         -template:"Infobox person" birth_place P19 -islink death_place P20
 
     will do the same but only "birth_place" can be imported without a link.
+
+    python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \
+        -template:"Infobox person" occupation P106 -exists:p
+
+    will import an occupation from "occupation" parameter of "Infobox
+    person" on English Wikipedia as Wikidata property "P106" (occupation). The
+    page won't be skipped if the item already has that property but there is
+    not the new value.
 
 """
 #
@@ -96,6 +110,7 @@
 
     availableOptions = {
         'islink': False,
+        'exists': '',
     }
 
 
@@ -118,6 +133,7 @@
         """
         self.availableOptions.update({
             'always': True,
+            'exists': '',
             'islink': False,
         })
         super(HarvestRobot, self).__init__(**kwargs)
@@ -190,13 +206,13 @@
         Compare bot's (global) and provided (local) options.
 
         @see: L{pywikibot.bot.OptionHandler.getOption}
-
-        @rtype: bool
         """
-        # TODO: only works with booleans
         default = self.getOption(option)
         local = handler.getOption(option)
-        return default is not local
+        if isinstance(default, bool) and isinstance(local, bool):
+            return default is not local
+        else:
+            return local or default
 
     def treat(self, page, item):
         """Process a single page/item."""
@@ -221,80 +237,70 @@
                     "Failed parsing template; '%s' should be the template 
name."
                     % template)
                 continue
+
+            if template not in self.templateTitles:
+                continue
             # We found the template we were looking for
-            if template in self.templateTitles:
-                for field, value in fielddict.items():
-                    field = field.strip()
-                    value = value.strip()
-                    if not field or not value:
+            for field, value in fielddict.items():
+                field = field.strip()
+                value = value.strip()
+                if not field or not value:
+                    continue
+
+                if field not in self.fields:
+                    continue
+
+                # This field contains something useful for us
+                prop, options = self.fields[field]
+                claim = pywikibot.Claim(self.repo, prop)
+                if claim.type == 'wikibase-item':
+                    # Try to extract a valid page
+                    match = pywikibot.link_regex.search(value)
+                    if match:
+                        link_text = match.group(1)
+                    else:
+                        if self._get_option_with_fallback(options, 'islink'):
+                            link_text = value
+                        else:
+                            pywikibot.output(
+                                '%s field %s value %s is not a wikilink. '
+                                'Skipping.' % (claim.getID(), field, value))
+                            continue
+
+                    linked_item = self._template_link_target(item, link_text)
+                    if not linked_item:
                         continue
 
-                    # This field contains something useful for us
-                    if field in self.fields:
-                        prop, options = self.fields[field]
-                        # Check if the property isn't already set
-                        claim = pywikibot.Claim(self.repo, prop)
-                        if claim.getID() in item.get().get('claims'):
-                            pywikibot.output(
-                                'A claim for %s already exists. Skipping.'
-                                % claim.getID())
-                            # TODO: Implement smarter approach to merging
-                            # harvested values with existing claims esp.
-                            # without overwriting humans unintentionally.
-                        else:
-                            if claim.type == 'wikibase-item':
-                                # Try to extract a valid page
-                                match = pywikibot.link_regex.search(value)
-                                if match:
-                                    link_text = match.group(1)
-                                else:
-                                    if self._get_option_with_fallback(
-                                            options, 'islink'):
-                                        link_text = value
-                                    else:
-                                        pywikibot.output(
-                                            '%s field %s value %s is not a '
-                                            'wikilink. Skipping.'
-                                            % (claim.getID(), field, value))
-                                        continue
+                    claim.setTarget(linked_item)
+                elif claim.type in ('string', 'external-id'):
+                    claim.setTarget(value.strip())
+                elif claim.type == 'url':
+                    match = self.linkR.search(value)
+                    if not match:
+                        continue
+                    claim.setTarget(match.group('url'))
+                elif claim.type == 'commonsMedia':
+                    commonssite = pywikibot.Site('commons', 'commons')
+                    imagelink = pywikibot.Link(
+                        value, source=commonssite, defaultNamespace=6)
+                    image = pywikibot.FilePage(imagelink)
+                    if image.isRedirectPage():
+                        image = pywikibot.FilePage(image.getRedirectTarget())
+                    if not image.exists():
+                        pywikibot.output(
+                            "{0} doesn't exist. I can't link to it"
+                            ''.format(image.title(asLink=True)))
+                        continue
+                    claim.setTarget(image)
+                else:
+                    pywikibot.output('%s is not a supported datatype.'
+                                     % claim.type)
+                    continue
 
-                                linked_item = self._template_link_target(
-                                    item, link_text)
-                                if not linked_item:
-                                    continue
-
-                                claim.setTarget(linked_item)
-                            elif claim.type in ('string', 'external-id'):
-                                claim.setTarget(value.strip())
-                            elif claim.type == 'url':
-                                match = self.linkR.search(value)
-                                if not match:
-                                    continue
-                                claim.setTarget(match.group('url'))
-                            elif claim.type == 'commonsMedia':
-                                commonssite = pywikibot.Site('commons',
-                                                             'commons')
-                                imagelink = pywikibot.Link(value,
-                                                           source=commonssite,
-                                                           defaultNamespace=6)
-                                image = pywikibot.FilePage(imagelink)
-                                if image.isRedirectPage():
-                                    image = pywikibot.FilePage(
-                                        image.getRedirectTarget())
-                                if not image.exists():
-                                    pywikibot.output(
-                                        "{0} doesn't exist. I can't link to it"
-                                        ''.format(image.title(asLink=True)))
-                                    continue
-                                claim.setTarget(image)
-                            else:
-                                pywikibot.output(
-                                    '%s is not a supported datatype.'
-                                    % claim.type)
-                                continue
-
-                            # A generator might yield pages from multiple sites
-                            self.user_add_claim(item, claim, page.site)
+                # A generator might yield pages from multiple sites
+                self.user_add_claim_unless_exists(
+                    item, claim, self._get_option_with_fallback('exists'),
+                    pywikibot.output, page.site)
 
 
 def main(*args):

-- 
To view, visit https://gerrit.wikimedia.org/r/370664
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I4b4bb9cd2f7427b2226c05212b27d0113163464f
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Matěj Suchánek <matejsuchane...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to