Ejegg has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/371586 )

Change subject: [WIP] harvest multiple values from one parameter
......................................................................

[WIP] harvest multiple values from one parameter

Does not work with -islink

FIXME: refactor lame claim/claims codepaths, work with claim types
besides wikibase-item

Bug: T87689
Change-Id: Ied808405a21213e165d51b3fe3d79dfd883e58c0
---
M scripts/harvest_template.py
1 file changed, 54 insertions(+), 19 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/86/371586/1

diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 5d99364..5959270 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -71,6 +71,15 @@
     page won't be skipped if the item already has that property but there is
     not the new value.
 
+    python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \
+        -template:"Infobox musical artist" current_members P527 -exists:p 
-multi
+
+    will import band members from the "current_members" parameter of "Infobox
+    musical artist" on English Wikipedia as Wikidata property "P527" (has 
part).
+    This will only extract multiple band members if each is linked, and will 
not
+    add duplicate claims for the same member.
+
+    TODO: 'multi' implies at least exists:p - set that automatically?
 """
 #
 # (C) Multichill, Amir, 2013
@@ -109,8 +118,9 @@
     """Class holding options for a param-property pair."""
 
     availableOptions = {
-        'islink': False,
         'exists': '',
+        'islink': False,
+        'multi': False,
     }
 
 
@@ -130,11 +140,14 @@
         @type fields: dict
         @keyword islink: Whether non-linked values should be treated as links
         @type islink: bool
+        @keyword multi: Whether multiple values should be extracted from a 
single parameter
+        @type multi: bool
         """
         self.availableOptions.update({
             'always': True,
             'exists': '',
             'islink': False,
+            'multi': False,
         })
         super(HarvestRobot, self).__init__(**kwargs)
         self.generator = generator
@@ -220,7 +233,8 @@
             raise KeyboardInterrupt
         self.current_page = page
         item.get()
-        if set(val[0] for val in self.fields.values()) <= set(
+        any_multi = any('multi' in val[1].options for val in 
self.fields.values())
+        if not any_multi and set(val[0] for val in self.fields.values()) <= 
set(
                 item.claims.keys()):
             pywikibot.output('%s item %s has claims for all properties. '
                              'Skipping.' % (page, item.title()))
@@ -253,25 +267,43 @@
                 # This field contains something useful for us
                 prop, options = self.fields[field]
                 claim = pywikibot.Claim(self.repo, prop)
+                claims = []  # FIXME: this is a horrid way to do multiples
                 if claim.type == 'wikibase-item':
-                    # Try to extract a valid page
-                    match = pywikibot.link_regex.search(value)
-                    if match:
-                        link_text = match.group(1)
-                    else:
-                        if self._get_option_with_fallback(options, 'islink'):
-                            link_text = value
-                        else:
+                    if self._get_option_with_fallback(options, 'multi'):
+                        matches = pywikibot.link_regex.findall(value)
+                        if matches:
+                            for match in matches:
+                                link_text = match[0]
+                                linked_item = self._template_link_target(item, 
link_text)
+                                if not linked_item:
+                                    continue
+                                claim.setTarget(linked_item)
+                                claims.append(claim)
+                                claim = pywikibot.Claim(self.repo, prop)
+                        if len(claims) == 0:
                             pywikibot.output(
-                                '%s field %s value %s is not a wikilink. '
+                                '%s field %s value %s contains no wikilinks to 
data items. '
                                 'Skipping.' % (claim.getID(), field, value))
                             continue
+                    else:
+                        # Try to extract a valid page
+                        match = pywikibot.link_regex.search(value)
+                        if match:
+                            link_text = match.group(1)
+                        else:
+                            if self._get_option_with_fallback(options, 
'islink'):
+                                link_text = value
+                            else:
+                                pywikibot.output(
+                                    '%s field %s value %s is not a wikilink. '
+                                    'Skipping.' % (claim.getID(), field, 
value))
+                                continue
 
-                    linked_item = self._template_link_target(item, link_text)
-                    if not linked_item:
-                        continue
+                        linked_item = self._template_link_target(item, 
link_text)
+                        if not linked_item:
+                            continue
 
-                    claim.setTarget(linked_item)
+                        claim.setTarget(linked_item)
                 elif claim.type in ('string', 'external-id'):
                     claim.setTarget(value.strip())
                 elif claim.type == 'url':
@@ -297,10 +329,13 @@
                                      % claim.type)
                     continue
 
-                # A generator might yield pages from multiple sites
-                self.user_add_claim_unless_exists(
-                    item, claim, self._get_option_with_fallback('exists'),
-                    pywikibot.output, page.site)
+                if len(claims) == 0:
+                    claims.append(claim)
+                for add_claim in claims:
+                    # A generator might yield pages from multiple sites
+                    self.user_add_claim_unless_exists(
+                        item, add_claim, 
self._get_option_with_fallback(options, 'exists'),
+                        pywikibot.output, page.site)
 
 
 def main(*args):

-- 
To view, visit https://gerrit.wikimedia.org/r/371586
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ied808405a21213e165d51b3fe3d79dfd883e58c0
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Ejegg <ej...@ejegg.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to