Joal has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/360866 )

Change subject: Add two tables to sqoop on hadoop
......................................................................


Add two tables to sqoop on hadoop

Those two tables are needed for production run
of the clickstream dataset.
Also add scripts for related hive tables creation.

Change-Id: I97455bd3779906d7228595e10c2e134b39720b42
---
M bin/sqoop-mediawiki-tables
A hive/mediawiki/history/create_mediawiki_pagelinks_table.hql
A hive/mediawiki/history/create_mediawiki_redirect_table.hql
3 files changed, 86 insertions(+), 0 deletions(-)

Approvals:
  Joal: Verified; Looks good to me, approved



diff --git a/bin/sqoop-mediawiki-tables b/bin/sqoop-mediawiki-tables
index a2571ae..1ef30c2 100755
--- a/bin/sqoop-mediawiki-tables
+++ b/bin/sqoop-mediawiki-tables
@@ -227,6 +227,31 @@
         ])),
     }
 
+    queries['pagelinks'] = {
+        'query': '''
+             select pl_from,
+                    pl_namespace,
+                    convert(pl_title using utf8) pl_title,
+                    pl_from_namespace
+
+               from pagelinks
+              where $CONDITIONS
+        ''',
+    }
+
+    queries['redirect'] = {
+        'query': '''
+             select rd_from,
+                    rd_namespace,
+                    convert(rd_title using utf8) rd_title,
+                    convert(rd_interwiki using utf8) rd_interwiki,
+                    convert(rd_fragment using utf8) rd_fragment
+
+               from redirect
+              where $CONDITIONS
+        ''',
+    }
+
     queries['revision'] = {
         'query': '''
              select rev_id,
diff --git a/hive/mediawiki/history/create_mediawiki_pagelinks_table.hql 
b/hive/mediawiki/history/create_mediawiki_pagelinks_table.hql
new file mode 100644
index 0000000..12ef642
--- /dev/null
+++ b/hive/mediawiki/history/create_mediawiki_pagelinks_table.hql
@@ -0,0 +1,30 @@
+-- Creates table statement for raw mediawiki_pagelinks table.
+--
+-- Parameters:
+--     <none>
+--
+-- Usage
+--     hive -f create_mediawiki_pagelinks_table.hql \
+--         --database wmf_raw
+--
+
+CREATE EXTERNAL TABLE `mediawiki_pagelinks`(
+  `pl_from`             bigint      COMMENT 'Key to the page_id of the page 
containing the link',
+  `pl_namespace`        bigint      COMMENT 'Key to page_namespace of the 
target page. The target page may or may not exist, and due to renames and 
deletions may refer to different page records as time goes by',
+  `pl_title`            string      COMMENT 'Key to page_title of the target 
page. The target page may or may not exist, and due to renames and deletions 
may refer to different page records as time goes by. Spaces are converted to 
underscores, and the first letter is automatically capitalized. So for example, 
a link to foo bar would have a pl_title of "Foo_bar"',
+  `pl_from_namespace`   bigint      COMMENT 'MediaWiki version:  ≥ 1.24 - 
page_namespace of the page containing the link'
+)
+COMMENT
+  'See most up to date documentation at 
https://www.mediawiki.org/wiki/Manual:Pagelinks_table'
+PARTITIONED BY (
+  `snapshot` string COMMENT 'Versioning information to keep multiple datasets 
(YYYY-MM for regular labs imports)',
+  `wiki_db` string COMMENT 'The wiki_db project')
+ROW FORMAT SERDE
+  'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS INPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+LOCATION
+  'hdfs://analytics-hadoop/wmf/data/raw/mediawiki/tables/pagelinks'
+;
diff --git a/hive/mediawiki/history/create_mediawiki_redirect_table.hql 
b/hive/mediawiki/history/create_mediawiki_redirect_table.hql
new file mode 100644
index 0000000..ddf1883
--- /dev/null
+++ b/hive/mediawiki/history/create_mediawiki_redirect_table.hql
@@ -0,0 +1,31 @@
+-- Creates table statement for raw mediawiki_redirect table.
+--
+-- Parameters:
+--     <none>
+--
+-- Usage
+--     hive -f create_mediawiki_redirect_table.hql \
+--         --database wmf_raw
+--
+
+CREATE EXTERNAL TABLE `mediawiki_redirect`(
+  `rd_from`             bigint      COMMENT 'Contains the page_id of the 
source page',
+  `rd_namespace`        bigint      COMMENT 'Contains the number of the 
target''s namespace',
+  `rd_title`            string      COMMENT 'Contains the sanitized title of 
the target page. It is stored as text, with spaces replaced by underscores',
+  `rd_interwiki`        string      COMMENT 'MediaWiki version:  ≥ 1.16 - This 
field is not empty only if an interwiki prefix is used: #REDIRECT [[prefix:…]] 
(a prefix such as "w:" for Wikipedia, or an interlanguage link prefix such as 
"nl:" for Dutch, but not both). In this case a redirect is not visible in 
Special:WhatLinksHere (even if the target lies in the same wiki), rd_namespace 
is always 0 and rd_title may contain a possible namespace prefix, but 
rd_fragment may be non-NULL',
+  `rd_fragment`         string      COMMENT 'Contains the target''s fragment 
ID if present (see also bugzilla:218), otherwise is NULL'
+)
+COMMENT
+  'See most up to date documentation at 
https://www.mediawiki.org/wiki/Manual:Redirect_table'
+PARTITIONED BY (
+  `snapshot` string COMMENT 'Versioning information to keep multiple datasets 
(YYYY-MM for regular labs imports)',
+  `wiki_db` string COMMENT 'The wiki_db project')
+ROW FORMAT SERDE
+  'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS INPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+LOCATION
+  'hdfs://analytics-hadoop/wmf/data/raw/mediawiki/tables/redirect'
+;

-- 
To view, visit https://gerrit.wikimedia.org/r/360866
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I97455bd3779906d7228595e10c2e134b39720b42
Gerrit-PatchSet: 4
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to