Joal has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/360866 )
Change subject: Add two tables to sqoop on hadoop
......................................................................
Add two tables to sqoop on hadoop
Those two tables are needed for production run
of the clickstream dataset.
Also add scripts for related hive tables creation.
Change-Id: I97455bd3779906d7228595e10c2e134b39720b42
---
M bin/sqoop-mediawiki-tables
A hive/mediawiki/history/create_mediawiki_pagelinks_table.hql
A hive/mediawiki/history/create_mediawiki_redirect_table.hql
3 files changed, 86 insertions(+), 0 deletions(-)
Approvals:
Joal: Verified; Looks good to me, approved
diff --git a/bin/sqoop-mediawiki-tables b/bin/sqoop-mediawiki-tables
index a2571ae..1ef30c2 100755
--- a/bin/sqoop-mediawiki-tables
+++ b/bin/sqoop-mediawiki-tables
@@ -227,6 +227,31 @@
])),
}
+ queries['pagelinks'] = {
+ 'query': '''
+ select pl_from,
+ pl_namespace,
+ convert(pl_title using utf8) pl_title,
+ pl_from_namespace
+
+ from pagelinks
+ where $CONDITIONS
+ ''',
+ }
+
+ queries['redirect'] = {
+ 'query': '''
+ select rd_from,
+ rd_namespace,
+ convert(rd_title using utf8) rd_title,
+ convert(rd_interwiki using utf8) rd_interwiki,
+ convert(rd_fragment using utf8) rd_fragment
+
+ from redirect
+ where $CONDITIONS
+ ''',
+ }
+
queries['revision'] = {
'query': '''
select rev_id,
diff --git a/hive/mediawiki/history/create_mediawiki_pagelinks_table.hql
b/hive/mediawiki/history/create_mediawiki_pagelinks_table.hql
new file mode 100644
index 0000000..12ef642
--- /dev/null
+++ b/hive/mediawiki/history/create_mediawiki_pagelinks_table.hql
@@ -0,0 +1,30 @@
+-- Creates table statement for raw mediawiki_pagelinks table.
+--
+-- Parameters:
+-- <none>
+--
+-- Usage
+-- hive -f create_mediawiki_pagelinks_table.hql \
+-- --database wmf_raw
+--
+
+CREATE EXTERNAL TABLE `mediawiki_pagelinks`(
+ `pl_from` bigint COMMENT 'Key to the page_id of the page
containing the link',
+ `pl_namespace` bigint COMMENT 'Key to page_namespace of the
target page. The target page may or may not exist, and due to renames and
deletions may refer to different page records as time goes by',
+ `pl_title` string COMMENT 'Key to page_title of the target
page. The target page may or may not exist, and due to renames and deletions
may refer to different page records as time goes by. Spaces are converted to
underscores, and the first letter is automatically capitalized. So for example,
a link to foo bar would have a pl_title of "Foo_bar"',
+ `pl_from_namespace` bigint COMMENT 'MediaWiki version: ≥ 1.24 -
page_namespace of the page containing the link'
+)
+COMMENT
+ 'See most up to date documentation at
https://www.mediawiki.org/wiki/Manual:Pagelinks_table'
+PARTITIONED BY (
+ `snapshot` string COMMENT 'Versioning information to keep multiple datasets
(YYYY-MM for regular labs imports)',
+ `wiki_db` string COMMENT 'The wiki_db project')
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+LOCATION
+ 'hdfs://analytics-hadoop/wmf/data/raw/mediawiki/tables/pagelinks'
+;
diff --git a/hive/mediawiki/history/create_mediawiki_redirect_table.hql
b/hive/mediawiki/history/create_mediawiki_redirect_table.hql
new file mode 100644
index 0000000..ddf1883
--- /dev/null
+++ b/hive/mediawiki/history/create_mediawiki_redirect_table.hql
@@ -0,0 +1,31 @@
+-- Creates table statement for raw mediawiki_redirect table.
+--
+-- Parameters:
+-- <none>
+--
+-- Usage
+-- hive -f create_mediawiki_redirect_table.hql \
+-- --database wmf_raw
+--
+
+CREATE EXTERNAL TABLE `mediawiki_redirect`(
+ `rd_from` bigint COMMENT 'Contains the page_id of the
source page',
+ `rd_namespace` bigint COMMENT 'Contains the number of the
target''s namespace',
+ `rd_title` string COMMENT 'Contains the sanitized title of
the target page. It is stored as text, with spaces replaced by underscores',
+ `rd_interwiki` string COMMENT 'MediaWiki version: ≥ 1.16 - This
field is not empty only if an interwiki prefix is used: #REDIRECT [[prefix:…]]
(a prefix such as "w:" for Wikipedia, or an interlanguage link prefix such as
"nl:" for Dutch, but not both). In this case a redirect is not visible in
Special:WhatLinksHere (even if the target lies in the same wiki), rd_namespace
is always 0 and rd_title may contain a possible namespace prefix, but
rd_fragment may be non-NULL',
+ `rd_fragment` string COMMENT 'Contains the target''s fragment
ID if present (see also bugzilla:218), otherwise is NULL'
+)
+COMMENT
+ 'See most up to date documentation at
https://www.mediawiki.org/wiki/Manual:Redirect_table'
+PARTITIONED BY (
+ `snapshot` string COMMENT 'Versioning information to keep multiple datasets
(YYYY-MM for regular labs imports)',
+ `wiki_db` string COMMENT 'The wiki_db project')
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+LOCATION
+ 'hdfs://analytics-hadoop/wmf/data/raw/mediawiki/tables/redirect'
+;
--
To view, visit https://gerrit.wikimedia.org/r/360866
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I97455bd3779906d7228595e10c2e134b39720b42
Gerrit-PatchSet: 4
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits