GoranSMilovanovic has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/398692 )

Change subject: Labs processing 17 Dec 2017
......................................................................

Labs processing 17 Dec 2017

Change-Id: I1cfad83ed71be0e046205a5cff09987df0a8ab7f
---
M WDCM_EngineGeo_goransm.R
A WDCM_Process.R
A WDCM_Update_Labs.R
3 files changed, 1,159 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/wmde/WDCM 
refs/changes/92/398692/1

diff --git a/WDCM_EngineGeo_goransm.R b/WDCM_EngineGeo_goransm.R
index 635606b..4a04518 100644
--- a/WDCM_EngineGeo_goransm.R
+++ b/WDCM_EngineGeo_goransm.R
@@ -288,14 +288,14 @@
 ### --- join coordinates, items, labels, and usage
 setwd(dataDir)
 
-# - list .tsv files
+# - list .tsv files from dataDir
 lF <- list.files()
 w <- which(grepl("^wdcm_geoitem", lF))
 lF <- lF[w]
 w <- which(grepl(".tsv", lF, fixed = T))
 lF <- lF[w]
 
-# - remove old .csv files:
+# - remove old .csv files from dataDir
 rmF <- list.files()
 w <- which(grepl("^wdcm_geoitem", rmF))
 rmF <- rmF[w]
diff --git a/WDCM_Process.R b/WDCM_Process.R
new file mode 100644
index 0000000..467892d
--- /dev/null
+++ b/WDCM_Process.R
@@ -0,0 +1,910 @@
+
+### ---------------------------------------------------------------------------
+### --- WDCM Process Module, v. Beta 0.1
+### --- Script: WDCM_Process_v2.R, v. Beta 0.1
+### ---------------------------------------------------------------------------
+### --- DESCRIPTION:
+### --- WDCM_Process_v2.R takes a list of .tsv files that present
+### --- the data from wbc_entity_usage tables accross the client projects
+### --- fetched from production (stat1005) by WDCM_Search_Clients.R and 
+### --- further pre-processed by WDCM_Pre-Process.R (also on production).
+### --- The goal of this WDCM module/script is to produce (or update) 
+### --- the WDCM Stats Dashboard database.
+### ---------------------------------------------------------------------------
+### --- INPUT: 
+### --- the WDCM_Process_v2.R reads the .tsv input files from:
+### --- /home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_DataIN_ClientUsage_v2/
+### --- on the wikidataconcepts.eqiad.wmflabs Cloud VPS instance
+### --- These files are brought to Labs directly from productio
+### --- (currently the stat1005.eqiad.wmnet statbox)
+### ---------------------------------------------------------------------------
+### --- OUTPUT: the WDCM Dashboards MariaDB database is update
+### ---------------------------------------------------------------------------
+
+### ---------------------------------------------------------------------------
+### --- LICENSE:
+### ---------------------------------------------------------------------------
+### --- GPL v2
+### --- This file is part of Wikidata Concepts Monitor (WDCM)
+### ---
+### --- WDCM is free software: you can redistribute it and/or modify
+### --- it under the terms of the GNU General Public License as published by
+### --- the Free Software Foundation, either version 2 of the License, or
+### --- (at your option) any later version.
+### ---
+### --- WDCM is distributed in the hope that it will be useful,
+### --- but WITHOUT ANY WARRANTY; without even the implied warranty of
+### --- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+### --- GNU General Public License for more details.
+### ---
+### --- You should have received a copy of the GNU General Public License
+### --- along with WDCM. If not, see <http://www.gnu.org/licenses/>.
+### ---------------------------------------------------------------------------
+
+### --- Setup
+library(RMySQL)
+library(httr)
+library(XML)
+library(data.table)
+library(dplyr)
+library(tidyr)
+library(readr)
+library(htmltab)
+library(snowfall)
+library(maptpx)
+library(Rtsne)
+
+# - mysql --defaults-file=/home/goransm/mySQL_Credentials/replica.my.cnf -h 
tools.labsdb u16664__wdcm_p
+# - database: u16664__wdcm_p
+
+### --- functions
+
+# - projectType() to determine project type
+projectType <- function(projectName) {
+  unname(sapply(projectName, function(x) {
+    if (grepl("commons", x, fixed = T)) {"Commons"
+    } else if (grepl("mediawiki|meta|species|wikidata", x)) {"Other"
+    } else if (grepl("wiki$", x)) {"Wikipedia"
+    } else if (grepl("quote$", x)) {"Wikiquote"
+    } else if (grepl("voyage$", x)) {"Wikivoyage"
+    } else if (grepl("news$", x)) {"Wikinews"
+    } else if (grepl("source$", x)) {"Wikisource"
+    } else if (grepl("wiktionary$", x)) {"Wiktionary"
+    } else if (grepl("versity$", x)) {"Wikiversity"
+    } else if (grepl("books$", x)) {"Wikibooks"
+    } else {"Other"}
+  }))
+}
+
+### ---------------------------------------------------------------------------
+### --- NOTE:
+### --- TABLE NAMING CONVENTION FOR v2 (WDCM Stats Dashboard)
+### --- wdcm2_something
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("WDCM Process.R update started at: ", Sys.time(), sep = ""))
+
+# - to working directory:
+# - credentials on tools.labsdb
+setwd('/home/goransm/WMDE/WDCM/WDCM_RScripts')
+mySQLCreds <- read.csv("mySQLCreds.csv",
+                       header = T,
+                       check.names = F,
+                       row.names = 1,
+                       stringsAsFactors = F)
+
+### --- list existing tables
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+q <- "SHOW TABLES;"
+res <- dbSendQuery(con, q)
+st <- fetch(res, -1)
+dbClearResult(res)
+dbDisconnect(con)
+colnames(st) <- "tables"
+# - keep only v2 tables:
+st <- st[which(grepl("^wdcm2", st$tables)), ]
+
+### ---------------------------------------------------------------------------
+### ---------------------------------------------------------------------------
+### --- CREATE TABLES for WDCM Projects Stats Dashboard
+### ---------------------------------------------------------------------------
+### ---------------------------------------------------------------------------
+
+# - to /home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_DataIN_ClientUsage_v2/
+inputDir <- '/home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_DataIN_ClientUsage_v2/'
+setwd(inputDir)
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_project
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_project: ", Sys.time(), sep = ""))
+
+# - load wdcm_project.tsv as data.frame
+wdcm2_project <- fread('wdcm_project.csv', data.table = F)
+wdcm2_project$V1 <- NULL
+# - check whether wdcm2_project table exists:
+checkTable <- which(st %in% "wdcm2_project")
+# - DROP wdcm2_project if it exists
+if (length(checkTable) == 1) {
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- "DROP TABLE wdcm2_project;"
+  res <- dbSendQuery(con, q)
+  dbClearResult(res)
+  dbDisconnect(con)
+}
+# - CREATE wdcm2_project:
+# - write to wdcm2_project
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+dbWriteTable(conn = con,
+             name = "wdcm2_project",
+             value = wdcm2_project,
+             row.names = F,
+             append = T)
+dbDisconnect(con)
+rm(wdcm2_project); gc()
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_category
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_category: ", Sys.time(), sep = ""))
+
+# - load wdcm_category.tsv as data.frame
+wdcm2_category <- fread('wdcm_category.csv', data.table = F)
+wdcm2_category$V1 <- NULL
+# - check whether wdcm2_category table exists:
+checkTable <- which(st %in% "wdcm2_category")
+# - DROP wdcm2_category if it exists
+if (length(checkTable) == 1) {
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- "DROP TABLE wdcm2_category;"
+  res <- dbSendQuery(con, q)
+  dbClearResult(res)
+  dbDisconnect(con)
+}
+# - CREATE wdcm2_category
+# - write to wdcm2_category
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+dbWriteTable(conn = con,
+             name = "wdcm2_category",
+             value = wdcm2_category,
+             row.names = F,
+             append = T)
+dbDisconnect(con)
+rm(wdcm2_category); gc()
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_project_category
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_project_category: ", Sys.time(), sep = ""))
+
+# - load wdcm_category.tsv as data.frame
+wdcm2_project_category <- fread('wdcm_project_category.csv', data.table = F)
+wdcm2_project_category$V1 <- NULL
+### --- FIX THIS (!!!)
+colnames(wdcm2_project_category)[4] <- 'projecttype'
+### --- FIX THIS (!!!)
+# - check whether wdcm2_project_category table exists:
+checkTable <- which(st %in% "wdcm2_project_category")
+# - DROP wdcm2_project_category if it exists
+if (length(checkTable) == 1) {
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- "DROP TABLE wdcm2_project_category;"
+  res <- dbSendQuery(con, q)
+  dbClearResult(res)
+  dbDisconnect(con)
+}
+# - CREATE wdcm2_project_category
+# - write to wdcm2_project_category
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+qCreate <- 'CREATE TABLE wdcm2_project_category ( 
+              eu_project VARCHAR(255) NOT NULL,
+              category VARCHAR(37) NOT NULL,
+              eu_count INT, 
+              projecttype VARCHAR(255) NOT NULL
+            ) 
+            ENGINE=InnoDB 
+            ROW_FORMAT=COMPRESSED;'
+res <- dbSendQuery(con, qCreate)
+dbClearResult(res)
+# - populate wdcm2_project_category
+dbWriteTable(conn = con,
+             name = "wdcm2_project_category",
+             value = wdcm2_project_category,
+             row.names = F,
+             append = T)
+dbDisconnect(con)
+rm(wdcm2_project_category); gc()
+# - CREATE INDEX on wdcm2_project_category
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+# - CREATE INDEX
+q <- "CREATE INDEX wdcm2_project_category_eu_project_category ON 
wdcm2_project_category (eu_project, category); "
+res <- dbSendQuery(con, q)
+dbClearResult(res)
+dbDisconnect(con)
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_project_item100
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_project_item100: ", Sys.time(), sep = ""))
+
+# - load wdcm_project_item100.tsv as data.frame
+wdcm2_project_item100 <- fread('wdcm_project_item100.csv', data.table = F)
+wdcm2_project_item100$V1 <- NULL
+wdcm2_project_item100$t.row <- NULL
+colnames(wdcm2_project_item100) <- c('eu_project', 'eu_entity_id', 'eu_count', 
'projecttype')
+# - check whether wdcm2_project_item100 table exists:
+checkTable <- which(st %in% "wdcm2_project_item100")
+# - DROP wdcm2_project_category if it exists
+if (length(checkTable) == 1) {
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- "DROP TABLE wdcm2_project_item100;"
+  res <- dbSendQuery(con, q)
+  dbClearResult(res)
+  dbDisconnect(con)
+}
+# - get item labels for wdcm2_project_item100
+items <- gsub("Q", "", unique(wdcm2_project_item100$eu_entity_id), fixed = T)
+itemList <- paste0(items, collapse = ", ", sep = "")
+# - fetch English labels where available from wb_terms
+con <- dbConnect(MySQL(), 
+                 host = "wikidatawiki.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "wikidatawiki_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+q <- paste("SELECT term_entity_id, term_text FROM wb_terms ",
+           "WHERE ((term_entity_id IN (", itemList, ")) AND ",
+           "(term_language = 'en') AND ",
+           "(term_entity_type = 'item') AND (term_type = 'label'));", 
+           sep = "");
+rm(itemList)
+res <- dbSendQuery(con, q)
+itemLabels <- fetch(res, -1)
+dbClearResult(res)
+dbDisconnect(con)
+itemLabels$term_entity_id <- paste("Q", itemLabels$term_entity_id, sep = "")
+colnames(itemLabels) <- c('eu_entity_id', 'eu_label')
+# - insert labels to wdcm2_project_item100 + 
+wdcm2_project_item100 <- left_join(wdcm2_project_item100, itemLabels, by = 
'eu_entity_id')
+# - recognize missing English labels and replace w. the respective 
eu_entity_id value
+wdcm2_project_item100$eu_label[is.na(wdcm2_project_item100$eu_label)] <- 
+  wdcm2_project_item100$eu_entity_id[is.na(wdcm2_project_item100$eu_label)]
+rm(itemLabels); gc()
+# - CREATE wdcm2_project_item100
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+qCreate <- 'CREATE TABLE wdcm2_project_item100 ( 
+              eu_project VARCHAR(255) NOT NULL, 
+              eu_entity_id VARCHAR(37) NOT NULL, 
+              eu_count INT,
+              projecttype VARCHAR(255) NOT NULL,
+              eu_label VARCHAR(255) NOT NULL) 
+              ENGINE=InnoDB 
+              ROW_FORMAT=COMPRESSED;'
+res <- dbSendQuery(con, qCreate)
+dbClearResult(res)
+# - populate wdcm2_project_item100
+dbWriteTable(conn = con,
+             name = "wdcm2_project_item100",
+             value = wdcm2_project_item100,
+             row.names = F,
+             append = T)
+dbDisconnect(con)
+rm(wdcm2_project_item100); gc()
+# - CREATE INDEX on wdcm2_project_item100
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+q <- "CREATE INDEX wdcm2_project_item100_eu_project ON wdcm2_project_item100 
(eu_project); "
+res <- dbSendQuery(con, q)
+dbClearResult(res)
+q <- "CREATE INDEX wdcm2_project_item100_projecttype ON wdcm2_project_item100 
(projecttype); "
+res <- dbSendQuery(con, q)
+dbClearResult(res)
+dbDisconnect(con)
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_project_category_item100
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_project_category_item100: ", Sys.time(), sep = ""))
+
+# - load wdcm_project_category_item100.tsv as data.frame
+wdcm2_project_category_item100 <- fread('wdcm_project_category_item100.csv', 
data.table = F)
+wdcm2_project_category_item100$V1 <- NULL
+wdcm2_project_category_item100$t.row <- NULL
+colnames(wdcm2_project_category_item100) <- c('eu_project', 'category', 
'eu_entity_id', 'eu_count', 'projecttype')
+# - check whether wdcm2_project_item100 table exists:
+checkTable <- which(st %in% "wdcm2_project_category_item100")
+# - DROP wdcm2_project_category if it exists
+if (length(checkTable) == 1) {
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- "DROP TABLE wdcm2_project_category_item100;"
+  res <- dbSendQuery(con, q)
+  dbClearResult(res)
+  dbDisconnect(con)
+}
+# - get item labels for wdcm2_project_category_item100
+items <- gsub("Q", "", unique(wdcm2_project_category_item100$eu_entity_id), 
fixed = T)
+itemList <- paste0(items, collapse = ", ", sep = "")
+# - fetch English labels where available from wb_terms
+con <- dbConnect(MySQL(), 
+                 host = "wikidatawiki.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "wikidatawiki_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+q <- paste("SELECT term_entity_id, term_text FROM wb_terms ",
+           "WHERE ((term_entity_id IN (", itemList, ")) AND ",
+           "(term_language = 'en') AND ",
+           "(term_entity_type = 'item') AND (term_type = 'label'));", 
+           sep = "");
+rm(itemList); gc()
+res <- dbSendQuery(con, q)
+itemLabels <- fetch(res, -1)
+dbClearResult(res)
+dbDisconnect(con)
+itemLabels$term_entity_id <- paste("Q", itemLabels$term_entity_id, sep = "")
+colnames(itemLabels) <- c('eu_entity_id', 'eu_label')
+# - insert labels to wdcm2_project_category_item100 + 
+wdcm2_project_category_item100 <- left_join(wdcm2_project_category_item100, 
itemLabels, 
+                                            by = 'eu_entity_id')
+# - recognize missing English labels and replace w. the respective 
eu_entity_id value
+wdcm2_project_category_item100$eu_label[is.na(wdcm2_project_category_item100$eu_label)]
 <- 
+  
wdcm2_project_category_item100$eu_entity_id[is.na(wdcm2_project_category_item100$eu_label)]
+rm(itemLabels); gc()
+# - CREATE wdcm2_project_category_item100
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+qCreate <- 'CREATE TABLE wdcm2_project_category_item100 (
+              eu_project VARCHAR(255) NOT NULL, 
+              category VARCHAR(255) NOT NULL,
+              eu_entity_id VARCHAR(37) NOT NULL,
+              eu_count INT,
+              projecttype VARCHAR(255) NOT NULL,
+              eu_label VARCHAR(255) NOT NULL) 
+              ENGINE=InnoDB 
+              ROW_FORMAT=COMPRESSED;'
+res <- dbSendQuery(con, qCreate)
+dbClearResult(res)
+# - populate wdcm2_project_category_item100
+dbWriteTable(conn = con,
+             name = "wdcm2_project_category_item100",
+             value = wdcm2_project_category_item100,
+             row.names = F,
+             append = T)
+dbDisconnect(con)
+rm(wdcm2_project_category_item100); gc()
+# - CREATE INDEX on wdcm2_project_category_item100
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+q <- "CREATE INDEX wdcm2_project_category_item100_eu_project_category ON 
wdcm2_project_category_item100 (eu_project, category);"
+res <- dbSendQuery(con, q)
+dbClearResult(res)
+q <- "CREATE INDEX wdcm2_project_category_item100_eu_project ON 
wdcm2_project_category_item100 (eu_project);"
+res <- dbSendQuery(con, q)
+dbClearResult(res)
+q <- "CREATE INDEX wdcm2_project_category_item100_category ON 
wdcm2_project_category_item100 (category);"
+res <- dbSendQuery(con, q)
+dbClearResult(res)
+q <- "CREATE INDEX wdcm2_project_category_item100_projecttype ON 
wdcm2_project_category_item100 (projecttype);"
+res <- dbSendQuery(con, q)
+dbClearResult(res)
+dbDisconnect(con)
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_category_item100
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_category_item100: ", Sys.time(), sep = ""))
+
+# - check whether wdcm2_category_item100 table exists:
+checkTable <- which(st %in% "wdcm2_category_item100")
+# - DROP wdcm2_category_item100 if it exists
+if (length(checkTable) == 1) {
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- "DROP TABLE wdcm2_category_item100;"
+  res <- dbSendQuery(con, q)
+  dbClearResult(res)
+  dbDisconnect(con)
+}
+# - CREATE wdcm2_category_item100
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+qCreate <- 'CREATE TABLE wdcm2_category_item100 (
+              eu_entity_id VARCHAR(255) NOT NULL,
+              eu_count INT, 
+              category VARCHAR(255) NOT NULL, 
+              eu_label VARCHAR(255) NOT NULL) 
+              ENGINE=InnoDB 
+              ROW_FORMAT=COMPRESSED;'
+res <- dbSendQuery(con, qCreate)
+dbClearResult(res)
+dbDisconnect(con)
+# - populate wdcm2_category_item100
+itemFiles <- list.files()
+itemFiles <- itemFiles[grepl("^wdcm_item", itemFiles)]
+for (i in 1:length(itemFiles)) {
+  # - load categoryFile[i].tsv as data.frame
+  categoryName <- strsplit(itemFiles[i], ".", fixed = T)[[1]][1]
+  categoryName <- strsplit(categoryName, "_", fixed = T)[[1]][3]
+  categoryName <- gsub("([[:lower:]])([[:upper:]])", "\\1 \\2", categoryName)
+  ### --- FIX THIS (!!!)
+  if (categoryName == "Workof Art") {
+    categoryName <- "Work of Art"
+  }
+  ### --- FIX ABOVE (!!!)
+  categoryFile <- fread(itemFiles[i], nrows = 100)
+  categoryFile$category <- categoryName
+  # - get item labels for categoryFile[i]
+  items <- gsub("Q", "", unique(categoryFile$eu_entity_id), fixed = T)
+  itemList <- paste0(items, collapse = ", ", sep = "")
+  # - fetch English labels where available from wb_terms
+  con <- dbConnect(MySQL(), 
+                   host = "wikidatawiki.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "wikidatawiki_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- paste("SELECT term_entity_id, term_text FROM wb_terms ",
+             "WHERE ((term_entity_id IN (", itemList, ")) AND ",
+             "(term_language = 'en') AND ",
+             "(term_entity_type = 'item') AND (term_type = 'label'));", 
+             sep = "");
+  rm(itemList); gc()
+  res <- dbSendQuery(con, q)
+  itemLabels <- fetch(res, -1)
+  dbClearResult(res)
+  dbDisconnect(con)
+  itemLabels$term_entity_id <- paste("Q", itemLabels$term_entity_id, sep = "")
+  colnames(itemLabels) <- c('eu_entity_id', 'eu_label')
+  # - insert labels to categoryFile[i] + 
+  categoryFile <- left_join(categoryFile, itemLabels, by = 'eu_entity_id')
+  # - recognize missing English labels and replace w. the respective 
eu_entity_id value
+  categoryFile$eu_label[is.na(categoryFile$eu_label)] <- 
+    categoryFile$eu_entity_id[is.na(categoryFile$eu_label)]
+  rm(itemLabels); gc()
+  # - populate wdcm2_category_item100 by categoryFile[i]
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  dbWriteTable(conn = con,
+               name = "wdcm2_category_item100",
+               value = categoryFile,
+               row.names = F,
+               append = T)
+  dbDisconnect(con)
+  rm(categoryFile); gc()
+}
+# - CREATE INDEX on wdcm2_category_item100
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+q <- "CREATE INDEX wdcm2_category_item100_category ON wdcm2_category_item100 
(category); "
+res <- dbSendQuery(con, q)
+dbClearResult(res)
+dbDisconnect(con)
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_projects_2dmaps
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_projects_2dmaps: ", Sys.time(), sep = ""))
+
+# - check whether wdcm2_projects_2dmaps table exists:
+checkTable <- which(st %in% "wdcm2_projects_2dmaps")
+# - DROP wdcm2_projects_2dmaps if it exists
+if (length(checkTable) == 1) {
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- "DROP TABLE wdcm2_projects_2dmaps;"
+  res <- dbSendQuery(con, q)
+  dbClearResult(res)
+  dbDisconnect(con)
+}
+# - CREATE wdcm2_projects_2dmaps
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+qCreate <- 'CREATE TABLE wdcm2_projects_2dmaps (
+              D1 FLOAT,
+              D2 FLOAT,
+              eu_project VARCHAR(255) NOT NULL, 
+              projecttype VARCHAR(255) NOT NULL,
+              category VARCHAR(37) NOT NULL) 
+              ENGINE=InnoDB 
+              ROW_FORMAT=COMPRESSED;'
+res <- dbSendQuery(con, qCreate)
+dbClearResult(res)
+dbDisconnect(con)
+# - populate wdcm2_projects_2dmaps
+itemFiles <- list.files()
+itemFiles <- itemFiles[grepl("^wdcm2_tsne", itemFiles)]
+for (i in 1:length(itemFiles)) {
+  # - load categoryFile[i].tsv as data.frame
+  categoryName <- strsplit(itemFiles[i], ".", fixed = T)[[1]][1]
+  categoryName <- strsplit(categoryName, "_", fixed = T)[[1]][4]
+  categoryName <- gsub("([[:lower:]])([[:upper:]])", "\\1 \\2", categoryName)
+  ### --- FIX THIS (!!!)
+  if (categoryName == "Workof Art") {
+    categoryName <- "Work of Art"
+  }
+  ### --- FIX ABOVE (!!!)
+  categoryFile <- fread(itemFiles[i], data.table = F)
+  categoryFile$V1 <- NULL
+  categoryFile$category <- categoryName
+  colnames(categoryFile)[3] <- 'eu_project'
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  dbWriteTable(conn = con,
+               name = "wdcm2_projects_2dmaps",
+               value = categoryFile,
+               row.names = F,
+               append = T)
+  dbDisconnect(con)
+  rm(categoryFile); gc()
+}
+# - CREATE INDEX on wdcm2_projects_2dmaps
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+q <- "CREATE INDEX wdcm2_projects_2dmaps_category_euproject ON 
wdcm2_projects_2dmaps (category, eu_project);"
+res <- dbSendQuery(con, q)
+dbClearResult(res)
+dbDisconnect(con)
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_project_category_2dmap
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_project_category_2dmap: ", Sys.time(), sep = ""))
+
+# - check whether wdcm2_project_category_2dmap table exists:
+checkTable <- which(st %in% "wdcm2_project_category_2dmap")
+# - DROP wdcm2_project_category_2dmap if it exists
+if (length(checkTable) == 1) {
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- "DROP TABLE wdcm2_project_category_2dmap;"
+  res <- dbSendQuery(con, q)
+  dbClearResult(res)
+  dbDisconnect(con)
+}
+# - fetch wdcm2_project_category:
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+qCreate <- 'SELECT * FROM wdcm2_project_category;'
+res <- dbSendQuery(con, qCreate)
+wdcm2_project_category <- fetch(res, -1)
+dbClearResult(res)
+dbDisconnect(con)
+# - wrangle wdcm2_project_category for t-SNE:
+tsneData <- wdcm2_project_category[, 1:3]
+tsneData <- spread(tsneData,
+                   key = category,
+                   value = eu_count, 
+                   fill = 0)
+rownames(tsneData) <- tsneData$eu_project
+tsneData$eu_project <- NULL
+tsneData <- as.matrix(dist(tsneData, method = "euclidean"))
+projects <- rownames(tsneData)
+# - t-SNE 2D reduction:
+tsneData <- Rtsne(tsneData, theta = .5, is_distance = T)
+tsneData <- as.data.frame(tsneData$Y)
+tsneData$projects <- projects
+tsneData$projecttype <- projectType(tsneData$projects)
+colnames(tsneData)[1:2] <- c('D1', 'D2')
+# - populate wdcm2_project_category_2dmap
+con <- dbConnect(MySQL(),
+                 host = "tools.labsdb",
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+dbWriteTable(conn = con,
+             name = "wdcm2_project_category_2dmap",
+             value = tsneData,
+             row.names = F,
+             append = T)
+dbDisconnect(con)
+rm(wdcm2_project_category); rm(tsneData); gc()
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_category_project_2dmap
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_category_project_2dmap: ", Sys.time(), sep = ""))
+
+# - check whether wdcm2_category_project_2dmap table exists:
+checkTable <- which(st %in% "wdcm2_category_project_2dmap")
+# - DROP wdcm2_category_project_2dmap if it exists
+if (length(checkTable) == 1) {
+  con <- dbConnect(MySQL(), 
+                   host = "tools.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- "DROP TABLE wdcm2_category_project_2dmap;"
+  res <- dbSendQuery(con, q)
+  dbClearResult(res)
+  dbDisconnect(con)
+}
+# - fetch wdcm2_project_category:
+con <- dbConnect(MySQL(), 
+                 host = "tools.labsdb", 
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+qCreate <- 'SELECT * FROM wdcm2_project_category;'
+res <- dbSendQuery(con, qCreate)
+wdcm2_project_category <- fetch(res, -1)
+dbClearResult(res)
+dbDisconnect(con)
+# - wrangle wdcm2_project_category for t-SNE:
+tsneData <- wdcm2_project_category[, 1:3]
+tsneData <- spread(tsneData,
+                   key = category,
+                   value = eu_count, 
+                   fill = 0)
+rownames(tsneData) <- tsneData$eu_project
+tsneData$eu_project <- NULL
+tsneData <- as.matrix(dist(t(tsneData), method = "euclidean"))
+category <- rownames(tsneData)
+# - t-SNE 2D reduction:
+tsneData <- Rtsne(tsneData, theta = .5, is_distance = T, perplexity = 4)
+tsneData <- as.data.frame(tsneData$Y)
+tsneData$category <- category
+colnames(tsneData)[1:2] <- c('D1', 'D2')
+# - populate wdcm2_category_project_2dmap
+con <- dbConnect(MySQL(),
+                 host = "tools.labsdb",
+                 defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                 dbname = "u16664__wdcm_p",
+                 user = mySQLCreds$user,
+                 password = mySQLCreds$password)
+dbWriteTable(conn = con,
+             name = "wdcm2_category_project_2dmap",
+             value = tsneData,
+             row.names = F,
+             append = T)
+dbDisconnect(con)
+rm(wdcm2_project_category); rm(tsneData); gc()
+
+### ---------------------------------------------------------------------------
+### --- wdcm2_itemtopic_ tables
+### ---------------------------------------------------------------------------
+
+# - to nohup.out
+print(paste("wdcm2_itemtopic_ tables: ", Sys.time(), sep = ""))
+
+lF <- list.files()
+lF <- lF[grepl("wdcm2_itemtopic_", lF, fixed = T)]
+for (i in 1:length(lF)) {
+  # - check whether the current wdcm2_itemtopic_ table exists:
+  tName <- gsub(".csv", "", lF[i])
+  checkTable <- which(st %in% tName)
+  # - DROP current wdcm2_itemtopic_ table if it exists
+  if (length(checkTable) == 1) {
+    con <- dbConnect(MySQL(), 
+                     host = "tools.labsdb", 
+                     defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                     dbname = "u16664__wdcm_p",
+                     user = mySQLCreds$user,
+                     password = mySQLCreds$password)
+    q <- paste("DROP TABLE ", tName, ";", sep = "")
+    res <- dbSendQuery(con, q)
+    dbClearResult(res)
+    dbDisconnect(con)
+  }
+  # - populate current wdcm2_itemtopic_
+  cTable <- fread(lF[i])
+  colnames(cTable)[1] <- 'eu_entity_id'
+  # - get item labels for current wdcm2_itemtopic_
+  items <- gsub("Q", "", unique(cTable$eu_entity_id), fixed = T)
+  itemList <- paste0(items, collapse = ", ", sep = "")
+  # - fetch English labels where available from wb_terms
+  con <- dbConnect(MySQL(), 
+                   host = "wikidatawiki.labsdb", 
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "wikidatawiki_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  q <- paste("SELECT term_entity_id, term_text FROM wb_terms ",
+             "WHERE ((term_entity_id IN (", itemList, ")) AND ",
+             "(term_language = 'en') AND ",
+             "(term_entity_type = 'item') AND (term_type = 'label'));", 
+             sep = "");
+  rm(itemList)
+  res <- dbSendQuery(con, q)
+  itemLabels <- fetch(res, -1)
+  dbClearResult(res)
+  dbDisconnect(con)
+  itemLabels$term_entity_id <- paste("Q", itemLabels$term_entity_id, sep = "")
+  colnames(itemLabels) <- c('eu_entity_id', 'eu_label')
+  # - insert labels to current wdcm2_itemtopic_
+  cTable <- left_join(cTable, itemLabels, by = 'eu_entity_id')
+  # - recognize missing English labels and replace w. the respective 
eu_entity_id value
+  cTable$eu_label[is.na(cTable$eu_label)] <- 
+    cTable$eu_entity_id[is.na(cTable$eu_label)]
+  rm(itemLabels); gc()
+  # - write current wdcm2_itemtopic_
+  con <- dbConnect(MySQL(),
+                   host = "tools.labsdb",
+                   defult.file = 
"/home/goransm/mySQL_Credentials/replica.my.cnf",
+                   dbname = "u16664__wdcm_p",
+                   user = mySQLCreds$user,
+                   password = mySQLCreds$password)
+  dbWriteTable(conn = con,
+               name = tName,
+               value = cTable,
+               row.names = F,
+               append = F)
+  dbDisconnect(con)
+}
+
+### --- copy toLabsReport.csv to all WDCM Dashboards:
+# - to nohup.out
+print(paste("copy toLabsReport.csv to all WDCM Dashboards: ", Sys.time(), sep 
= ""))
+
+# - Overview Dashboard Update File
+system('sudo cp /home/goransm/WMDE/WDCM/WDCM_SystemUpdate/toLabsReport.csv 
/srv/shiny-server/WDCM_OverviewDashboard/update/', 
+       wait = F)
+# - Usage Dashboard Update File
+system('sudo cp /home/goransm/WMDE/WDCM/WDCM_SystemUpdate/toLabsReport.csv 
/srv/shiny-server/WDCM_UsageDashboard/update/', 
+       wait = F)
+# - Semantics Dashboard Update File
+system('sudo cp /home/goransm/WMDE/WDCM/WDCM_SystemUpdate/toLabsReport.csv 
/srv/shiny-server/WDCM_SemanticsDashboard/update/', 
+       wait = F)
+# - Semantics Dashboard Data Sets
+lF <- list.files()
+lF <- lF[grepl("wdcm2_projecttopic_", lF, fixed = T)]
+for (i in 1:length(lF)) {
+  system(paste('sudo cp ', inputDir, 
+               lF[i], 
+               ' /srv/shiny-server/WDCM_SemanticsDashboard/data/',
+               sep = ''),
+         wait = F)
+}
+lF <- list.files()
+lF <- lF[grepl("wdcm2_visNetworkNodes_project", lF, fixed = T)]
+for (i in 1:length(lF)) {
+  system(paste('sudo cp ', inputDir, 
+               lF[i], 
+               ' /srv/shiny-server/WDCM_SemanticsDashboard/data/',
+               sep = ''),
+         wait = F)
+}
+lF <- list.files()
+lF <- lF[grepl("wdcm2_visNetworkEdges_project", lF, fixed = T)]
+for (i in 1:length(lF)) {
+  system(paste('sudo cp ', inputDir, 
+               lF[i], 
+               ' /srv/shiny-server/WDCM_SemanticsDashboard/data/',
+               sep = ''),
+         wait = F)
+}
+
+### --- Report: Update completed
+# - to nohup.out
+print(paste("WDCM Dashboards Report Completed: ", Sys.time(), sep = ""))
+
+
+
diff --git a/WDCM_Update_Labs.R b/WDCM_Update_Labs.R
new file mode 100644
index 0000000..a0bbfaf
--- /dev/null
+++ b/WDCM_Update_Labs.R
@@ -0,0 +1,247 @@
+### ---------------------------------------------------------------------------
+### --- WDCM Process Module, v. Beta 0.1
+### --- Script: WDCM_Update_Labs.R, v. Beta 0.1
+### ---------------------------------------------------------------------------
+### --- DESCRIPTION:
+### --- WDCM_Update_Labs.R runs on cron from wikidataconcepts.eqiad.wmflabs
+### --- periodically checking the changes in /srv/published-datasets/wdcm
+### --- on stat1005 by accessing https://analytics.wikimedia.org/datasets/wdcm/
+### --- onto which the /srv/published-datasets/wdcm from stat1005
+### --- is mapped.
+### --- This script will check the timestamp in toLabsReport.csv
+### --- and compare it to a locally stored version.
+### --- If the timestamp found in production is newer, the script
+### --- will copy all files from https://analytics.wikimedia.org/datasets/wdcm/
+### --- to: /home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_DataIN_ClientUsage_v2
+### --- and update its local timestamp.
+### --- Once the timestamp is updated and the files copied, the script will 
run 
+### --- nohup /home/goransm/WMDE/WDCM/WDCM_RScripts &
+### --- to update the WDCM Dashboards.
+### ---------------------------------------------------------------------------
+
+### ---------------------------------------------------------------------------
+### --- LICENSE:
+### ---------------------------------------------------------------------------
+### --- GPL v2
+### --- This file is part of Wikidata Concepts Monitor (WDCM)
+### ---
+### --- WDCM is free software: you can redistribute it and/or modify
+### --- it under the terms of the GNU General Public License as published by
+### --- the Free Software Foundation, either version 2 of the License, or
+### --- (at your option) any later version.
+### ---
+### --- WDCM is distributed in the hope that it will be useful,
+### --- but WITHOUT ANY WARRANTY; without even the implied warranty of
+### --- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+### --- GNU General Public License for more details.
+### ---
+### --- You should have received a copy of the GNU General Public License
+### --- along with WDCM. If not, see <http://www.gnu.org/licenses/>.
+### ---------------------------------------------------------------------------
+
+### --- Setup
+library(httr)
+library(stringr)
+
+### ---------------------------------------------------------------------------
+### --- 1. WDCM Dashboards: General Update
+### --- affected dashboards: Overview, Usage, Semantics
+### ---------------------------------------------------------------------------
+
+### --- Check update timestamp in production:
+### --- https://analytics.wikimedia.org/datasets/wdcm/toLabsReport.csv
+download.file('https://analytics.wikimedia.org/datasets/wdcm/toLabsReport.csv',
+              destfile = 
'/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/timestampProducton.csv',
+              quiet = T)
+timestampProduction <- 
read.csv('/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/timestampProducton.csv',
+                               header = T,
+                               check.names = F,
+                               stringsAsFactors = F, 
+                               row.names = 1)
+tProd <- as.POSIXct(timestampProduction$timeStamp, format = "%Y-%m-%d 
%H:%M:%S")
+
+### --- Check local timestamp on Labs:
+timestampLocal <- 
read.csv('/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/timestampLocal.csv',
+                           header = T,
+                           check.names = F,
+                           stringsAsFactors = F,
+                           row.names = 1)
+tLoc <- as.POSIXct(timestampLocal$timeStamp, format = "%Y-%m-%d %H:%M:%S")
+
+
+### --- Compare with the current local timestamp
+### --- and update if necessary:
+if (tLoc < tProd) {
+  
+  # - update local timestamp:
+  timestampLocal$timeStamp <- timestampProduction$timeStamp
+  write.csv(timestampLocal, file = 
'/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/timestampLocal.csv')
+  
+  # - list files:
+  url <- 'https://analytics.wikimedia.org/datasets/wdcm/'
+  page <- as.character(GET(url))
+  links <- str_extract_all(page, "<a href=.+>.+</a>")
+  links <- sapply(links, function(x) {str_extract_all(x, ">.+<")})
+  links <- sapply(links, function(x) {gsub('^>|"|<$|>|<', "", x)})
+  links <- links[3:length(links)]
+  
+  # - clean up:
+  wREADME <- which(grepl("README", links))
+  links <- links[-wREADME]
+  wtoLabsReport <- which(grepl("toLabs", links))
+  links <- links[-wtoLabsReport]
+  wWDCM_MainReport <- which(grepl("WDCM_MainReport", links))
+  links <- links[-wWDCM_MainReport]
+  
+  # - clear output dir:
+  setwd('/home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_DataIN_ClientUsage_v2')
+  lF <- list.files()
+  rmF <- file.remove(lF)
+  
+  # - Download files to: 
/home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_DataIN_ClientUsage_v2
+  destDir <- '/home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_DataIN_ClientUsage_v2/'
+  for (i in 1:length(links)) {
+    # - to nohup:
+    print(paste("Downloading file: ", links[i], ": ", i, "/", length(links), 
sep = ""))
+    download.file(url = paste(url, links[i], sep = ""), 
+                  destfile = paste(destDir, links[i], sep = ""), 
+                  quiet = T)
+  }
+  
+  ### --- fetch toLabsReport.csv and place to:
+  ### --- /home/goransm/WMDE/WDCM/WDCM_SystemUpdate/
+  
download.file('https://analytics.wikimedia.org/datasets/wdcm/toLabsReport.csv',
+                destfile = 
'/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/toLabsReport.csv',
+                quiet = T)
+  
+  # - set updateDir
+  updateDir <- '/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/'
+  setwd(updateDir)
+  
+  ### --- start WDCM Dashboards Update:
+  # - remove previous runtime report:
+  file.remove('WDCM_Update_Process_nohupReport.txt')
+  nohupReport <- system(command = 'nohup Rscript 
/home/goransm/WMDE/WDCM/WDCM_RScripts/WDCM_Process.R', intern = T)
+  write(nohupReport, 'WDCM_Update_Process_nohupReport.txt')
+  
+  ### --- Update Report:
+  lF <- list.files()
+  if ('WDCM_SystemUpdateReport.csv' %in% lF) {
+    updateReport <- read.csv('WDCM_SystemUpdateReport.csv',
+                             header = T,
+                             row.names = 1,
+                             check.names = F,
+                             stringsAsFactors = F)
+    newReport <- data.frame(Step = 'WDCM Update',
+                            Time = as.character(Sys.time()),
+                            stringsAsFactors = F)
+    updateReport <- rbind(updateReport, newReport)
+    write.csv(updateReport, 'WDCM_SystemUpdateReport.csv')
+  } else {
+    newReport <- data.frame(Step = 'WDCM Update',
+                            Time = as.character(Sys.time()),
+                            stringsAsFactors = F)
+    write.csv(newReport, 'WDCM_SystemUpdateReport.csv')
+  }
+
+}
+
+### ---------------------------------------------------------------------------
+### --- 2. WDCM GeoDashboard Update
+### --- affected dashboards: Overview, Usage, Semantics
+### ---------------------------------------------------------------------------
+
+### --- Check update timestamp in production:
+### --- https://analytics.wikimedia.org/datasets/wdcm/toLabsReport.csv
+download.file('https://analytics.wikimedia.org/datasets/wdcm/toLabsGeoReport.csv',
+              destfile = 
'/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/timestampGeoProduction.csv',
+              quiet = T)
+timestampProduction <- 
read.csv('/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/timestampGeoProduction.csv',
+                                header = T,
+                                check.names = F,
+                                stringsAsFactors = F, 
+                                row.names = 1)
+tProd <- as.POSIXct(timestampProduction$timeStamp, format = "%Y-%m-%d 
%H:%M:%S")
+
+### --- Check local timestamp on Labs:
+timestampLocal <- 
read.csv('/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/timestampGeoLocal.csv',
+                           header = T,
+                           check.names = F,
+                           stringsAsFactors = F,
+                           row.names = 1)
+tLoc <- as.POSIXct(timestampLocal$timeStamp, format = "%Y-%m-%d %H:%M:%S")
+
+### --- Compare with the current local timestamp
+### --- and update if necessary:
+if (tLoc < tProd) {
+  
+  # - update local timestamp:
+  timestampLocal$timeStamp <- timestampProduction$timeStamp
+  write.csv(timestampLocal, file = 
'/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/timestampGeoLocal.csv')
+  
+  # - list files:
+  url <- 'https://analytics.wikimedia.org/datasets/wdcm/'
+  page <- as.character(GET(url))
+  links <- str_extract_all(page, "<a href=.+>.+</a>")
+  links <- sapply(links, function(x) {str_extract_all(x, ">.+<")})
+  links <- sapply(links, function(x) {gsub('^>|"|<$|>|<', "", x)})
+  links <- links[3:length(links)]
+  links <- links[which(grepl("^wdcm_geoitem_", links))]
+  
+  # - Download files to: 
/home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_DataIN_ClientUsage_v2
+  destDir <- '/home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_DataIN_ClientUsage_v2/'
+  for (i in 1:length(links)) {
+    # - to nohup:
+    print(paste("GeoDashboard - Downloading file: ", links[i], ": ", i, "/", 
length(links), sep = ""))
+    download.file(url = paste(url, links[i], sep = ""), 
+                  destfile = paste(destDir, links[i], sep = ""), 
+                  quiet = T)
+  }
+  
+  ### --- fetch toLabsReport.csv and place to:
+  ### --- /home/goransm/WMDE/WDCM/WDCM_SystemUpdate/
+  
download.file('https://analytics.wikimedia.org/datasets/wdcm/toLabsGeoReport.csv',
+                destfile = 
'/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/toLabsGeoReport.csv',
+                quiet = T)
+  
+  # - Copy GeoDashboard Data Sets to Dashboard /srv
+  setwd(destDir)
+  lF <- list.files()
+  lF <- lF[grepl("^wdcm_geoitem_", lF)]
+  for (i in 1:length(lF)) {
+    system(paste('sudo cp ', destDir, 
+                 lF[i], 
+                 ' /srv/shiny-server/WDCM_GeoDashboard/data/',
+                 sep = ''),
+           wait = F)
+  }
+  
+  ### --- Copy toLabsGeoReport to Dashboard /srv
+  system('sudo cp 
/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/toLabsGeoReport.csv 
/srv/shiny-server/WDCM_GeoDashboard/update/', 
+         wait = F)
+
+  ### --- Update Report:
+  # - set updateDir
+  updateDir <- '/home/goransm/WMDE/WDCM/WDCM_SystemUpdate/'
+  setwd(updateDir)
+  lF <- list.files()
+  if ('WDCM_SystemUpdateReport.csv' %in% lF) {
+    updateReport <- read.csv('WDCM_SystemUpdateReport.csv',
+                             header = T,
+                             row.names = 1,
+                             check.names = F,
+                             stringsAsFactors = F)
+    newReport <- data.frame(Step = 'WDCM Geo Update',
+                            Time = as.character(Sys.time()),
+                            stringsAsFactors = F)
+    updateReport <- rbind(updateReport, newReport)
+    write.csv(updateReport, 'WDCM_SystemUpdateReport.csv')
+  } else {
+    newReport <- data.frame(Step = 'WDCM Geo Update',
+                            Time = as.character(Sys.time()),
+                            stringsAsFactors = F)
+    write.csv(newReport, 'WDCM_SystemUpdateReport.csv')
+  }
+  
+}
+

-- 
To view, visit https://gerrit.wikimedia.org/r/398692
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I1cfad83ed71be0e046205a5cff09987df0a8ab7f
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wmde/WDCM
Gerrit-Branch: master
Gerrit-Owner: GoranSMilovanovic <goran.milovanovic_...@wikimedia.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to