GoranSMilovanovic has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/386121 )
Change subject: Semantics Dashboard ...................................................................... Semantics Dashboard Change-Id: I53b8d162a3e729f388992efdb3d5358ab6646565 --- M WDCM_SemanticsDashboard/server.R M WDCM_SemanticsDashboard/ui.R A WDCM_ShinyServerFrontPage/SemanticsDashboard.png M WDCM_ShinyServerFrontPage/wdcm_ShinyFront.html 4 files changed, 398 insertions(+), 20 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/wmde/WDCM refs/changes/21/386121/1 diff --git a/WDCM_SemanticsDashboard/server.R b/WDCM_SemanticsDashboard/server.R index 1ebb4a8..f07ae5b 100644 --- a/WDCM_SemanticsDashboard/server.R +++ b/WDCM_SemanticsDashboard/server.R @@ -57,6 +57,9 @@ res <- dbSendQuery(con, q) dbClearResult(res) +### --- itemTopicTables +itemTopicTables <- st$tables[which(grepl("wdcm2_itemtopic_", st$tables, fixed = T))] + ### --- fetch wdcm2_project q <- "SELECT * FROM wdcm2_project;" res <- dbSendQuery(con, q) @@ -96,7 +99,7 @@ lF <- lF[grepl("wdcm2_projecttopic_", lF, fixed = T)] projectTopic <- vector(mode = "list", length = length(lF)) for (i in 1:length(lF)) { - projectTopic[[i]] <- fread(lF[i]) + projectTopic[[i]] <- fread(lF[i], data.table = F) } names(projectTopic) <- sapply(lF, function(x) { strsplit(strsplit(x, split = ".", fixed = T)[[1]][1], @@ -157,6 +160,294 @@ ### --- shinyServer shinyServer(function(input, output, session) { + ### ------------------------------------------ + ### --- TAB: tabPanel Semantic Models + ### ------------------------------------------ + + ### --- SELECT: update select 'selectCategory' + updateSelectizeInput(session, + 'selectCategory', + "Select Semantic Category:", + choices = categories, + selected = categories[round(runif(1, 1, length(categories)))], + server = TRUE) + + ### --- REACTIVE: category specific wdcm_itemtopic data.frame + itemTopicsNum <- reactive({ + sC <- gsub(" ", "", input$selectCategory, fixed = T) + sTable <- itemTopicTables[which(grepl(sC, itemTopicTables, fixed = T))] + ### -- Connect + con <- dbConnect(MySQL(), + host = "tools.labsdb", + defult.file = "/home/goransm/mySQL_Credentials/replica.my.cnf", + dbname = "u16664__wdcm_p", + user = mySQLCreds$user, + password = mySQLCreds$password) + ### --- check the particular table + q <- paste("DESCRIBE ", sTable, ";", sep = "") + res <- dbSendQuery(con, q) + sIT <- fetch(res, -1) + dbClearResult(res) + ### --- Disconnect + dbDisconnect(con) + sum(grepl("topic", sIT$Field)) + }) + + ### --- SELECT: updateSelectizeInput 'selectCatTopic' + output$selectCatTopic <- + renderUI({ + if ((is.null(input$selectCategory)) | (length(input$selectCategory) == 0)) { + selectInput(inputId = "selectCategoryTopic", + label = "Select Semantic Topic:", + choices = NULL, + selected = NULL) + } else { + cH <- paste("Topic", 1:itemTopicsNum(), sep = " ") + selectInput(inputId = "selectCategoryTopic", + label = "Select Semantic Topic:", + choices = cH, + selected = cH[1]) + } + }) + + ### --- REACTIVE current itemTopic table: + itemTopic <- reactive({ + sC <- gsub(" ", "", input$selectCategory, fixed = T) + sTable <- itemTopicTables[which(grepl(sC, itemTopicTables, fixed = T))] + cTopic <- tolower(gsub(" ", "", input$selectCategoryTopic)) + if (!length(cTopic) == 0) { + ### -- Connect + con <- dbConnect(MySQL(), + host = "tools.labsdb", + defult.file = "/home/goransm/mySQL_Credentials/replica.my.cnf", + dbname = "u16664__wdcm_p", + user = mySQLCreds$user, + password = mySQLCreds$password) + ### --- check the particular table + q <- 'SET CHARACTER SET utf8;' + res <- dbSendQuery(con, q) + q <- paste("SELECT * FROM ", sTable, " ORDER BY ", cTopic, " DESC LIMIT 50;", sep = "") + res <- dbSendQuery(con, q) + iT <- fetch(res, -1) + dbClearResult(res) + ### --- Disconnect + dbDisconnect(con) + ### --- Output: + return(iT) + } else {return(NULL)} + }) + + ### --- OUTPUT output$topItemsTopic + output$topItemsTopic <- renderPlot({ + if (!is.null(itemTopic())) { + cTopic <- tolower(gsub(" ", "", input$selectCategoryTopic)) + plotFrame <- itemTopic() + plotFrame <- select(plotFrame, + eu_label, eu_entity_id, cTopic) + colnames(plotFrame) <- c('Label', 'Id', 'Probability') + plotFrame$Label <- paste(1:dim(plotFrame)[1], ". ", plotFrame$Label, sep = "") + plotFrame$Label <- factor(plotFrame$Label, + levels = plotFrame$Label[order(plotFrame$Probability)]) + plotFrame$Sign <- paste("(", 1:dim(plotFrame)[1], ") ", plotFrame$Id, sep = "") + ggplot(plotFrame, aes(x = Probability, + y = Label, + label = Sign)) + + geom_line(size = .25, color = "#4c8cff", group = 1) + + geom_point(size = 1.5, color = "#4c8cff") + + geom_point(size = 1, color = "white") + + geom_label_repel(size = 3, segment.size = .25, show.legend = FALSE) + + ylab("Items Labels") + xlab("Item Importance\n(Item Probability in Topic)") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, size = 12, hjust = 1)) + + theme(axis.text.y = element_text(size = 12, hjust = 1)) + + theme(axis.title.x = element_text(size = 12)) + + theme(axis.title.y = element_text(size = 12)) %>% + withProgress(message = 'Generating plot', + min = 0, + max = 1, + value = 1, {incProgress(amount = 0)}) + } else {return(NULL)} + }) + + # - output$networkItemsTopic + output$networkItemsTopic <- renderVisNetwork({ + + if (!is.null(itemTopic())) { + # - normalization: Luce's choice axiom + itemNames <- itemTopic()$eu_label + root <- select(itemTopic(), starts_with('topic')) + root <- as.matrix(parDist(as.matrix(root), method = "euclidean")) + rownames(root) <- itemNames + colnames(root) <- itemNames + indexMinDist <- sapply(rownames(root), function(x) { + w <- which(rownames(root) %in% x) + y <- sort(root[w, -w], decreasing = T) + names(y)[length(y)] + }) + id <- 1:length(colnames(root)) + label <- colnames(root) + nodes <- data.frame(id = id, + label = label, + stringsAsFactors = F) + conceptsStruct <- data.frame(from = names(indexMinDist), + to = unname(indexMinDist), + stringsAsFactors = F) + conceptsStruct$from <- sapply(conceptsStruct$from, function(x) { + nodes$id[which(nodes$label %in% x)] + }) + conceptsStruct$to <- sapply(conceptsStruct$to, function(x) { + nodes$id[which(nodes$label %in% x)] + }) + conceptsStruct$arrows <- rep("to", length(conceptsStruct$to)) + visNetwork(nodes = nodes, + edges = conceptsStruct, + width = "100%", + height = "100%") %>% + visEvents(type = "once", + startStabilizing = "function() {this.moveTo({scale:0.65})}") %>% + visPhysics(stabilization = FALSE) %>% + withProgress(message = 'Generating plot', + min = 0, + max = 1, + value = 1, {incProgress(amount = 0)}) + } else {return(NULL)} + }) + + ### --- REACTIVE current pTopic data.frame: + pTopic <- reactive({ + w <- which(names(projectTopic) %in% gsub(" ", "", input$selectCategory, fixed = T)) + if (!length(w) == 0) { + pTopic <- as.data.frame(projectTopic[[w]]) + cTopic <- which(colnames(pTopic) %in% tolower(gsub(" ", "", input$selectCategoryTopic))) + if (!length(cTopic) == 0) { + pTopic <- pTopic %>% + select(cTopic, project, projecttype) %>% + arrange(desc(pTopic[, cTopic])) + pTopic <- pTopic[1:50, ] + if (sum(is.na(pTopic$project)) > 0) { + pTopic <- pTopic[-which(is.na(pTopic$project)), ] + } + colnames(pTopic) <- c('Probability', 'Project', 'Project Type') + pTopic$Label <- paste(1:dim(pTopic)[1], ". ", pTopic$Project, sep = "") + pTopic$Label <- factor(pTopic$Label, + levels = pTopic$Label[order(pTopic$Probability)]) + return(pTopic) + } else {return(NULL)} + } else {return(NULL)} + }) + + ### --- OUTPUT output$topProjectsTopic + output$topProjectsTopic <- renderPlot({ + if (!is.null(pTopic())) { + ggplot(pTopic(), aes(x = Probability, + y = Label, + label = Label)) + + geom_line(size = .25, color = "#4c8cff", group = 1) + + geom_point(size = 1.5, color = "#4c8cff") + + geom_point(size = 1, color = "white") + + geom_label_repel(size = 3, segment.size = .25, show.legend = FALSE) + + ylab("Projects") + xlab("Topic Importance in Project\n(Topic Probability in Project)") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, size = 12, hjust = 1)) + + theme(axis.text.y = element_text(size = 12, hjust = 1)) + + theme(axis.title.x = element_text(size = 12)) + + theme(axis.title.y = element_text(size = 12)) %>% + withProgress(message = 'Generating plot', + min = 0, + max = 1, + value = 1, {incProgress(amount = 0)}) + } else {return(NULL)} + }) + + ### ------------------------------------------ + ### --- TAB: tabPanel Projects + ### ------------------------------------------ + + ### --- SELECT: update select 'selectProject' + updateSelectizeInput(session, + 'selectProject', + choices = c(projects, paste("_", projectTypes, sep = "")), + selected = "_Wikipedia", + server = TRUE) + + ### --- REACTIVE: selectedProjects + selectedProjects <- reactive({ + ### --- selected projects: + if (!is.null(input$selectProject)) { + wUnzip <- which(names(unzip_projectTypes) %in% input$selectProject) + if (length(wUnzip > 0)) { + selectedProjects <- unname(do.call(c, unzip_projectTypes[wUnzip])) + } + wSel <- which(projects %in% input$selectProject) + if (length(wSel > 0)) { + selectedProjects <- c(selectedProjects, projects[wSel]) + } + selectedProjects <- unique(selectedProjects) + return(selectedProjects) + } else {return(NULL)} + }) + + ### --- OBSERVE: input$applySelection + observeEvent(input$applySelection, { + + #### --- Chart: projectTopicImportance + output$projectTopicImportance <- renderPlot({ + # - Plot Frame for projectTopicImportance + projList <- lapply(names(projectTopic), function(x) { + cCategory <- which(names(projectTopic) %in% x) + cProj <- projectTopic[[cCategory]] + if (sum(which(cProj$project %in% isolate(selectedProjects()))) == 0) { + return(NULL) + } else { + cProj <- cProj %>% + filter(project %in% isolate(selectedProjects())) %>% + select(starts_with("topic"), project) %>% + gather(key = Topic, + value = Probability, + starts_with('topic')) + catName <- gsub("([[:lower:]])([[:upper:]])", "\\1 \\2", names(projectTopic)[cCategory]) + # - FIX THIS: + catName <- gsub("Workof Art", "Work of Art", catName, fixed = T) + cProj$Category <- catName + cProj <- cProj %>% + select(Topic, Probability, Category) %>% + group_by(Category, Topic) %>% + summarise(Proportion = sum(Probability)) + cProj$Proportion <- round(cProj$Proportion/sum(cProj$Proportion)*100, 2) + return(cProj) + } + }) + wEl <- sapply(projList, function(x) { + !is.null(x) + }) + projList <- as.data.frame(rbindlist(projList[wEl])) + # - factor projList$Topic: + projList$Topic <- str_to_title(gsub("([[:alpha:]]+)", "\\1 ", projList$Topic)) + projList$Topic <- factor(projList$Topic, levels = unique(projList$Topic)) + # - visualize w. ggplot2 + ggplot(projList, + aes(x = Topic, + y = Proportion, + label = paste(Proportion, "%", sep = "")) + ) + + geom_bar(stat = "identity", width = .1, color = "#4c8cff", fill = "#4c8cff", group = 1) + + geom_label(size = 4) + + facet_wrap(~ Category, ncol = 3, scales = "free_y") + + xlab('Topic') + ylab('Topic Engagement (%)') + + scale_y_continuous(labels = comma) + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, size = 12, hjust = 1)) + + theme(axis.title.x = element_text(size = 12)) + + theme(axis.title.y = element_text(size = 12)) + + theme(strip.text = element_text(size = 13)) %>% + withProgress(message = 'Generating plot', + min = 0, + max = 1, + value = 1, {incProgress(amount = 0)}) + }) + + }, ignoreNULL = FALSE) + }) ### --- END shinyServer diff --git a/WDCM_SemanticsDashboard/ui.R b/WDCM_SemanticsDashboard/ui.R index 1eb01a6..e9df674 100644 --- a/WDCM_SemanticsDashboard/ui.R +++ b/WDCM_SemanticsDashboard/ui.R @@ -71,7 +71,7 @@ fluidRow( column(width = 6, br(), - HTML('<font size=2><b>Note:</b> This page provides an oportunity to study the WDCM semantic models. The WDCM organizes its knowledge of Wikidata usage + HTML('<font size=2>This page provides an oportunity to study the WDCM semantic models. The WDCM organizes its knowledge of Wikidata usage into <b>semantic categories</b> and currently uses 14 of them. Each semantic category encompasses a set of Wikidata items that match a particular intuitive, natural concept (e.g. "Human", "Geographical Object", "Event", etc).<br> The WDCM develops a <b>semantic topic model</b> (see: <a href = "https://en.wikipedia.org/wiki/Topic_model" target = "_blank">Topic Model</a>) for each @@ -79,13 +79,71 @@ Wikidata items from the respective semantic category in that topic. Here you can browse the semantic categories and inspect the structure of topics that are encompassed by the respective semantic model. You can also learn about the most important projects in a given category for a given topic from its semantic model.<br> - The Dashboard will initialize a random choice of <i>Category</i> and then a <i>Topic</i> from its semantic model. Use the drop-down menus to select - a category and a topic from its semantic model.</font>') + The Dashboard will initialize a random choice of <i>Category</i> and pick the first <i>Topic</i> from its semantic model. Use the drop-down menus to select + a category and a topic from its semantic model. Three outputs will be generated on this page: the Top 50 items chart, the topic similarity network, and the top 50 + projects in this topic chart (scroll down).</font>') ) ), - # - fluidRow: ValueBoxes + # - fluidRow: Selections fluidRow( - + br(), + column(width = 3, + selectizeInput("selectCategory", + "Select Item Category:", + multiple = F, + choices = NULL, + selected = NULL) + ), + column(width = 3, + uiOutput("selectCatTopic") + ) + ), + fluidRow( + column(width = 12, + hr(), + h4('Top 50 items in this topic'), + HTML('<font size="2">The chart represents the top 50 most important items in this topic. The importance of each item is given by its + probability of being generated by this particular semantic topic (horizontal axis). The items are ranked; the rank numbers next to the labels + on the vertical axis correspond to the rank numbers in parentheses next to data labels that show the item Wikidata IDs. + <i>There\'s a game that you can play here:</i> ask yourself what makes this 50 items go together, what makes them similar, what unifying principle + holds them together in the same semantic topic? Do not forget: it is not only about what you know about the World, but also about how our communities use + Wikidata on their respective projects.</font>'), + br(), br(), + withSpinner(plotOutput('topItemsTopic', + width = "100%", + height = "850px")) + ) + ), + fluidRow( + column(width = 12, + hr(), + h4('Topic similarity network'), + HTML('<font size="2">Each bubble represents one among the top 50 most important items in this semantic topic. Each item points towards the + the item to which it is most similar. Similarity between items is derived not only from item importances (i.e. probabilities) in this topic, but + from all topics that are encompasses by this category\'s semantic model. In interpreting the similarities, do not forget that game is not only + about what you know about the World, but also about how different communities use Wikidata. The more similarly the items are used across the + sister projects, the more likely they will group together in this network. You can drag the network and the nodes around and zoom in and out by + your mouse wheel.</font>'), + br(), br(), + withSpinner(visNetwork::visNetworkOutput('networkItemsTopic', height = 850)) + ) + ), + fluidRow( + column(width = 12, + hr(), + h4('Top 50 projects in this topic'), + HTML('<font size="2">To put it in a nutshell: here you can see what projects use the selected topic from the respective semantic category the most. + The chart represents the top 50 projects in respect to the prominence of the selected topic. In the WDCM topic models, + the usage pattern of any particular semantic category of Wikidata items, in a particular project, can be viewed as a mixture of semantic topics + from the respective category\'s semantic model. Thus, each project\'s usage pattern in a particular semantic category can be expressed + as a set of proportions up to which each topic contributes to it. The horizontal axis represents the proportion (e.g. the probability) of the + selected topic\'s presence in a particular project. Projects are found on the vertical axis, with the rank numbers corresponding to those near + the data points in the chart.</font>'), + br(), br(), + withSpinner(plotOutput('topProjectsTopic', + width = "100%", + height = "850px")) + ) ) ), # - tabPanel Semantic Models END @@ -96,28 +154,57 @@ fluidRow( column(width = 6, br(), - HTML('<font size = 2>Here you can make <b>selections</b> of client projects and semantic categories to learn about Wikidata - usage across them.<br> <b>Note:</b> You can search and add projects into the <i>Search projects</i> field by + HTML('<font size = 2>Here you can make a selection of projects and learn about the importance of all available semantic topics from + each semantic category in the project(s) of your choice. <b>Note:</b> You can search and add projects into the <i>Search projects</i> field by using (a) <b>project names</b> (e.g. <i>enwiki</i>, <i>dewiki</i>, <i>sawikiquote</i>, and similar or (b) by using <b>project types</b> that start with <b>"_"</b> (underscore, e.g. <i>_Wikipedia</i>, <i>_Wikisource</i>, <i>_Commons</i>, and similar; try typing anything into the Select projects field that starts with an underscore). Please note that by selecting a project type (again: <i>_Wikipedia</i>, <i>_Wikiquote</i>, and similar) you are selecting <b>all</b> client projects of the respective type, and that\'s potentially a lot of data. The Dashboard will pick unique - projects from whatever you have inserted into the Search projects field. The selection of projects will be intesected - with the selection of semantic categories from the Select categories field, and the obtained results will refer only - to the Wikidata items from the current selection of client projects <i>and</i> semantic categories. - In other words: <i>disjunction</i> operates inside the two search fields, while <i>conjunction</i> operates - across the two search fields.<br> <b>Note:</b> The Dashboard will initialize a choice of three project types - (<i>Wikipedia</i>, <i>Wikinews</i>, and <i>Wiktionary</i>) and a random choice of six semantic categories. All charts will present at - most 25 top projects in respect to the Wikidata usage and relative to the current selection; however, <b>complete - selection data sets</b> are available for download (<i>.csv</i>) beneath each chart.</font>'), + projects from whatever you have inserted into the Search projects field. <br> <b>Note:</b> The Dashboard will initialize with a choice of + all <i>Wikipedia</i> projects. Then you can make a selection of projects of your own and hit <i>Apply Selection</i> to obtain the result.</font>'), br(), br() ) ) ) ), + # - fluidRow: Selections fluidRow( - + br(), + column(width = 6, + selectizeInput("selectProject", + "Select Projects:", + multiple = T, + choices = NULL, + selected = NULL, + width = 800) + ) + ), + fluidRow( + column(width = 2, + actionButton('applySelection', + label = "Apply Selection", + width = '70%', + icon = icon("database", + class = NULL, + lib = "font-awesome") + ) + ) + ), + fluidRow( + column(width = 12, + hr(), + h4('Semantic Topics in Wikimedia Projects'), + HTML('<font size="2">The vertical axes represent the % of topic engagement in this particular selection of Wikimedia projects. <br> + <b>Note:</b> Please be remindided that semantic topics are <i>category-specific</i>: each category has its own semantic model, and each + semantic model encompasses a number of topics. To clarify: Topic 1 is not the same thing in two different categories. You can learn about + the content of any semantic topic in any of the semantic categories on the <i>Semantic Models</i> tab - and in fact that is what one should + do <i>before</i> any attempt to interpret the data that are provided here.</font>'), + br(), br(), + withSpinner(plotOutput('projectTopicImportance', + width = "100%", + height = "1000px")) + ) ) ) # - tabPanel Projects END diff --git a/WDCM_ShinyServerFrontPage/SemanticsDashboard.png b/WDCM_ShinyServerFrontPage/SemanticsDashboard.png new file mode 100644 index 0000000..159803b --- /dev/null +++ b/WDCM_ShinyServerFrontPage/SemanticsDashboard.png Binary files differ diff --git a/WDCM_ShinyServerFrontPage/wdcm_ShinyFront.html b/WDCM_ShinyServerFrontPage/wdcm_ShinyFront.html index d9368c4..fa93e8b 100644 --- a/WDCM_ShinyServerFrontPage/wdcm_ShinyFront.html +++ b/WDCM_ShinyServerFrontPage/wdcm_ShinyFront.html @@ -234,7 +234,7 @@ <div id="shiny"> - <h2>WDCM Dahsboards</h2> + <h2>WDCM Dashboards</h2> <hr> <a href = "http://wdcm.wmflabs.org/WDCM_OverviewDashboard/"><h4>WDCM Overview</h4></a> <a href = "http://wdcm.wmflabs.org/WDCM_OverviewDashboard/"><img src="OverviewDashboard.png" alt="WDCM Overview" style="width:300px;"></a> @@ -252,8 +252,8 @@ </div> <hr> - <a href = "http://wdcm.wmflabs.org/WDCM_UsageDashboard/"><h4>WDCM Semantics</h4></a> - <a href = "http://wdcm.wmflabs.org/WDCM_UsageDashboard/"><img src="UsageDashboard.png" alt="WDCM Usage" style="width:300px;"></a> + <a href = "http://wdcm.wmflabs.org/WDCM_SemanticsDashboard/"><h4>WDCM Semantics</h4></a> + <a href = "http://wdcm.wmflabs.org/WDCM_SemanticsDashboard/"><img src="SemanticsDashboard.png" alt="WDCM Semantics" style="width:300px;"></a> <br> <div class="caption"> The Semantics Dashboard provides an insight into the distributional -- To view, visit https://gerrit.wikimedia.org/r/386121 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I53b8d162a3e729f388992efdb3d5358ab6646565 Gerrit-PatchSet: 1 Gerrit-Project: analytics/wmde/WDCM Gerrit-Branch: master Gerrit-Owner: GoranSMilovanovic <goran.milovanovic_...@wikimedia.de> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits