FYI That works for me, but maybe this is also of interest for others, so I wonder if somebody of the Bioc annotation/experiment team (Sonali, Valerie, Martin?) could update this accordingly for ExperimentHub?
Best, Ludwig -- Dr. Ludwig Geistlinger Lehr- und Forschungseinheit für Bioinformatik Institut für Informatik Ludwig-Maximilians-Universität München Amalienstrasse 17, 2. Stock, Büro A201 80333 München Tel.: 089-2180-4067 eMail: ludwig.geistlin...@bio.ifi.lmu.de > Hi Ludwig, > > In november I sent the updated recipe to Martin, but I think it was not > updated yet. > > Anyway, you can do it yourself with the code here below: > > library("GEOquery") > library("Biobase") > > suppl <- GEOquery::getGEOSuppFiles("GSE62944") > > setwd("GSE62944") > > clinvar <- > > read.delim("GSE62944_06_01_15_TCGA_24_548_Clinical_Variables_9264_Samples.txt.gz") > clinvar2 <- t(clinvar) > > # add variable names > colnames(clinvar2) <- clinvar2[1,] > # and remove the 2nd abbreviation, with the CDE_ID too > clinvar3 <- clinvar2[-c(1:3),] > > # substitute dots with dashes in the ids, to be consistent with > previous object > clinvar4 <- clinvar3 > rownames(clinvar4) <- gsub("\\.","-",rownames(clinvar3)) > clinvar4 <- as.data.frame(clinvar4) > > CancerType <- > read.delim("GSE62944_06_01_15_TCGA_24_CancerType_Samples.txt.gz", > header=FALSE, colClasses=c("character", "factor"), > col.names=c("sample", "type")) > idx <- match(rownames(clinvar4), CancerType$sample) > # these are already nicely sorted > clinvar4$CancerType <- CancerType$type[idx] > > > countFile <- > "GSM1536837_06_01_15_TCGA_24.tumor_Rsubread_FeatureCounts.txt.gz" > untar("GSE62944_RAW.tar", countFile) > > counts <- local({ > data <- scan(countFile, what=character(), sep="\t", quote="") > m <- matrix(data, 9265) > dimnames(m) <- list(m[,1], m[1,]) > m <- t(m[-1, -1]) > mode(m) <- "integer" > m > }) > > # just to be sure > gplots::venn(list(colnames(counts),rownames(cl4))) # they are all > there, but not correctly sorted > head(colnames(counts)) > head(rownames(clinvar4)) > > # re-sorting according to the counts object > cl5 <- > clinvar4[rownames(clinvar4)[match(colnames(counts),rownames(clinvar4))],] > head(rownames(cl5),20) > head(colnames(counts),20) > > # as in your example > eset_new <- Biobase::ExpressionSet(counts, AnnotatedDataFrame(cl5)) > > # or as SummarizedExperiment > library("GenomicRanges") > se <- SummarizedExperiment(assays=list(counts)) > colData(se) <- S4Vectors::DataFrame(cl5) > > # data exploration to see how samples are related to each other > library("DESeq2") > ddsTCGA <- DESeqDataSet(se,design=~CancerType) > > ddsTCGA <- estimateSizeFactors(ddsTCGA) > log2tcga <- log2(1+counts(ddsTCGA,normalized=TRUE)) > se_log2tcga <- SummarizedExperiment(assays=list(log2tcga)) > colData(se_log2tcga) <- colData(ddsTCGA) # the rlog transform takes > very long time, so just a quick and dirty check > > pca_d4 <- function (x, intgroup = "condition", ntop = 500, > returnData = FALSE,title=NULL, > pcX = 1, pcY = 2,text_labels=TRUE,point_size=3) > # customized principal components > { > library("DESeq2") > library("genefilter") > library("ggplot2") > rv <- rowVars(assay(x)) > select <- order(rv, decreasing = > TRUE)[seq_len(min(ntop,length(rv)))] > pca <- prcomp(t(assay(x)[select, ])) > percentVar <- pca$sdev^2/sum(pca$sdev^2) > > intgroup.df <- as.data.frame(colData(x)[, intgroup, drop = FALSE]) > group <- factor(apply(intgroup.df, 1, paste, collapse = " : ")) > d <- data.frame(PC1 = pca$x[, pcX], PC2 = pca$x[, pcY], group = > group, > intgroup.df, names = colnames(x)) > colnames(d)[1] <- paste0("PC",pcX) > colnames(d)[2] <- paste0("PC",pcY) > if (returnData) { > attr(d, "percentVar") <- percentVar[1:2] > return(d) > } > # clever way of positioning the labels > d$hjust = ifelse((sign(d[,paste0("PC",pcX)])==1),0.9,0.1)# (1 + > varname.adjust * sign(PC1))/2) > g <- ggplot(data = d, aes_string(x = paste0("PC",pcX), y = > paste0("PC",pcY), color = "group")) + > geom_point(size = point_size) + > xlab(paste0("PC",pcX,": ", round(percentVar[pcX] * 100,digits = > 2), "% variance")) + > ylab(paste0("PC",pcY,": ", round(percentVar[pcY] * 100,digits = > 2), "% variance")) > if(text_labels) g <- g + geom_text(mapping = > aes(label=names,hjust=hjust, vjust=-0.5), show.legend = F) > if(!is.null(title)) g <- g + ggtitle(title) > g > } > > pdf("allTCGA_diy.pdf",height=30,width=30) > pca_d4(se_log2tcga,intgroup="CancerType",text_labels = FALSE) > dev.off() > > If nothing changed in the data structure over there it should be > correctly reproduced. Apologize if something indeed should *not work*, I > did not touch the code that much afterwards. > But for a starter, it might be covering your need. > > If you have any suggestion or improvement, I'm all ears ;) > > Best, > > Federico > > -- > Federico Marini, M.Sc. > Institute of Medical Biostatistics, Epidemiology and Informatics (IMBEI) > Division Biostatistics and Bioinformatics > University Medical Center of the Johannes Gutenberg University Mainz > Postal address: 55101 Mainz > Office and delivery address: Obere Zahlbacher Str. 69, 55131 Mainz > > Phone: +49 (0) 6131 17-8108 > Fax: +49 (0) 6131 17-2968 > Web: www.imbei.uni-mainz.de > E-Mail: mari...@uni-mainz.de > > _______________________________________________ Bioc-devel@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/bioc-devel