WGCNA看上去是个高大上的分析方法,但在笔者看来,该分析缺少生物学背景的支持,更像是纯计算背景的人做出来的工具。虽然在分析流程的旁路上确实可以进行GO注释,但注释结果如何整合到最终的分析结果中去呢?或许只能机械地把分析结果和注释拼在一起,作为结果解释时的“一个注释”而已。因此,除非只是为了灌水发SCI论文,笔者不建议使用这个分析方法。当然,该方法的无标度网络分析(包括模块聚类)是值得借鉴的。
官方教程请参考:WGCNA tutorials
网上有个博客对代码进行了比较详细的解释,可参考:WGCNA分析,简单全面的最新教程
# Read the flowchart image from plots/flowchart.PNG and show it.
library(EBImage)
pic <- readImage("plots/flowchart.PNG")
display(pic)
library(WGCNA)
# This option matters for the rest of the script; do not drop it.
options(stringsAsFactors = FALSE)
# Female liver data set.
femData <- read.csv("data/LiverFemale3600.csv")
# Everything except the first 8 columns is transposed into datExpr0, so the
# remaining columns of femData become rows.
# NOTE(review): the first 8 columns are presumably probe annotation - confirm.
datExpr0 <- as.data.frame(t(femData[, -(1:8)]))
# Columns are labelled by femData$substanceBXH, rows by the retained column names.
colnames(datExpr0) <- femData$substanceBXH
rownames(datExpr0) <- colnames(femData)[-(1:8)]
# Screen the expression data for genes and samples with too many missing values.
gsg <- goodSamplesGenes(datExpr0, verbose = 3)
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK
## [1] TRUE
# If the screen flagged anything, report what is dropped and subset the data.
if (!gsg$allOK) {
  # Optional report: names of the genes (columns) about to be removed.
  if (sum(!gsg$goodGenes) > 0) {
    printFlush(paste("Removing genes:",
                     paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
  }
  # Optional report: names of the samples (rows) about to be removed.
  if (sum(!gsg$goodSamples) > 0) {
    printFlush(paste("Removing samples:",
                     paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))
  }
  # Keep only the samples and genes that passed.
  datExpr0 <- datExpr0[gsg$goodSamples, gsg$goodGenes]
}
# Average-linkage clustering of the samples (rows of datExpr0), used to look
# for outlying samples.
sampleTree <- hclust(dist(datExpr0), method = "average")
# To draw into a dedicated window or a PDF, enable one of the lines below and
# adjust the 12 x 9 inch size if it does not fit the screen.
# sizeGrWindow(12,9)
#pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6)
par(mar = c(0, 4, 2, 0))
plot(sampleTree,
     main = "Sample clustering to detect outliers",
     sub = "", xlab = "",
     cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
# Red line marks the height at which the tree is cut.
abline(h = 15, col = "red")
# Cut the tree at that same height; minSize = 10 is the minimum cluster size.
clust <- cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
table(clust)
## clust
## 0 1
## 1 134
# Cluster 1 holds the samples to keep.
keepSamples <- (clust == 1)
datExpr <- datExpr0[keepSamples, ]
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
traitData <- read.csv("data/ClinicalTraits.csv")
# Drop columns 31 and 16, which hold information that is not needed, then keep
# column 2 and columns 11 to 36 of what remains.
allTraits <- traitData[, -c(31, 16)][, c(2, 11:36)]
# Build a trait data frame whose rows line up with the rows of datExpr.
femaleSamples <- rownames(datExpr)
traitRows <- match(femaleSamples, allTraits$Mice)
# The first column supplies the row names, so it is left out of the data itself.
datTraits <- allTraits[traitRows, -1]
rownames(datTraits) <- allTraits[traitRows, 1]
collectGarbage()
# Cluster the retained samples again.
sampleTree2 <- hclust(dist(datExpr), method = "average")
# Colour-code the traits: white = low, red = high, grey = missing entry.
traitColors <- numbers2colors(datTraits, signed = FALSE)
# Dendrogram on top, one colour row per trait underneath.
plotDendroAndColors(
  sampleTree2, traitColors,
  groupLabels = names(datTraits),
  main = "Sample dendrogram and trait heatmap"
)
# Store the cleaned expression and trait data for the later parts of the analysis.
save(datExpr, datTraits, file = "data/FemaleLiver-01-dataInput.RData")
options(stringsAsFactors = FALSE)
# Turn on multi-threading inside WGCNA to speed up some calculations.
# At present the call is required for the code to work. An error here can be
# ignored, although it may be a sign that WGCNA should be updated.
# Caution: skip the call when running under RStudio or another third-party
# R environment.
enableWGCNAThreads()
## Allowing parallel execution with up to 7 working processes.
# Restore the data saved by the data-input part; lnames receives the names of
# the objects that were loaded.
lnames <- load(file = "data/FemaleLiver-01-dataInput.RData")
lnames
## [1] "datExpr" "datTraits"
# Candidate soft-thresholding powers: 1 to 10, then 12 to 20 in steps of 2.
powers <- c(1:10, seq(from = 12, to = 20, by = 2))
# Run the network topology analysis for every candidate power.
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 3600.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 3600 of 3600
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.0278 0.345 0.456 747.00 762.0000 1210.0
## 2 2 0.1260 -0.597 0.843 254.00 251.0000 574.0
## 3 3 0.3400 -1.030 0.972 111.00 102.0000 324.0
## 4 4 0.5060 -1.420 0.973 56.50 47.2000 202.0
## 5 5 0.6810 -1.720 0.940 32.20 25.1000 134.0
## 6 6 0.9020 -1.500 0.962 19.90 14.5000 94.8
## 7 7 0.9210 -1.670 0.917 13.20 8.6800 84.1
## 8 8 0.9040 -1.720 0.876 9.25 5.3900 76.3
## 9 9 0.8590 -1.700 0.836 6.80 3.5600 70.5
## 10 10 0.8330 -1.660 0.831 5.19 2.3800 65.8
## 11 12 0.8530 -1.480 0.911 3.33 1.1500 58.1
## 12 14 0.8760 -1.380 0.949 2.35 0.5740 51.9
## 13 16 0.9070 -1.300 0.970 1.77 0.3090 46.8
## 14 18 0.9120 -1.240 0.973 1.39 0.1670 42.5
## 15 20 0.9310 -1.210 0.977 1.14 0.0951 38.7
# Two side-by-side panels summarising the soft-threshold analysis.
# sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 <- 0.9
# Left panel: columns 1, 2 and 3 of sft$fitIndices are used as power, fit
# index and slope; the fit index is plotted with the sign of the slope flipped.
plot(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  xlab = "Soft Threshold (power)",
  ylab = "Scale Free Topology Model Fit,signed R^2",
  type = "n",
  main = "Scale independence"
)
text(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  labels = powers, cex = cex1, col = "red"
)
# Horizontal reference line at an R^2 cut-off of 0.90.
abline(h = 0.90, col = "red")
# Right panel: column 5 of sft$fitIndices (mean connectivity) against the power.
plot(
  sft$fitIndices[, 1], sft$fitIndices[, 5],
  xlab = "Soft Threshold (power)",
  ylab = "Mean Connectivity",
  type = "n",
  main = "Mean connectivity"
)
text(sft$fitIndices[, 1], sft$fitIndices[, 5],
     labels = powers, cex = cex1, col = "red")
# setwd("data")
# Network construction and module detection in a single call, using
# soft-thresholding power 6. The TOM is written to disk under the
# "femaleMouseTOM" file base.
net <- blockwiseModules(
  datExpr,
  power = 6,
  TOMType = "unsigned",
  minModuleSize = 30,
  reassignThreshold = 0,
  mergeCutHeight = 0.25,
  numericLabels = TRUE,
  pamRespectsDendro = FALSE,
  saveTOMs = TRUE,
  saveTOMFileBase = "femaleMouseTOM",
  verbose = 3
)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## Cluster size 3600 broken into 2133 1467
## Cluster size 2133 broken into 1221 912
## Done cluster 1221
## Done cluster 912
## Done cluster 2133
## Done cluster 1467
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.396405
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 1 into file femaleMouseTOM-block.1.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 1 genes from module 1 because their KME is too low.
## ..removing 1 genes from module 7 because their KME is too low.
## ..removing 1 genes from module 8 because their KME is too low.
## ..removing 1 genes from module 21 because their KME is too low.
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.25
## Calculating new MEs...
# setwd("..")
# open a graphics window
# sizeGrWindow(12, 9)
# Convert labels to colors for plotting
# Translate the numeric module labels into colour names for plotting.
mergedColors <- labels2colors(net$colors)
# Gene dendrogram of the first block with its module colours underneath.
plotDendroAndColors(
  net$dendrograms[[1]],
  mergedColors[net$blockGenes[[1]]],
  "Module colors",
  dendroLabels = FALSE, hang = 0.03,
  addGuide = TRUE, guideHang = 0.05
)
# Keep the pieces the later parts need: labels, colours, eigengenes, gene tree.
moduleLabels <- net$colors
moduleColors <- labels2colors(net$colors)
MEs <- net$MEs
geneTree <- net$dendrograms[[1]]
save(MEs, moduleLabels, moduleColors, geneTree,
     file = "data/FemaleLiver-02-networkConstruction-auto.RData")
# Restore the expression and trait data written by the data-input part;
# lnames receives the names of the loaded objects.
lnames <- load(file = "data/FemaleLiver-01-dataInput.RData")
lnames
## [1] "datExpr" "datTraits"
# Restore the network objects written by the network-construction part.
lnames <- load(file = "data/FemaleLiver-02-networkConstruction-auto.RData")
lnames
## [1] "MEs" "moduleLabels" "moduleColors" "geneTree"
# Gene and sample counts (columns and rows of datExpr).
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
# Module eigengenes recomputed from the colour labels, then reordered.
MEs0 <- moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs <- orderMEs(MEs0)
# Eigengene-trait correlations and their Student p-values.
moduleTraitCor <- cor(MEs, datTraits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)
# sizeGrWindow(10,6)
# Cell text: the correlation (2 significant digits) with its p-value
# (1 significant digit) in parentheses on a second line.
textMatrix <- paste0(signif(moduleTraitCor, 2), "\n(",
                     signif(moduleTraitPvalue, 1), ")")
# paste0() returns a plain vector; give it back the shape of the matrix.
dim(textMatrix) <- dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Heatmap of the correlations with the text above drawn in each cell.
labeledHeatmap(
  Matrix = moduleTraitCor,
  xLabels = names(datTraits),
  yLabels = names(MEs),
  ySymbols = names(MEs),
  colorLabels = FALSE,
  colors = greenWhiteRed(50),
  textMatrix = textMatrix,
  setStdMargins = FALSE,
  cex.text = 0.5,
  zlim = c(-1, 1),
  main = "Module-trait relationships"
)
# One-column data frame holding the weight_g column of datTraits.
weight <- as.data.frame(datTraits$weight_g)
names(weight) <- "weight"
# Module names: the eigengene column names with their first two characters removed.
modNames <- substring(names(MEs), 3)

# Module membership: correlation of each gene with each module eigengene,
# plus the matching p-values.
geneModuleMembership <- as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneModuleMembership), nSamples)
)
names(geneModuleMembership) <- paste0("MM", modNames)
names(MMPvalue) <- paste0("p.MM", modNames)

# Gene significance: correlation of each gene with weight, plus p-values.
geneTraitSignificance <- as.data.frame(cor(datExpr, weight, use = "p"))
GSPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)
)
names(geneTraitSignificance) <- paste0("GS.", names(weight))
names(GSPvalue) <- paste0("p.GS.", names(weight))
# Module to inspect, its column position among the module names, and a
# logical mask of the genes assigned to it.
module <- "brown"
column <- match(module, modNames)
moduleGenes <- moduleColors == module
# sizeGrWindow(7, 7);
par(mfrow = c(1, 1))
# Absolute module membership against absolute gene significance for the
# genes of the chosen module.
verboseScatterplot(
  abs(geneModuleMembership[moduleGenes, column]),
  abs(geneTraitSignificance[moduleGenes, 1]),
  xlab = paste("Module Membership in", module, "module"),
  ylab = "Gene significance for body weight",
  main = "Module membership vs. gene significance\n",
  cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
  col = module
)
# names(datExpr)[moduleColors=="brown"]
# Probe annotation table, and for each probe in datExpr the row of that table
# with the same substanceBXH identifier.
annot <- read.csv(file = "data/GeneAnnotation.csv")
probes <- names(datExpr)
probes2annot <- match(probes, annot$substanceBXH)
# Count of probes that found no annotation; 0 is expected.
sum(is.na(probes2annot))
## [1] 0
# First version of the per-gene summary: identifiers, module colour, and the
# gene significance for weight together with its p-value.
geneInfo0 <- data.frame(
  substanceBXH = probes,
  geneSymbol = annot$gene_symbol[probes2annot],
  LocusLinkID = annot$LocusLinkID[probes2annot],
  moduleColor = moduleColors,
  geneTraitSignificance,
  GSPvalue
)
# Rank the modules by the absolute correlation of their eigengene with weight,
# strongest first.
modOrder <- order(-abs(cor(MEs, weight, use = "p")))
# Append module membership and its p-value for every module, in that order.
# Each pair is added as two named columns ("MM.<module>", "p.MM.<module>").
# Assigning columns directly avoids rebuilding the whole data frame on every
# pass, and iterating over modOrder itself is also correct when there are no
# modules (1:ncol(x) would iterate over c(1, 0) in that case).
for (mod in modOrder) {
  geneInfo0[[paste0("MM.", modNames[mod])]] <- geneModuleMembership[, mod]
  geneInfo0[[paste0("p.MM.", modNames[mod])]] <- MMPvalue[, mod]
}
# Sort genes by module colour first, then by decreasing absolute gene
# significance for weight within each module.
geneOrder <- order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight))
geneInfo <- geneInfo0[geneOrder, ]
write.csv(geneInfo, file = "data/geneInfo.csv")
# Restore the expression and trait data written by the data-input part;
# lnames receives the names of the loaded objects.
lnames <- load(file = "data/FemaleLiver-01-dataInput.RData")
lnames
## [1] "datExpr" "datTraits"
# Restore the network objects written by the network-construction part.
lnames <- load(file = "data/FemaleLiver-02-networkConstruction-auto.RData")
lnames
## [1] "MEs" "moduleLabels" "moduleColors" "geneTree"
# Probe annotation table.
annot <- read.csv(file = "data/GeneAnnotation.csv")
# For each probe in the data set, the matching row of the annotation table.
probes <- names(datExpr)
probes2annot <- match(probes, annot$substanceBXH)
# LocusLink IDs of all probes, in the order of the columns of datExpr.
allLLIDs <- annot$LocusLinkID[probes2annot]
# Modules of interest.
intModules <- c("brown", "red", "salmon")
# One text file per module, listing the LocusLink IDs of its probes.
for (module in intModules) {
  # Probes assigned to this module, and their IDs.
  modGenes <- (moduleColors == module)
  modLLIDs <- allLLIDs[modGenes]
  # File name carries the module colour; no header and no row names are written.
  fileName <- paste0("data/LocusLinkIDs-", module, ".txt")
  write.table(as.data.frame(modLLIDs), file = fileName,
              row.names = FALSE, col.names = FALSE)
}
# The IDs of all probes in the analysis serve as enrichment background.
fileName <- "data/LocusLinkIDs-all.txt"
write.table(as.data.frame(allLLIDs), file = fileName,
            row.names = FALSE, col.names = FALSE)
# BiocManager::install("org.Mm.eg.db")
# Mouse annotation package, then GO enrichment of the module colour labels
# against the LocusLink IDs, requesting the 10 best terms (nBestP = 10).
library(org.Mm.eg.db)
GOenr <- GOenrichmentAnalysis(
  moduleColors, allLLIDs,
  organism = "mouse",
  nBestP = 10
)
## GOenrichmentAnalysis: loading annotation data...
## ..of the 3038 Entrez identifiers submitted, 2829 are mapped in current GO categories.
## ..will use 2829 background genes for enrichment calculations.
## ..preparing term lists (this may take a while)..
## ..working on label set 1 ..
## ..calculating enrichments (this may also take a while)..
## ..putting together terms with highest enrichment significance..
# Enrichment table taken from the fourth element of GOenr$bestPTerms.
# NOTE(review): element 4 is presumably the table that combines all
# ontologies - confirm against the GOenrichmentAnalysis documentation.
tab <- GOenr$bestPTerms[[4]]$enrichment
names(tab)
## [1] "module" "modSize" "bkgrModSize"
## [4] "rank" "enrichmentP" "BonferoniP"
## [7] "nModGenesInTerm" "fracOfBkgrModSize" "fracOfBkgrTermSize"
## [10] "bkgrTermSize" "termID" "termOntology"
## [13] "termName" "termDefinition"
# Full table goes to a comma-separated file.
write.table(tab, file = "data/GOEnrichmentTable.csv", sep = ",", quote = TRUE, row.names = FALSE)
# Reduced table for on-screen display: module, modSize, enrichmentP,
# BonferoniP, nModGenesInTerm, termOntology and termName.
keepCols <- c(1, 2, 5, 6, 7, 12, 13)
screenTab <- tab[, keepCols]
# Round the two p-value columns to 2 significant digits (signif, not decimal
# places). Each column is converted on its own so the data frame is never
# coerced to a matrix; factors go through as.character first so that their
# labels, not their integer codes, are converted.
numCols <- c(3, 4)
screenTab[numCols] <- lapply(screenTab[numCols], function(column) {
  if (is.factor(column)) {
    column <- as.character(column)
  }
  signif(as.numeric(column), 2)
})
# Truncate the term name to at most 40 characters.
screenTab[, 7] <- substring(screenTab[, 7], 1, 40)
# Shorter column names for display.
colnames(screenTab) <- c("module", "size", "p-val", "Bonf", "nInTerm", "ont", "term name")
rownames(screenTab) <- NULL
# Width of R's console output; adjust until the table prints satisfactorily.
options(width = 95)
# Finally, display the enrichment table:
# screenTab
# Restore the expression and trait data written by the data-input part;
# lnames receives the names of the loaded objects.
lnames <- load(file = "data/FemaleLiver-01-dataInput.RData")
lnames
## [1] "datExpr" "datTraits"
# Restore the network objects written by the network-construction part.
lnames <- load(file = "data/FemaleLiver-02-networkConstruction-auto.RData")
lnames
## [1] "MEs" "moduleLabels" "moduleColors" "geneTree"
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
# Topological overlap is computed from scratch here (power 6) and turned into
# a dissimilarity. Reusing the TOM saved during module detection would be more
# efficient, but it is deliberately recomputed.
dissTOM <- 1 - TOMsimilarityFromExpr(datExpr, power = 6)
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.396405
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Raise the dissimilarity to the 7th power so that moderately strong
# connections show up better in the heatmap.
plotTOM <- dissTOM^7
# An NA diagonal gives a nicer plot.
diag(plotTOM) <- NA
# sizeGrWindow(9,9)
# Heatmap over all genes, arranged by geneTree and annotated with module colours.
TOMplot(plotTOM, geneTree, moduleColors,
        main = "Network heatmap plot, all genes")
# Number of genes to draw at random for the reduced heatmap.
nSelect <- 400
# Fixed seed so that the same genes are drawn on every run.
set.seed(10)
select <- sample(nGenes, size = nSelect)
selectTOM <- dissTOM[select, select]
# A clustering tree cannot simply be restricted to a subset of genes, so the
# selected genes are clustered again.
selectTree <- hclust(as.dist(selectTOM), method = "average")
selectColors <- moduleColors[select]
# sizeGrWindow(9,9)
# Raising the dissimilarity to a power (7 here) effectively changes the colour
# palette and makes the plot more informative; the NA diagonal also improves
# clarity.
plotDiss <- selectTOM^7
diag(plotDiss) <- NA
TOMplot(plotDiss, selectTree, selectColors,
        main = "Network heatmap plot, selected genes")
# Module eigengenes, recomputed from the colour labels.
MEs <- moduleEigengenes(datExpr, moduleColors)$eigengenes
# One-column data frame holding the weight_g column of datTraits.
weight <- as.data.frame(datTraits$weight_g)
names(weight) <- "weight"
# Eigengenes with weight appended as an extra column, then reordered.
MET <- orderMEs(cbind(MEs, weight))
# Relationships among the eigengenes and the trait in one figure.
# sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(
  MET, "",
  marDendro = c(0, 4, 1, 2),
  marHeatmap = c(3, 4, 1, 2),
  cex.lab = 0.8,
  xLabelsAngle = 90
)
# Dendrogram on its own.
# sizeGrWindow(6,6);
par(cex = 1.0)
plotEigengeneNetworks(
  MET, "Eigengene dendrogram",
  marDendro = c(0, 4, 2, 0),
  plotHeatmaps = FALSE
)
# Heatmap on its own (note: this plot will overwrite the dendrogram plot).
par(cex = 1.0)
plotEigengeneNetworks(
  MET, "Eigengene adjacency heatmap",
  marHeatmap = c(3, 4, 2, 2),
  plotDendrograms = FALSE,
  xLabelsAngle = 90
)
# Restore the expression and trait data written by the data-input part;
# lnames receives the names of the loaded objects.
lnames <- load(file = "data/FemaleLiver-01-dataInput.RData")
#lnames
# Restore the network objects written by the network-construction part.
lnames <- load(file = "data/FemaleLiver-02-networkConstruction-auto.RData")
#lnames
# Topological overlap, recomputed with power 6.
TOM <- TOMsimilarityFromExpr(datExpr, power = 6)
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.396405
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Probe annotation table.
annot <- read.csv(file = "data/GeneAnnotation.csv")
# Module to export.
module <- "brown"
# Probes that belong to that module.
probes <- names(datExpr)
inModule <- (moduleColors == module)
modProbes <- probes[inModule]
# Topological overlap among the module's probes, labelled by probe name.
modTOM <- TOM[inModule, inModule]
dimnames(modTOM) <- list(modProbes, modProbes)
# Write the module network as an edge list file that VisANT can read.
vis <- exportNetworkToVisANT(
  modTOM,
  file = paste0("data/VisANTInput-", module, ".txt"),
  weighted = TRUE,
  threshold = 0,
  probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol)
)
# Restrict the export to the nTop probes with the highest soft connectivity
# computed within the module.
nTop <- 30
IMConn <- softConnectivity(datExpr[, modProbes])
## softConnectivity: FYI: connecitivty of genes with less than 45 valid samples will be returned as NA.
## ..calculating connectivities..
# Ranking the negated connectivity puts the most connected probes first.
top <- (rank(-IMConn) <= nTop)
vis <- exportNetworkToVisANT(
  modTOM[top, top],
  file = paste0("data/VisANTInput-", module, "-top30.txt"),
  weighted = TRUE,
  threshold = 0,
  probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol)
)
# Recalculate topological overlap only if needed. The calculation is
# expensive, so a numeric TOM matrix that already exists and has one row and
# one column per gene in datExpr is reused.
# NOTE(review): reuse assumes the existing TOM was computed from this datExpr
# with power = 6; remove TOM beforehand to force a fresh calculation.
if (!exists("TOM", mode = "numeric") ||
    !identical(dim(TOM), rep(ncol(datExpr), 2L))) {
  TOM <- TOMsimilarityFromExpr(datExpr, power = 6)
}
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.396405
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Probe annotation table.
annot <- read.csv(file = "data/GeneAnnotation.csv")
# Modules to export together.
modules <- c("brown", "red")
# Probes assigned to any of those modules, with their gene symbols.
probes <- names(datExpr)
inModule <- moduleColors %in% modules
modProbes <- probes[inModule]
modGenes <- annot$gene_symbol[match(modProbes, annot$substanceBXH)]
# Topological overlap among the selected probes, labelled by probe name.
modTOM <- TOM[inModule, inModule]
dimnames(modTOM) <- list(modProbes, modProbes)
# Write edge and node list files that Cytoscape can read; both file names
# carry the module names joined by "-".
cyt <- exportNetworkToCytoscape(
  modTOM,
  edgeFile = paste0("data/CytoscapeInput-edges-", paste(modules, collapse = "-"), ".txt"),
  nodeFile = paste0("data/CytoscapeInput-nodes-", paste(modules, collapse = "-"), ".txt"),
  weighted = TRUE,
  threshold = 0.02,
  nodeNames = modProbes,
  altNodeNames = modGenes,
  nodeAttr = moduleColors[inModule]
)