WGCNA

前言

WGCNA看上去是个高大上的分析方法，但是笔者看来，该分析缺少生物背景的支持，像是纯计算的人做出来的工具。虽然分析流程的旁路上，确实可以进行GO注释，但是其注释的结果如何整合到最终分析的结果中去呢？或许只能作为结果解释的“仅仅一个注释”而已（机械地组合分析结果和注释）。因此，笔者不建议使用这个分析方法，除非灌水SCI。当然，该方法的无标度网络分析（包括模块聚类）是值得借鉴的。

官方教程请参考：WGCNA tutorials

网上有个博客对代码进行了比较详细的解释，可参考：WGCNA分析，简单全面的最新教程

分析流程

library(EBImage)
pic = readImage("plots/flowchart.PNG")
display(pic)

数据准备

library(WGCNA);
# The following setting is important, do not omit.
options(stringsAsFactors = FALSE);
#Read in the female liver data set
femData = read.csv("data/LiverFemale3600.csv");

datExpr0 = as.data.frame(t(femData[, -c(1:8)]));
names(datExpr0) = femData$substanceBXH;
rownames(datExpr0) = names(femData)[-c(1:8)];

gsg = goodSamplesGenes(datExpr0, verbose = 3);

##  Flagging genes and samples with too many missing values...
##   ..step 1

gsg$allOK

## [1] TRUE

if (!gsg$allOK)
{
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0) 
     printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
  if (sum(!gsg$goodSamples)>0) 
     printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
  # Remove the offending genes and samples from the data:
  datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}

sampleTree = hclust(dist(datExpr0), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.

# sizeGrWindow(12,9)
#pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, 
     cex.axis = 1.5, cex.main = 2)

# Plot a line to show the cut
abline(h = 15, col = "red");

# Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
table(clust)

## clust
##   0   1 
##   1 134

# clust 1 contains the samples we want to keep.
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

traitData = read.csv("data/ClinicalTraits.csv");

# remove columns that hold information we do not need.
allTraits = traitData[, -c(31, 16)];
allTraits = allTraits[, c(2, 11:36) ];

# Form a data frame analogous to expression data that will hold the clinical traits.

femaleSamples = rownames(datExpr);
traitRows = match(femaleSamples, allTraits$Mice);
datTraits = allTraits[traitRows, -1];
rownames(datTraits) = allTraits[traitRows, 1];

collectGarbage();

# Re-cluster samples
sampleTree2 = hclust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits, signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits), 
                    main = "Sample dendrogram and trait heatmap")

save(datExpr, datTraits, file = "data/FemaleLiver-01-dataInput.RData")

计算共表达模块

options(stringsAsFactors = FALSE);
# Allow multi-threading within WGCNA. This helps speed up certain calculations.
# At present this call is necessary for the code to work.
# Any error here may be ignored but you may want to update WGCNA if you see one.
# Caution: skip this line if you run RStudio or other third-party R environments. 
# See note above.
enableWGCNAThreads()

## Allowing parallel execution with up to 7 working processes.

# Load the data saved in the first part
lnames = load(file = "data/FemaleLiver-01-dataInput.RData");
#The variable lnames contains the names of loaded variables.
lnames

## [1] "datExpr"   "datTraits"

# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)

## pickSoftThreshold: will use block size 3600.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 3600 of 3600
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.0278  0.345          0.456  747.00  762.0000 1210.0
## 2      2   0.1260 -0.597          0.843  254.00  251.0000  574.0
## 3      3   0.3400 -1.030          0.972  111.00  102.0000  324.0
## 4      4   0.5060 -1.420          0.973   56.50   47.2000  202.0
## 5      5   0.6810 -1.720          0.940   32.20   25.1000  134.0
## 6      6   0.9020 -1.500          0.962   19.90   14.5000   94.8
## 7      7   0.9210 -1.670          0.917   13.20    8.6800   84.1
## 8      8   0.9040 -1.720          0.876    9.25    5.3900   76.3
## 9      9   0.8590 -1.700          0.836    6.80    3.5600   70.5
## 10    10   0.8330 -1.660          0.831    5.19    2.3800   65.8
## 11    12   0.8530 -1.480          0.911    3.33    1.1500   58.1
## 12    14   0.8760 -1.380          0.949    2.35    0.5740   51.9
## 13    16   0.9070 -1.300          0.970    1.77    0.3090   46.8
## 14    18   0.9120 -1.240          0.973    1.39    0.1670   42.5
## 15    20   0.9310 -1.210          0.977    1.14    0.0951   38.7

# Plot the results:
# sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

# setwd("data")
net = blockwiseModules(datExpr, power = 6,
                       TOMType = "unsigned", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.25,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       saveTOMs = TRUE,
                       saveTOMFileBase = "femaleMouseTOM", 
                       verbose = 3)

##  Calculating module eigengenes block-wise from all genes
##    Flagging genes and samples with too many missing values...
##     ..step 1
## Cluster size 3600 broken into 2133 1467 
## Cluster size 2133 broken into 1221 912 
## Done cluster 1221 
## Done cluster 912 
## Done cluster 2133 
## Done cluster 1467 
##  ..Working on block 1 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.396405
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 1 into file femaleMouseTOM-block.1.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 1 genes from module 1 because their KME is too low.
##      ..removing 1 genes from module 7 because their KME is too low.
##      ..removing 1 genes from module 8 because their KME is too low.
##      ..removing 1 genes from module 21 because their KME is too low.
##  ..merging modules that are too close..
##      mergeCloseModules: Merging modules whose distance is less than 0.25
##        Calculating new MEs...

# setwd("..")

# open a graphics window
# sizeGrWindow(12, 9)
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];
save(MEs, moduleLabels, moduleColors, geneTree, 
     file = "data/FemaleLiver-02-networkConstruction-auto.RData")

将模块与表型进行关联

# Load the expression and trait data saved in the first part
lnames = load(file = "data/FemaleLiver-01-dataInput.RData");
#The variable lnames contains the names of loaded variables.
lnames

## [1] "datExpr"   "datTraits"

# Load network data saved in the second part.
lnames = load(file = "data/FemaleLiver-02-networkConstruction-auto.RData");
lnames

## [1] "MEs"          "moduleLabels" "moduleColors" "geneTree"

# Define numbers of genes and samples
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);

# sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix =  paste(signif(moduleTraitCor, 2), "\n(",
                           signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(datTraits),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))

# Define variable weight containing the weight column of datTrait
weight = as.data.frame(datTraits$weight_g);
names(weight) = "weight"
# names (colors) of the modules
modNames = substring(names(MEs), 3)

geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));

names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");

geneTraitSignificance = as.data.frame(cor(datExpr, weight, use = "p"));
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));

names(geneTraitSignificance) = paste("GS.", names(weight), sep="");
names(GSPvalue) = paste("p.GS.", names(weight), sep="");

module = "brown"
column = match(module, modNames);
moduleGenes = moduleColors==module;

# sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene significance for body weight",
                   main = paste("Module membership vs. gene significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

# names(datExpr)[moduleColors=="brown"]

annot = read.csv(file = "data/GeneAnnotation.csv");
probes = names(datExpr)
probes2annot = match(probes, annot$substanceBXH)
# The following is the number or probes without annotation:
sum(is.na(probes2annot))

## [1] 0

# Should return 0.

# Create the starting data frame
geneInfo0 = data.frame(substanceBXH = probes,
                      geneSymbol = annot$gene_symbol[probes2annot],
                      LocusLinkID = annot$LocusLinkID[probes2annot],
                      moduleColor = moduleColors,
                      geneTraitSignificance,
                      GSPvalue)
# Order modules by their significance for weight
modOrder = order(-abs(cor(MEs, weight, use = "p")));
# Add module membership information in the chosen order
for (mod in 1:ncol(geneModuleMembership))
{
  oldNames = names(geneInfo0)
  geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]], 
                         MMPvalue[, modOrder[mod]]);
  names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""),
                       paste("p.MM.", modNames[modOrder[mod]], sep=""))
}
# Order the genes in the geneInfo variable first by module color, then by geneTraitSignificance
geneOrder = order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight));
geneInfo = geneInfo0[geneOrder, ]


write.csv(geneInfo, file = "data/geneInfo.csv")

功能注释和富集分析

# Load the expression and trait data saved in the first part
lnames = load(file = "data/FemaleLiver-01-dataInput.RData");
#The variable lnames contains the names of loaded variables.
lnames

## [1] "datExpr"   "datTraits"

# Load network data saved in the second part.
lnames = load(file = "data/FemaleLiver-02-networkConstruction-auto.RData");
lnames

## [1] "MEs"          "moduleLabels" "moduleColors" "geneTree"

# Read in the probe annotation
annot = read.csv(file = "data/GeneAnnotation.csv");
# Match probes in the data set to the probe IDs in the annotation file 
probes = names(datExpr)
probes2annot = match(probes, annot$substanceBXH)
# Get the corresponding Locuis Link IDs
allLLIDs = annot$LocusLinkID[probes2annot];
# $ Choose interesting modules
intModules = c("brown", "red", "salmon")
for (module in intModules)
{
  # Select module probes
  modGenes = (moduleColors==module)
  # Get their entrez ID codes
  modLLIDs = allLLIDs[modGenes];
  # Write them into a file
  fileName = paste("data/LocusLinkIDs-", module, ".txt", sep="");
  write.table(as.data.frame(modLLIDs), file = fileName,
              row.names = FALSE, col.names = FALSE)
}
# As background in the enrichment analysis, we will use all probes in the analysis.
fileName = paste("data/LocusLinkIDs-all.txt", sep="");
write.table(as.data.frame(allLLIDs), file = fileName,
            row.names = FALSE, col.names = FALSE)

# BiocManager::install("org.Mm.eg.db")
library(org.Mm.eg.db)
GOenr = GOenrichmentAnalysis(moduleColors, allLLIDs, organism = "mouse", nBestP = 10);

##  GOenrichmentAnalysis: loading annotation data...
##   ..of the 3038  Entrez identifiers submitted, 2829 are mapped in current GO categories.
##   ..will use 2829 background genes for enrichment calculations.
##   ..preparing term lists (this may take a while).. 
##   ..working on label set 1 ..
##     ..calculating enrichments (this may also take a while)..
##     ..putting together terms with highest enrichment significance..

tab = GOenr$bestPTerms[[4]]$enrichment

names(tab)

##  [1] "module"             "modSize"            "bkgrModSize"       
##  [4] "rank"               "enrichmentP"        "BonferoniP"        
##  [7] "nModGenesInTerm"    "fracOfBkgrModSize"  "fracOfBkgrTermSize"
## [10] "bkgrTermSize"       "termID"             "termOntology"      
## [13] "termName"           "termDefinition"

write.table(tab, file = "data/GOEnrichmentTable.csv", sep = ",", quote = TRUE, row.names = FALSE)

keepCols = c(1, 2, 5, 6, 7, 12, 13);
screenTab = tab[, keepCols];
# Round the numeric columns to 2 decimal places:
numCols = c(3, 4);
screenTab[, numCols] = signif(apply(screenTab[, numCols], 2, as.numeric), 2)
# Truncate the the term name to at most 40 characters
screenTab[, 7] = substring(screenTab[, 7], 1, 40)
# Shorten the column names:
colnames(screenTab) = c("module", "size", "p-val", "Bonf", "nInTerm", "ont", "term name");
rownames(screenTab) = NULL;
# Set the width of R's output. The reader should play with this number to obtain satisfactory output.
options(width=95)
# Finally, display the enrichment table:
# screenTab

模块间的关联

# Load the expression and trait data saved in the first part
lnames = load(file = "data/FemaleLiver-01-dataInput.RData");
#The variable lnames contains the names of loaded variables.
lnames

## [1] "datExpr"   "datTraits"

# Load network data saved in the second part.
lnames = load(file = "data/FemaleLiver-02-networkConstruction-auto.RData");
lnames

## [1] "MEs"          "moduleLabels" "moduleColors" "geneTree"

nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

# Calculate topological overlap anew: this could be done more efficiently by saving the TOM
# calculated during module detection, but let us do it again here.
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);

## TOM calculation: adjacency..
## ..will not use multithreading.
##  Fraction of slow calculations: 0.396405
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.

# Transform dissTOM with a power to make moderately strong connections more visible in the heatmap
plotTOM = dissTOM^7;
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA;
# Call the plot function
# sizeGrWindow(9,9)
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")

nSelect = 400
# For reproducibility, we set the random seed
set.seed(10);
select = sample(nGenes, size = nSelect);
selectTOM = dissTOM[select, select];
# There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select];
# Open a graphical window
# sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing 
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^7;
diag(plotDiss) = NA;
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

# Recalculate module eigengenes
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
# Isolate weight from the clinical traits
weight = as.data.frame(datTraits$weight_g);
names(weight) = "weight"
# Add the weight to existing module eigengenes
MET = orderMEs(cbind(MEs, weight))
# Plot the relationships among the eigengenes and the trait
# sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle = 90)

# Plot the dendrogram
# sizeGrWindow(6,6);
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
                      plotHeatmaps = FALSE)

# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
                      plotDendrograms = FALSE, xLabelsAngle = 90)

导出结果

# Load the expression and trait data saved in the first part
lnames = load(file = "data/FemaleLiver-01-dataInput.RData");
#The variable lnames contains the names of loaded variables.
#lnames
# Load network data saved in the second part.
lnames = load(file = "data/FemaleLiver-02-networkConstruction-auto.RData");
#lnames

# Recalculate topological overlap
TOM = TOMsimilarityFromExpr(datExpr, power = 6);

## TOM calculation: adjacency..
## ..will not use multithreading.
##  Fraction of slow calculations: 0.396405
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.

# Read in the annotation file
annot = read.csv(file = "data/GeneAnnotation.csv");
# Select module
module = "brown";
# Select module probes
probes = names(datExpr)
inModule = (moduleColors==module);
modProbes = probes[inModule];
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
# Export the network into an edge list file VisANT can read
vis = exportNetworkToVisANT(modTOM,
  file = paste("data/VisANTInput-", module, ".txt", sep=""),
  weighted = TRUE,
  threshold = 0,
  probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol) )

nTop = 30;
IMConn = softConnectivity(datExpr[, modProbes]);

##  softConnectivity: FYI: connecitivty of genes with less than 45 valid samples will be returned as NA.
##  ..calculating connectivities..

top = (rank(-IMConn) <= nTop)
vis = exportNetworkToVisANT(modTOM[top, top],
  file = paste("data/VisANTInput-", module, "-top30.txt", sep=""),
  weighted = TRUE,
  threshold = 0,
  probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol) )

# Recalculate topological overlap if needed
TOM = TOMsimilarityFromExpr(datExpr, power = 6);

## TOM calculation: adjacency..
## ..will not use multithreading.
##  Fraction of slow calculations: 0.396405
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.

# Read in the annotation file
annot = read.csv(file = "data/GeneAnnotation.csv");
# Select modules
modules = c("brown", "red");
# Select module probes
probes = names(datExpr)
inModule = is.finite(match(moduleColors, modules));
modProbes = probes[inModule];
modGenes = annot$gene_symbol[match(modProbes, annot$substanceBXH)];
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
# Export the network into edge and node list files Cytoscape can read
cyt = exportNetworkToCytoscape(modTOM,
  edgeFile = paste("data/CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
  nodeFile = paste("data/CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
  weighted = TRUE,
  threshold = 0.02,
  nodeNames = modProbes,
  altNodeNames = modGenes,
  nodeAttr = moduleColors[inModule])