# Only run the following commands once to install WGCNA and flashClust on your computer
# source("http://bioconductor.org/biocLite.R")
# biocLite("WGCNA")
# install.packages("flashClust")
# Load WGCNA and flashClust libraries every time you open R
library(WGCNA)
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
##
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
##
## hclust
## ==========================================================================
## *
## * Package WGCNA 1.42 loaded.
## *
## * Important note: It appears that your system supports multi-threading,
## * but it is not enabled within WGCNA in R.
## * To allow multi-threading within WGCNA with all available cores, use
## *
## * allowWGCNAThreads()
## *
## * within R. Use disableWGCNAThreads() to disable threading if necessary.
## * Alternatively, set the following environment variable on your system:
## *
## * ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## * for example
## *
## * ALLOW_WGCNA_THREADS=2
## *
## * To set the environment variable in linux bash shell, type
## *
## * export ALLOW_WGCNA_THREADS=2
## *
## * before running R. Other operating systems or shells will
## * have a similar command to achieve the same aim.
## *
## ==========================================================================
##
## Attaching package: 'WGCNA'
## The following object is masked from 'package:stats':
##
## cor
library(flashClust)
##
## Attaching package: 'flashClust'
## The following object is masked from 'package:fastcluster':
##
## hclust
## The following object is masked from 'package:stats':
##
## hclust
#Set your current working directory (where all your files are)
setwd("~/Desktop/WGCNA/") # Change the text within quotes as necessary. I have downloaded and unzipped the WGCNA folder on my Desktop
Uploading data into R and formatting it for WGCNA
# This creates an object called "datExpr" that contains the normalized counts file output from DESeq2
datExpr = read.csv("SampleTimeSeriesRLD.csv")
# "head" the file to preview it
head(datExpr) # You see that genes are listed in a column named "X" and samples are in columns
## X A0 A1 A2 A4 A5 A6
## 1 isogroup2041 3.2279882 4.425669 4.053493 4.081084 3.842665 4.244835
## 2 isogroup27249 0.9261955 2.031387 2.085966 2.463860 1.823165 2.743842
## 3 isogroup26716 1.0776734 2.016798 2.798007 2.014945 1.936990 2.689329
## 4 isogroup21525 2.3277012 3.961517 3.532261 3.515290 3.860423 3.651837
## 5 isogroup17063 1.7921210 2.919299 2.883117 2.719757 3.584092 2.972269
## 6 isogroup23098 7.5494339 8.605921 8.714458 8.285594 8.359864 8.287737
## A7 A8 A9 A10 A11 A12 B0 B1
## 1 3.676771 3.941789 4.471518 3.701433 3.721957 4.228176 2.711784 4.438969
## 2 2.999606 2.259128 2.773004 3.285110 2.589221 2.343527 1.065831 2.102840
## 3 1.775270 3.166939 3.205823 1.556635 2.692233 2.704825 1.222318 2.552086
## 4 3.481384 3.095274 2.596508 3.672719 3.652501 3.434640 2.554495 4.371392
## 5 3.577360 3.559317 3.596309 3.693239 2.953543 3.217554 1.629696 3.271179
## 6 7.949861 7.872196 8.282509 7.873440 7.813140 8.006631 7.379813 8.614006
## B2 B3 B4 B5 B6 B7 B8 B9
## 1 4.192873 4.283763 4.145406 3.646702 3.818861 4.133641 3.990479 3.834212
## 2 2.321835 1.615376 2.227152 2.695438 2.382110 3.172436 2.175775 2.881731
## 3 2.710030 2.628911 2.067072 2.569413 3.135256 2.535408 3.358315 2.747720
## 4 3.624168 3.266043 3.382221 3.970439 3.827180 3.380975 3.037175 4.128778
## 5 3.619758 2.629647 3.387749 3.052602 3.060756 3.423325 2.888494 3.587651
## 6 8.731997 8.634263 8.368881 8.351090 8.118341 8.291326 8.059225 7.997958
## B10 B12 C0 C1 C2 C3 C4 C5
## 1 4.287655 4.391409 3.704225 4.429310 4.518705 4.174755 4.121667 3.712698
## 2 3.297518 2.558725 1.478850 2.417905 2.299758 2.778593 2.016950 2.564248
## 3 3.269745 2.959743 1.225789 2.179640 2.290551 2.608619 2.346118 3.086956
## 4 2.924908 3.679077 1.722992 4.011196 3.393334 3.560959 2.707955 3.156925
## 5 3.620145 3.119855 1.982470 3.029753 3.156511 2.659158 3.411735 3.325926
## 6 7.920953 7.821971 7.555304 8.748981 8.765364 8.474679 8.197553 8.329218
## C6 C7 C8 C9 C10 C12 D0 D1
## 1 4.121563 4.662792 4.485772 3.995811 4.016984 3.710906 3.438738 4.778724
## 2 1.044283 2.287411 1.852964 2.560540 1.617758 2.791151 1.104202 1.238932
## 3 2.681671 1.802319 2.490925 2.978003 3.083341 3.481888 1.262211 1.563040
## 4 3.262889 3.706466 2.689109 2.848256 3.269979 3.249449 1.762327 4.412169
## 5 3.411635 3.851133 3.593428 3.475429 3.688129 3.186268 1.672265 3.322115
## 6 8.320269 8.305366 7.958708 7.979192 8.269686 7.917266 7.541028 8.437456
## D2 D3 D4 D5 D6 D7 D8 D9
## 1 4.372119 4.471382 3.791108 4.053877 4.127585 4.450004 3.948535 4.066282
## 2 2.053670 2.188338 1.762516 2.522993 2.235528 2.612178 2.309813 1.709138
## 3 2.551285 2.718058 2.634042 2.831299 1.894300 2.381463 2.731666 3.226271
## 4 3.148012 3.064167 3.459427 2.946796 3.268502 2.743258 3.474978 3.408191
## 5 3.382879 2.623312 2.661818 3.464174 2.646809 3.785781 3.359743 3.187177
## 6 8.792712 8.708494 8.456304 8.376324 8.253110 8.260721 7.926216 8.015008
## D10 D11
## 1 4.083817 4.521707
## 2 2.312437 1.839739
## 3 2.215026 3.051648
## 4 2.957649 3.116199
## 5 3.768126 3.520356
## 6 8.061845 8.100753
# Manipulate file so it matches the format WGCNA needs
row.names(datExpr) = datExpr$X
datExpr$X = NULL
datExpr = as.data.frame(t(datExpr)) # now samples are rows and genes are columns
dim(datExpr) # 48 samples and 1000 genes (you will have many more genes in reality)
## [1] 48 1000
# Run this to check if there are gene outliers
gsg = goodSamplesGenes(datExpr, verbose = 3)
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK
## [1] TRUE
#If the last statement returns TRUE, all genes have passed the cuts. If not, we remove the offending genes and samples from the data with the following:
#if (!gsg$allOK)
# {if (sum(!gsg$goodGenes)>0)
# printFlush(paste("Removing genes:", paste(names(datExpr)[!gsg$goodGenes], collapse= ", ")));
# if (sum(!gsg$goodSamples)>0)
# printFlush(paste("Removing samples:", paste(rownames(datExpr)[!gsg$goodSamples], collapse=", ")))
# datExpr= datExpr[gsg$goodSamples, gsg$goodGenes]
# }
#Create an object called "datTraits" that contains your trait data
datTraits = read.csv("Traits_23May2015.csv")
head(datTraits)
## Sample Day0 Day1 Day2 Day3 Day4 Day5 Day6 Day7 Day8 Day9 Day10 Day11
## 1 A0 1 0 0 0 0 0 0 0 0 0 0 0
## 2 A1 0 1 0 0 0 0 0 0 0 0 0 0
## 3 A2 0 0 1 0 0 0 0 0 0 0 0 0
## 4 A4 0 0 0 0 1 0 0 0 0 0 0 0
## 5 A5 0 0 0 0 0 1 0 0 0 0 0 0
## 6 A6 0 0 0 0 0 0 1 0 0 0 0 0
## Day12 HPF RedFluoro GreenFluoro
## 1 0 3 0.00000 0.0000
## 2 0 22 0.00000 0.0000
## 3 0 46 0.00000 0.0000
## 4 0 56 54.13165 19.4236
## 5 0 80 75.66280 17.8254
## 6 0 104 92.42900 56.9575
#form a data frame analogous to expression data that will hold the clinical traits.
rownames(datTraits) = datTraits$Sample
datTraits$Sample = NULL
table(rownames(datTraits)==rownames(datExpr)) #should return TRUE if datasets align correctly, otherwise your names are out of order
##
## TRUE
## 48
You have finished uploading and formatting expression and trait data
Expression data is in datExpr, corresponding traits are datTraits
Cluster samples by expression
A = adjacency(t(datExpr),type="signed") # this calculates the whole network connectivity
k = as.numeric(apply(A,2,sum))-1 # standardized connectivity
Z.k = scale(k)
thresholdZ.k = -2.5 # often -2.5
outlierColor = ifelse(Z.k<thresholdZ.k,"red","black")
sampleTree = flashClust(as.dist(1-A), method = "average")
# Convert traits to a color representation where red indicates high values
traitColors = data.frame(numbers2colors(datTraits,signed=FALSE))
dimnames(traitColors)[[2]] = paste(names(datTraits))
datColors = data.frame(outlier = outlierColor,traitColors)
plotDendroAndColors(sampleTree,groupLabels=names(datColors),
colors=datColors,main="Sample Dendrogram and Trait Heatmap")
Day “0” outliers have been identified. You could exclude these samples with the code below, but this scientists had a biological reason to NOT exclude these samples. It’s up to you. Justify whatever decision you make.
# Remove outlying samples
#remove.samples = Z.k<thresholdZ.k | is.na(Z.k)
#datExprOut = datExpr[!remove.samples,]
#datTraitsOut = datTraits[!remove.samples,]
#save(datExprOut, datTraitsOut, file="SamplesAndTraits_OutliersRemoved.RData")
Choose a soft threshold power- USE A SUPERCOMPUTER IRL
powers = c(c(1:10), seq(from =10, to=30, by=1)) #choosing a set of soft-thresholding powers
sft = pickSoftThreshold(datExpr, powerVector=powers, verbose =5, networkType="signed") #call network topology analysis function
## pickSoftThreshold: will use block size 1000.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 1000 of 1000
## Warning: executing %dopar% sequentially: no parallel backend registered
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.604 2.520 0.85600 524.00 543.000 609.0
## 2 2 0.177 -0.985 -0.00776 317.00 322.000 448.0
## 3 3 0.434 -1.180 0.28100 210.00 203.000 354.0
## 4 4 0.562 -1.190 0.55000 147.00 133.000 291.0
## 5 5 0.610 -1.160 0.68600 107.00 90.900 245.0
## 6 6 0.634 -1.180 0.77700 81.00 66.500 209.0
## 7 7 0.674 -1.180 0.81700 62.60 49.900 180.0
## 8 8 0.709 -1.210 0.85200 49.30 38.600 157.0
## 9 9 0.729 -1.240 0.87600 39.50 29.600 138.0
## 10 10 0.756 -1.240 0.90300 32.10 23.000 122.0
## 11 10 0.756 -1.240 0.90300 32.10 23.000 122.0
## 12 11 0.766 -1.290 0.90500 26.40 18.200 108.0
## 13 12 0.791 -1.300 0.92200 21.90 14.500 96.5
## 14 13 0.792 -1.350 0.91400 18.30 11.500 86.5
## 15 14 0.811 -1.360 0.92600 15.50 9.210 77.8
## 16 15 0.829 -1.380 0.93600 13.20 7.520 70.3
## 17 16 0.861 -1.350 0.95100 11.30 6.250 63.6
## 18 17 0.874 -1.370 0.95900 9.71 5.060 57.8
## 19 18 0.880 -1.370 0.95600 8.42 4.140 52.6
## 20 19 0.897 -1.360 0.96800 7.33 3.400 48.0
## 21 20 0.903 -1.350 0.96700 6.42 2.830 43.9
## 22 21 0.922 -1.350 0.97700 5.64 2.390 40.3
## 23 22 0.936 -1.330 0.98500 4.98 1.980 37.0
## 24 23 0.944 -1.340 0.98900 4.41 1.670 34.0
## 25 24 0.953 -1.320 0.99500 3.92 1.390 31.3
## 26 25 0.956 -1.310 0.99300 3.50 1.140 28.9
## 27 26 0.958 -1.300 0.98900 3.14 0.953 26.7
## 28 27 0.964 -1.280 0.99300 2.82 0.807 24.8
## 29 28 0.970 -1.270 0.99400 2.54 0.690 22.9
## 30 29 0.974 -1.260 0.99400 2.29 0.584 21.3
## 31 30 0.976 -1.250 0.98900 2.08 0.501 19.8
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab= "Soft Threshold (power)", ylab="Scale Free Topology Model Fit, signed R^2", type= "n", main= paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers, col="red")
abline(h=0.90, col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab= "Soft Threshold (power)", ylab="Mean Connectivity", type="n", main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, col="red")
From this plot, we would choose a power of 18 becuase it’s the lowest power for which the scale free topology index reaches 0.90
enableWGCNAThreads()
## Allowing parallel execution with up to 2 working processes.
softPower = 18
adjacency = adjacency(datExpr, power = softPower, type = "signed") #specify network type
Construct Networks- USE A SUPERCOMPUTER IRL
#translate the adjacency into topological overlap matrix and calculate the corresponding dissimilarity:
TOM = TOMsimilarity(adjacency, TOMType="signed") # specify network type
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
dissTOM = 1-TOM
Generate Modules
# Generate a clustered gene tree
geneTree = flashClust(as.dist(dissTOM), method="average")
plot(geneTree, xlab="", sub="", main= "Gene Clustering on TOM-based dissimilarity", labels= FALSE, hang=0.04)
#This sets the minimum number of genes to cluster into a module
minModuleSize = 30
dynamicMods = cutreeDynamic(dendro= geneTree, distM= dissTOM, deepSplit=2, pamRespectsDendro= FALSE, minClusterSize = minModuleSize)
## ..cutHeight not given, setting it to 0.997 ===> 99% of the (truncated) height range in dendro.
## ..done.
dynamicColors= labels2colors(dynamicMods)
MEList= moduleEigengenes(datExpr, colors= dynamicColors,softPower = 18)
MEs= MEList$eigengenes
MEDiss= 1-cor(MEs)
METree= flashClust(as.dist(MEDiss), method= "average")
save(dynamicMods, MEList, MEs, MEDiss, METree, file= "Network_allSamples_signed_RLDfiltered.RData")
#plots tree showing how the eigengenes cluster together
plot(METree, main= "Clustering of module eigengenes", xlab= "", sub= "")
#set a threhold for merging modules. In this example we are not merging so MEDissThres=0.0
MEDissThres = 0.0
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight= MEDissThres, verbose =3)
## mergeCloseModules: Merging modules whose distance is less than 0
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 9 module eigengenes in given set.
## Calculating new MEs...
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 9 module eigengenes in given set.
mergedColors = merge$colors
mergedMEs = merge$newMEs
#plot dendrogram with module colors below it
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels= FALSE, hang=0.03, addGuide= TRUE, guideHang=0.05)
moduleColors = mergedColors
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs
Correlate traits
#Define number of genes and samples
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#Print correlation heatmap between modules and traits
textMatrix= paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
par(mar= c(6, 8.5, 3, 3))
#display the corelation values with a heatmap plot
labeledHeatmap(Matrix= moduleTraitCor,
xLabels= names(datTraits),
yLabels= names(MEs),
ySymbols= names(MEs),
colorLabels= FALSE,
colors= blueWhiteRed(50),
textMatrix= textMatrix,
setStdMargins= FALSE,
cex.text= 0.5,
zlim= c(-1,1),
main= paste("Module-trait relationships"))