# Only run the following commands once to install WGCNA and flashClust on your computer
# source("http://bioconductor.org/biocLite.R")
# biocLite("WGCNA")
# install.packages("flashClust") 

# Load WGCNA and flashClust libraries every time you open R
library(WGCNA)
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
## 
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
## 
##     hclust
## ==========================================================================
## *
## *  Package WGCNA 1.42 loaded.
## *
## *    Important note: It appears that your system supports multi-threading,
## *    but it is not enabled within WGCNA in R. 
## *    To allow multi-threading within WGCNA with all available cores, use 
## *
## *          allowWGCNAThreads()
## *
## *    within R. Use disableWGCNAThreads() to disable threading if necessary.
## *    Alternatively, set the following environment variable on your system:
## *
## *          ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## *    for example 
## *
## *          ALLOW_WGCNA_THREADS=2
## *
## *    To set the environment variable in linux bash shell, type 
## *
## *           export ALLOW_WGCNA_THREADS=2
## *
## *     before running R. Other operating systems or shells will
## *     have a similar command to achieve the same aim.
## *
## ==========================================================================
## 
## Attaching package: 'WGCNA'
## The following object is masked from 'package:stats':
## 
##     cor
library(flashClust)
## 
## Attaching package: 'flashClust'
## The following object is masked from 'package:fastcluster':
## 
##     hclust
## The following object is masked from 'package:stats':
## 
##     hclust
#Set your current working directory (where all your files are)
setwd("~/Desktop/WGCNA/") # Change the text within quotes as necessary. I have downloaded and unzipped the WGCNA folder on my Desktop

Uploading data into R and formatting it for WGCNA

# This creates an object called "datExpr" that contains the normalized counts file output from DESeq2
datExpr = read.csv("SampleTimeSeriesRLD.csv")
# "head" the file to preview it
head(datExpr) # You see that genes are listed in a column named "X" and samples are in columns
##               X        A0       A1       A2       A4       A5       A6
## 1  isogroup2041 3.2279882 4.425669 4.053493 4.081084 3.842665 4.244835
## 2 isogroup27249 0.9261955 2.031387 2.085966 2.463860 1.823165 2.743842
## 3 isogroup26716 1.0776734 2.016798 2.798007 2.014945 1.936990 2.689329
## 4 isogroup21525 2.3277012 3.961517 3.532261 3.515290 3.860423 3.651837
## 5 isogroup17063 1.7921210 2.919299 2.883117 2.719757 3.584092 2.972269
## 6 isogroup23098 7.5494339 8.605921 8.714458 8.285594 8.359864 8.287737
##         A7       A8       A9      A10      A11      A12       B0       B1
## 1 3.676771 3.941789 4.471518 3.701433 3.721957 4.228176 2.711784 4.438969
## 2 2.999606 2.259128 2.773004 3.285110 2.589221 2.343527 1.065831 2.102840
## 3 1.775270 3.166939 3.205823 1.556635 2.692233 2.704825 1.222318 2.552086
## 4 3.481384 3.095274 2.596508 3.672719 3.652501 3.434640 2.554495 4.371392
## 5 3.577360 3.559317 3.596309 3.693239 2.953543 3.217554 1.629696 3.271179
## 6 7.949861 7.872196 8.282509 7.873440 7.813140 8.006631 7.379813 8.614006
##         B2       B3       B4       B5       B6       B7       B8       B9
## 1 4.192873 4.283763 4.145406 3.646702 3.818861 4.133641 3.990479 3.834212
## 2 2.321835 1.615376 2.227152 2.695438 2.382110 3.172436 2.175775 2.881731
## 3 2.710030 2.628911 2.067072 2.569413 3.135256 2.535408 3.358315 2.747720
## 4 3.624168 3.266043 3.382221 3.970439 3.827180 3.380975 3.037175 4.128778
## 5 3.619758 2.629647 3.387749 3.052602 3.060756 3.423325 2.888494 3.587651
## 6 8.731997 8.634263 8.368881 8.351090 8.118341 8.291326 8.059225 7.997958
##        B10      B12       C0       C1       C2       C3       C4       C5
## 1 4.287655 4.391409 3.704225 4.429310 4.518705 4.174755 4.121667 3.712698
## 2 3.297518 2.558725 1.478850 2.417905 2.299758 2.778593 2.016950 2.564248
## 3 3.269745 2.959743 1.225789 2.179640 2.290551 2.608619 2.346118 3.086956
## 4 2.924908 3.679077 1.722992 4.011196 3.393334 3.560959 2.707955 3.156925
## 5 3.620145 3.119855 1.982470 3.029753 3.156511 2.659158 3.411735 3.325926
## 6 7.920953 7.821971 7.555304 8.748981 8.765364 8.474679 8.197553 8.329218
##         C6       C7       C8       C9      C10      C12       D0       D1
## 1 4.121563 4.662792 4.485772 3.995811 4.016984 3.710906 3.438738 4.778724
## 2 1.044283 2.287411 1.852964 2.560540 1.617758 2.791151 1.104202 1.238932
## 3 2.681671 1.802319 2.490925 2.978003 3.083341 3.481888 1.262211 1.563040
## 4 3.262889 3.706466 2.689109 2.848256 3.269979 3.249449 1.762327 4.412169
## 5 3.411635 3.851133 3.593428 3.475429 3.688129 3.186268 1.672265 3.322115
## 6 8.320269 8.305366 7.958708 7.979192 8.269686 7.917266 7.541028 8.437456
##         D2       D3       D4       D5       D6       D7       D8       D9
## 1 4.372119 4.471382 3.791108 4.053877 4.127585 4.450004 3.948535 4.066282
## 2 2.053670 2.188338 1.762516 2.522993 2.235528 2.612178 2.309813 1.709138
## 3 2.551285 2.718058 2.634042 2.831299 1.894300 2.381463 2.731666 3.226271
## 4 3.148012 3.064167 3.459427 2.946796 3.268502 2.743258 3.474978 3.408191
## 5 3.382879 2.623312 2.661818 3.464174 2.646809 3.785781 3.359743 3.187177
## 6 8.792712 8.708494 8.456304 8.376324 8.253110 8.260721 7.926216 8.015008
##        D10      D11
## 1 4.083817 4.521707
## 2 2.312437 1.839739
## 3 2.215026 3.051648
## 4 2.957649 3.116199
## 5 3.768126 3.520356
## 6 8.061845 8.100753
# Manipulate file so it matches the format WGCNA needs 
row.names(datExpr) = datExpr$X
datExpr$X = NULL
datExpr = as.data.frame(t(datExpr)) # now samples are rows and genes are columns
dim(datExpr) # 48 samples and 1000 genes (you will have many more genes in reality)
## [1]   48 1000
# Run this to check if there are gene outliers
gsg = goodSamplesGenes(datExpr, verbose = 3)
##  Flagging genes and samples with too many missing values...
##   ..step 1
gsg$allOK 
## [1] TRUE
#If the last statement returns TRUE, all genes have passed the cuts. If not, we remove the offending genes and samples from the data with the following:

#if (!gsg$allOK)
#  {if (sum(!gsg$goodGenes)>0)
#       printFlush(paste("Removing genes:", paste(names(datExpr)[!gsg$goodGenes], collapse= ", ")));
#       if (sum(!gsg$goodSamples)>0)
#           printFlush(paste("Removing samples:", paste(rownames(datExpr)[!gsg$goodSamples], collapse=", ")))
#       datExpr= datExpr[gsg$goodSamples, gsg$goodGenes]
#       }

#Create an object called "datTraits" that contains your trait data
datTraits = read.csv("Traits_23May2015.csv")
head(datTraits)
##   Sample Day0 Day1 Day2 Day3 Day4 Day5 Day6 Day7 Day8 Day9 Day10 Day11
## 1     A0    1    0    0    0    0    0    0    0    0    0     0     0
## 2     A1    0    1    0    0    0    0    0    0    0    0     0     0
## 3     A2    0    0    1    0    0    0    0    0    0    0     0     0
## 4     A4    0    0    0    0    1    0    0    0    0    0     0     0
## 5     A5    0    0    0    0    0    1    0    0    0    0     0     0
## 6     A6    0    0    0    0    0    0    1    0    0    0     0     0
##   Day12 HPF RedFluoro GreenFluoro
## 1     0   3   0.00000      0.0000
## 2     0  22   0.00000      0.0000
## 3     0  46   0.00000      0.0000
## 4     0  56  54.13165     19.4236
## 5     0  80  75.66280     17.8254
## 6     0 104  92.42900     56.9575
#form a data frame analogous to expression data that will hold the clinical traits.
rownames(datTraits) = datTraits$Sample
datTraits$Sample = NULL
table(rownames(datTraits)==rownames(datExpr)) #should return TRUE if datasets align correctly, otherwise your names are out of order
## 
## TRUE 
##   48

You have finished uploading and formatting expression and trait data

Expression data is in datExpr, corresponding traits are datTraits

Cluster samples by expression

A = adjacency(t(datExpr),type="signed") # this calculates the whole network connectivity
k = as.numeric(apply(A,2,sum))-1 # standardized connectivity
Z.k = scale(k)
thresholdZ.k = -2.5 # often -2.5
outlierColor = ifelse(Z.k<thresholdZ.k,"red","black")
sampleTree = flashClust(as.dist(1-A), method = "average")
# Convert traits to a color representation where red indicates high values
traitColors = data.frame(numbers2colors(datTraits,signed=FALSE))
dimnames(traitColors)[[2]] = paste(names(datTraits))
datColors = data.frame(outlier = outlierColor,traitColors)

plotDendroAndColors(sampleTree,groupLabels=names(datColors),
                    colors=datColors,main="Sample Dendrogram and Trait Heatmap")

Day “0” outliers have been identified. You could exclude these samples with the code below, but this scientists had a biological reason to NOT exclude these samples. It’s up to you. Justify whatever decision you make.

# Remove outlying samples 
#remove.samples = Z.k<thresholdZ.k | is.na(Z.k)
#datExprOut = datExpr[!remove.samples,]
#datTraitsOut = datTraits[!remove.samples,]
#save(datExprOut, datTraitsOut, file="SamplesAndTraits_OutliersRemoved.RData")

Choose a soft threshold power- USE A SUPERCOMPUTER IRL

powers = c(c(1:10), seq(from =10, to=30, by=1)) #choosing a set of soft-thresholding powers
sft = pickSoftThreshold(datExpr, powerVector=powers, verbose =5, networkType="signed") #call network topology analysis function
## pickSoftThreshold: will use block size 1000.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 1000 of 1000
## Warning: executing %dopar% sequentially: no parallel backend registered
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1    0.604  2.520        0.85600  524.00   543.000  609.0
## 2      2    0.177 -0.985       -0.00776  317.00   322.000  448.0
## 3      3    0.434 -1.180        0.28100  210.00   203.000  354.0
## 4      4    0.562 -1.190        0.55000  147.00   133.000  291.0
## 5      5    0.610 -1.160        0.68600  107.00    90.900  245.0
## 6      6    0.634 -1.180        0.77700   81.00    66.500  209.0
## 7      7    0.674 -1.180        0.81700   62.60    49.900  180.0
## 8      8    0.709 -1.210        0.85200   49.30    38.600  157.0
## 9      9    0.729 -1.240        0.87600   39.50    29.600  138.0
## 10    10    0.756 -1.240        0.90300   32.10    23.000  122.0
## 11    10    0.756 -1.240        0.90300   32.10    23.000  122.0
## 12    11    0.766 -1.290        0.90500   26.40    18.200  108.0
## 13    12    0.791 -1.300        0.92200   21.90    14.500   96.5
## 14    13    0.792 -1.350        0.91400   18.30    11.500   86.5
## 15    14    0.811 -1.360        0.92600   15.50     9.210   77.8
## 16    15    0.829 -1.380        0.93600   13.20     7.520   70.3
## 17    16    0.861 -1.350        0.95100   11.30     6.250   63.6
## 18    17    0.874 -1.370        0.95900    9.71     5.060   57.8
## 19    18    0.880 -1.370        0.95600    8.42     4.140   52.6
## 20    19    0.897 -1.360        0.96800    7.33     3.400   48.0
## 21    20    0.903 -1.350        0.96700    6.42     2.830   43.9
## 22    21    0.922 -1.350        0.97700    5.64     2.390   40.3
## 23    22    0.936 -1.330        0.98500    4.98     1.980   37.0
## 24    23    0.944 -1.340        0.98900    4.41     1.670   34.0
## 25    24    0.953 -1.320        0.99500    3.92     1.390   31.3
## 26    25    0.956 -1.310        0.99300    3.50     1.140   28.9
## 27    26    0.958 -1.300        0.98900    3.14     0.953   26.7
## 28    27    0.964 -1.280        0.99300    2.82     0.807   24.8
## 29    28    0.970 -1.270        0.99400    2.54     0.690   22.9
## 30    29    0.974 -1.260        0.99400    2.29     0.584   21.3
## 31    30    0.976 -1.250        0.98900    2.08     0.501   19.8
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab= "Soft Threshold (power)", ylab="Scale Free Topology Model Fit, signed R^2", type= "n", main= paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers, col="red")
abline(h=0.90, col="red")

plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab= "Soft Threshold (power)", ylab="Mean Connectivity", type="n", main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, col="red")

From this plot, we would choose a power of 18 becuase it’s the lowest power for which the scale free topology index reaches 0.90

enableWGCNAThreads()
## Allowing parallel execution with up to 2 working processes.
softPower = 18
adjacency = adjacency(datExpr, power = softPower, type = "signed") #specify network type

Construct Networks- USE A SUPERCOMPUTER IRL

#translate the adjacency into topological overlap matrix and calculate the corresponding dissimilarity:
TOM = TOMsimilarity(adjacency, TOMType="signed") # specify network type
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
dissTOM = 1-TOM

Generate Modules

# Generate a clustered gene tree
geneTree = flashClust(as.dist(dissTOM), method="average")

plot(geneTree, xlab="", sub="", main= "Gene Clustering on TOM-based dissimilarity", labels= FALSE, hang=0.04)

#This sets the minimum number of genes to cluster into a module
minModuleSize = 30 
dynamicMods = cutreeDynamic(dendro= geneTree, distM= dissTOM, deepSplit=2, pamRespectsDendro= FALSE, minClusterSize = minModuleSize)
##  ..cutHeight not given, setting it to 0.997  ===>  99% of the (truncated) height range in dendro.
##  ..done.
dynamicColors= labels2colors(dynamicMods)
MEList= moduleEigengenes(datExpr, colors= dynamicColors,softPower = 18)
MEs= MEList$eigengenes
MEDiss= 1-cor(MEs)
METree= flashClust(as.dist(MEDiss), method= "average")

save(dynamicMods, MEList, MEs, MEDiss, METree, file= "Network_allSamples_signed_RLDfiltered.RData")

#plots tree showing how the eigengenes cluster together
plot(METree, main= "Clustering of module eigengenes", xlab= "", sub= "")

#set a threhold for merging modules. In this example we are not merging so MEDissThres=0.0
MEDissThres = 0.0
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight= MEDissThres, verbose =3)
##  mergeCloseModules: Merging modules whose distance is less than 0
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 9 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 9 module eigengenes in given set.
mergedColors = merge$colors
mergedMEs = merge$newMEs

#plot dendrogram with module colors below it
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels= FALSE, hang=0.03, addGuide= TRUE, guideHang=0.05)

moduleColors = mergedColors
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs

Correlate traits

#Define number of genes and samples
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

#Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

#Print correlation heatmap between modules and traits
textMatrix= paste(signif(moduleTraitCor, 2), "\n(", 
                    signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
par(mar= c(6, 8.5, 3, 3))
#display the corelation values with a heatmap plot
labeledHeatmap(Matrix= moduleTraitCor, 
            xLabels= names(datTraits), 
            yLabels= names(MEs), 
            ySymbols= names(MEs), 
            colorLabels= FALSE, 
            colors= blueWhiteRed(50), 
            textMatrix= textMatrix, 
            setStdMargins= FALSE, 
            cex.text= 0.5, 
            zlim= c(-1,1), 
            main= paste("Module-trait relationships"))