# Only run the following commands once to install WGCNA and flashClust on your computer
# source("http://bioconductor.org/biocLite.R")
# biocLite("WGCNA")
# install.packages("flashClust") 

# Load WGCNA and flashClust libraries every time you open R
library(WGCNA)

## Loading required package: dynamicTreeCut

## Loading required package: fastcluster

## 
## Attaching package: 'fastcluster'

## The following object is masked from 'package:stats':
## 
##     hclust

## ==========================================================================
## *
## *  Package WGCNA 1.42 loaded.
## *
## *    Important note: It appears that your system supports multi-threading,
## *    but it is not enabled within WGCNA in R. 
## *    To allow multi-threading within WGCNA with all available cores, use 
## *
## *          allowWGCNAThreads()
## *
## *    within R. Use disableWGCNAThreads() to disable threading if necessary.
## *    Alternatively, set the following environment variable on your system:
## *
## *          ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## *    for example 
## *
## *          ALLOW_WGCNA_THREADS=2
## *
## *    To set the environment variable in linux bash shell, type 
## *
## *           export ALLOW_WGCNA_THREADS=2
## *
## *     before running R. Other operating systems or shells will
## *     have a similar command to achieve the same aim.
## *
## ==========================================================================

## 
## Attaching package: 'WGCNA'

## The following object is masked from 'package:stats':
## 
##     cor

library(flashClust)

## 
## Attaching package: 'flashClust'

## The following object is masked from 'package:fastcluster':
## 
##     hclust

## The following object is masked from 'package:stats':
## 
##     hclust

#Set your current working directory (where all your files are)
setwd("~/Desktop/WGCNA/") # Change the text within quotes as necessary. I have downloaded and unzipped the WGCNA folder on my Desktop

Uploading data into R and formatting it for WGCNA

# This creates an object called "datExpr" that contains the normalized counts file output from DESeq2
datExpr = read.csv("SampleTimeSeriesRLD.csv")
# "head" the file to preview it
head(datExpr) # You see that genes are listed in a column named "X" and samples are in columns

##               X        A0       A1       A2       A4       A5       A6
## 1  isogroup2041 3.2279882 4.425669 4.053493 4.081084 3.842665 4.244835
## 2 isogroup27249 0.9261955 2.031387 2.085966 2.463860 1.823165 2.743842
## 3 isogroup26716 1.0776734 2.016798 2.798007 2.014945 1.936990 2.689329
## 4 isogroup21525 2.3277012 3.961517 3.532261 3.515290 3.860423 3.651837
## 5 isogroup17063 1.7921210 2.919299 2.883117 2.719757 3.584092 2.972269
## 6 isogroup23098 7.5494339 8.605921 8.714458 8.285594 8.359864 8.287737
##         A7       A8       A9      A10      A11      A12       B0       B1
## 1 3.676771 3.941789 4.471518 3.701433 3.721957 4.228176 2.711784 4.438969
## 2 2.999606 2.259128 2.773004 3.285110 2.589221 2.343527 1.065831 2.102840
## 3 1.775270 3.166939 3.205823 1.556635 2.692233 2.704825 1.222318 2.552086
## 4 3.481384 3.095274 2.596508 3.672719 3.652501 3.434640 2.554495 4.371392
## 5 3.577360 3.559317 3.596309 3.693239 2.953543 3.217554 1.629696 3.271179
## 6 7.949861 7.872196 8.282509 7.873440 7.813140 8.006631 7.379813 8.614006
##         B2       B3       B4       B5       B6       B7       B8       B9
## 1 4.192873 4.283763 4.145406 3.646702 3.818861 4.133641 3.990479 3.834212
## 2 2.321835 1.615376 2.227152 2.695438 2.382110 3.172436 2.175775 2.881731
## 3 2.710030 2.628911 2.067072 2.569413 3.135256 2.535408 3.358315 2.747720
## 4 3.624168 3.266043 3.382221 3.970439 3.827180 3.380975 3.037175 4.128778
## 5 3.619758 2.629647 3.387749 3.052602 3.060756 3.423325 2.888494 3.587651
## 6 8.731997 8.634263 8.368881 8.351090 8.118341 8.291326 8.059225 7.997958
##        B10      B12       C0       C1       C2       C3       C4       C5
## 1 4.287655 4.391409 3.704225 4.429310 4.518705 4.174755 4.121667 3.712698
## 2 3.297518 2.558725 1.478850 2.417905 2.299758 2.778593 2.016950 2.564248
## 3 3.269745 2.959743 1.225789 2.179640 2.290551 2.608619 2.346118 3.086956
## 4 2.924908 3.679077 1.722992 4.011196 3.393334 3.560959 2.707955 3.156925
## 5 3.620145 3.119855 1.982470 3.029753 3.156511 2.659158 3.411735 3.325926
## 6 7.920953 7.821971 7.555304 8.748981 8.765364 8.474679 8.197553 8.329218
##         C6       C7       C8       C9      C10      C12       D0       D1
## 1 4.121563 4.662792 4.485772 3.995811 4.016984 3.710906 3.438738 4.778724
## 2 1.044283 2.287411 1.852964 2.560540 1.617758 2.791151 1.104202 1.238932
## 3 2.681671 1.802319 2.490925 2.978003 3.083341 3.481888 1.262211 1.563040
## 4 3.262889 3.706466 2.689109 2.848256 3.269979 3.249449 1.762327 4.412169
## 5 3.411635 3.851133 3.593428 3.475429 3.688129 3.186268 1.672265 3.322115
## 6 8.320269 8.305366 7.958708 7.979192 8.269686 7.917266 7.541028 8.437456
##         D2       D3       D4       D5       D6       D7       D8       D9
## 1 4.372119 4.471382 3.791108 4.053877 4.127585 4.450004 3.948535 4.066282
## 2 2.053670 2.188338 1.762516 2.522993 2.235528 2.612178 2.309813 1.709138
## 3 2.551285 2.718058 2.634042 2.831299 1.894300 2.381463 2.731666 3.226271
## 4 3.148012 3.064167 3.459427 2.946796 3.268502 2.743258 3.474978 3.408191
## 5 3.382879 2.623312 2.661818 3.464174 2.646809 3.785781 3.359743 3.187177
## 6 8.792712 8.708494 8.456304 8.376324 8.253110 8.260721 7.926216 8.015008
##        D10      D11
## 1 4.083817 4.521707
## 2 2.312437 1.839739
## 3 2.215026 3.051648
## 4 2.957649 3.116199
## 5 3.768126 3.520356
## 6 8.061845 8.100753

# Manipulate file so it matches the format WGCNA needs 
row.names(datExpr) = datExpr$X
datExpr$X = NULL
datExpr = as.data.frame(t(datExpr)) # now samples are rows and genes are columns
dim(datExpr) # 48 samples and 1000 genes (you will have many more genes in reality)

## [1]   48 1000

# Run this to check if there are gene outliers
gsg = goodSamplesGenes(datExpr, verbose = 3)

##  Flagging genes and samples with too many missing values...
##   ..step 1

gsg$allOK

## [1] TRUE

#If the last statement returns TRUE, all genes have passed the cuts. If not, we remove the offending genes and samples from the data with the following:

#if (!gsg$allOK)
#  {if (sum(!gsg$goodGenes)>0)
#       printFlush(paste("Removing genes:", paste(names(datExpr)[!gsg$goodGenes], collapse= ", ")));
#       if (sum(!gsg$goodSamples)>0)
#           printFlush(paste("Removing samples:", paste(rownames(datExpr)[!gsg$goodSamples], collapse=", ")))
#       datExpr= datExpr[gsg$goodSamples, gsg$goodGenes]
#       }

#Create an object called "datTraits" that contains your trait data
datTraits = read.csv("Traits_23May2015.csv")
head(datTraits)

##   Sample Day0 Day1 Day2 Day3 Day4 Day5 Day6 Day7 Day8 Day9 Day10 Day11
## 1     A0    1    0    0    0    0    0    0    0    0    0     0     0
## 2     A1    0    1    0    0    0    0    0    0    0    0     0     0
## 3     A2    0    0    1    0    0    0    0    0    0    0     0     0
## 4     A4    0    0    0    0    1    0    0    0    0    0     0     0
## 5     A5    0    0    0    0    0    1    0    0    0    0     0     0
## 6     A6    0    0    0    0    0    0    1    0    0    0     0     0
##   Day12 HPF RedFluoro GreenFluoro
## 1     0   3   0.00000      0.0000
## 2     0  22   0.00000      0.0000
## 3     0  46   0.00000      0.0000
## 4     0  56  54.13165     19.4236
## 5     0  80  75.66280     17.8254
## 6     0 104  92.42900     56.9575

#form a data frame analogous to expression data that will hold the clinical traits.
rownames(datTraits) = datTraits$Sample
datTraits$Sample = NULL
table(rownames(datTraits)==rownames(datExpr)) #should return TRUE if datasets align correctly, otherwise your names are out of order

## 
## TRUE 
##   48

You have finished uploading and formatting expression and trait data

Expression data is in datExpr, corresponding traits are datTraits

Cluster samples by expression

A = adjacency(t(datExpr),type="signed") # this calculates the whole network connectivity
k = as.numeric(apply(A,2,sum))-1 # standardized connectivity
Z.k = scale(k)
thresholdZ.k = -2.5 # often -2.5
outlierColor = ifelse(Z.k<thresholdZ.k,"red","black")
sampleTree = flashClust(as.dist(1-A), method = "average")
# Convert traits to a color representation where red indicates high values
traitColors = data.frame(numbers2colors(datTraits,signed=FALSE))
dimnames(traitColors)[[2]] = paste(names(datTraits))
datColors = data.frame(outlier = outlierColor,traitColors)

plotDendroAndColors(sampleTree,groupLabels=names(datColors),
                    colors=datColors,main="Sample Dendrogram and Trait Heatmap")

Day “0” outliers have been identified. You could exclude these samples with the code below, but this scientists had a biological reason to NOT exclude these samples. It’s up to you. Justify whatever decision you make.

# Remove outlying samples 
#remove.samples = Z.k<thresholdZ.k | is.na(Z.k)
#datExprOut = datExpr[!remove.samples,]
#datTraitsOut = datTraits[!remove.samples,]
#save(datExprOut, datTraitsOut, file="SamplesAndTraits_OutliersRemoved.RData")

Choose a soft threshold power- USE A SUPERCOMPUTER IRL

powers = c(c(1:10), seq(from =10, to=30, by=1)) #choosing a set of soft-thresholding powers
sft = pickSoftThreshold(datExpr, powerVector=powers, verbose =5, networkType="signed") #call network topology analysis function

## pickSoftThreshold: will use block size 1000.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 1000 of 1000

## Warning: executing %dopar% sequentially: no parallel backend registered

##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1    0.604  2.520        0.85600  524.00   543.000  609.0
## 2      2    0.177 -0.985       -0.00776  317.00   322.000  448.0
## 3      3    0.434 -1.180        0.28100  210.00   203.000  354.0
## 4      4    0.562 -1.190        0.55000  147.00   133.000  291.0
## 5      5    0.610 -1.160        0.68600  107.00    90.900  245.0
## 6      6    0.634 -1.180        0.77700   81.00    66.500  209.0
## 7      7    0.674 -1.180        0.81700   62.60    49.900  180.0
## 8      8    0.709 -1.210        0.85200   49.30    38.600  157.0
## 9      9    0.729 -1.240        0.87600   39.50    29.600  138.0
## 10    10    0.756 -1.240        0.90300   32.10    23.000  122.0
## 11    10    0.756 -1.240        0.90300   32.10    23.000  122.0
## 12    11    0.766 -1.290        0.90500   26.40    18.200  108.0
## 13    12    0.791 -1.300        0.92200   21.90    14.500   96.5
## 14    13    0.792 -1.350        0.91400   18.30    11.500   86.5
## 15    14    0.811 -1.360        0.92600   15.50     9.210   77.8
## 16    15    0.829 -1.380        0.93600   13.20     7.520   70.3
## 17    16    0.861 -1.350        0.95100   11.30     6.250   63.6
## 18    17    0.874 -1.370        0.95900    9.71     5.060   57.8
## 19    18    0.880 -1.370        0.95600    8.42     4.140   52.6
## 20    19    0.897 -1.360        0.96800    7.33     3.400   48.0
## 21    20    0.903 -1.350        0.96700    6.42     2.830   43.9
## 22    21    0.922 -1.350        0.97700    5.64     2.390   40.3
## 23    22    0.936 -1.330        0.98500    4.98     1.980   37.0
## 24    23    0.944 -1.340        0.98900    4.41     1.670   34.0
## 25    24    0.953 -1.320        0.99500    3.92     1.390   31.3
## 26    25    0.956 -1.310        0.99300    3.50     1.140   28.9
## 27    26    0.958 -1.300        0.98900    3.14     0.953   26.7
## 28    27    0.964 -1.280        0.99300    2.82     0.807   24.8
## 29    28    0.970 -1.270        0.99400    2.54     0.690   22.9
## 30    29    0.974 -1.260        0.99400    2.29     0.584   21.3
## 31    30    0.976 -1.250        0.98900    2.08     0.501   19.8

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab= "Soft Threshold (power)", ylab="Scale Free Topology Model Fit, signed R^2", type= "n", main= paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers, col="red")
abline(h=0.90, col="red")

plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab= "Soft Threshold (power)", ylab="Mean Connectivity", type="n", main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, col="red")

From this plot, we would choose a power of 18 becuase it’s the lowest power for which the scale free topology index reaches 0.90

enableWGCNAThreads()

## Allowing parallel execution with up to 2 working processes.

softPower = 18
adjacency = adjacency(datExpr, power = softPower, type = "signed") #specify network type

Construct Networks- USE A SUPERCOMPUTER IRL

#translate the adjacency into topological overlap matrix and calculate the corresponding dissimilarity:
TOM = TOMsimilarity(adjacency, TOMType="signed") # specify network type

## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.

dissTOM = 1-TOM

Generate Modules

# Generate a clustered gene tree
geneTree = flashClust(as.dist(dissTOM), method="average")

plot(geneTree, xlab="", sub="", main= "Gene Clustering on TOM-based dissimilarity", labels= FALSE, hang=0.04)

#This sets the minimum number of genes to cluster into a module
minModuleSize = 30 
dynamicMods = cutreeDynamic(dendro= geneTree, distM= dissTOM, deepSplit=2, pamRespectsDendro= FALSE, minClusterSize = minModuleSize)

##  ..cutHeight not given, setting it to 0.997  ===>  99% of the (truncated) height range in dendro.
##  ..done.

dynamicColors= labels2colors(dynamicMods)
MEList= moduleEigengenes(datExpr, colors= dynamicColors,softPower = 18)
MEs= MEList$eigengenes
MEDiss= 1-cor(MEs)
METree= flashClust(as.dist(MEDiss), method= "average")

save(dynamicMods, MEList, MEs, MEDiss, METree, file= "Network_allSamples_signed_RLDfiltered.RData")

#plots tree showing how the eigengenes cluster together
plot(METree, main= "Clustering of module eigengenes", xlab= "", sub= "")

#set a threhold for merging modules. In this example we are not merging so MEDissThres=0.0
MEDissThres = 0.0
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight= MEDissThres, verbose =3)

##  mergeCloseModules: Merging modules whose distance is less than 0
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 9 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 9 module eigengenes in given set.

mergedColors = merge$colors
mergedMEs = merge$newMEs

#plot dendrogram with module colors below it
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels= FALSE, hang=0.03, addGuide= TRUE, guideHang=0.05)

moduleColors = mergedColors
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs

Correlate traits

#Define number of genes and samples
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

#Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use= "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

#Print correlation heatmap between modules and traits
textMatrix= paste(signif(moduleTraitCor, 2), "\n(", 
                    signif(moduleTraitPvalue, 1), ")", sep= "")
dim(textMatrix)= dim(moduleTraitCor)
par(mar= c(6, 8.5, 3, 3))
#display the corelation values with a heatmap plot
labeledHeatmap(Matrix= moduleTraitCor, 
            xLabels= names(datTraits), 
            yLabels= names(MEs), 
            ySymbols= names(MEs), 
            colorLabels= FALSE, 
            colors= blueWhiteRed(50), 
            textMatrix= textMatrix, 
            setStdMargins= FALSE, 
            cex.text= 0.5, 
            zlim= c(-1,1), 
            main= paste("Module-trait relationships"))

WGCNA Tutorial

Rachel Wright

May 25, 2016

Uploading data into R and formatting it for WGCNA

You have finished uploading and formatting expression and trait data

Expression data is in datExpr, corresponding traits are datTraits

Cluster samples by expression

Day “0” outliers have been identified. You could exclude these samples with the code below, but this scientists had a biological reason to NOT exclude these samples. It’s up to you. Justify whatever decision you make.

Choose a soft threshold power- USE A SUPERCOMPUTER IRL

From this plot, we would choose a power of 18 becuase it’s the lowest power for which the scale free topology index reaches 0.90

Construct Networks- USE A SUPERCOMPUTER IRL

Generate Modules

Correlate traits