Page History

...

DESeq2 Input:

DESeq2 takes as input count data in several forms: a table form, with each column representing a biological replicate/biological condition. DESEQ2 can also read data directly from htseq results, so we can use the 6 files we generated using htseq as input for DESeq2. The count data must be raw counts of sequencing reads, not already normalized data.

Example:

C1_R1 C1_R2 C1_R3 C2_R1 C2_R2 C2_R2

FBgn0000003 0 0 0 0 0 0
FBgn0000008 92 161 76 70 140 88
FBgn0000014 5 1 0 0 4 0

DESeq2 (with the use of an additional packages called tximport and readr) can read data directly from kallisto abundance files. We will need to provide the location of the 6 abundance files, the sample names associated to each file and a Sample Table that gives the mapping between sample and condtion.

SampleName Condition

DESeq2 Scripts:

DESeq2 script to work with kallisto count output is provided below.

Code Block

title	DESEQ2 KALLISTO script

R

#load libraries
library(tximport)
library("DESeq2")


#Import a file called file_list with all the locations of the abundance.tsv files
#eg below:
#/stor/SCRATCH/sample1/abundances.tsv
#/stor/SCRATCH/sample2/abundances.tsv
#/stor/SCRATCH/sample3/abundances.tsv
#/stor/SCRATCH/sample4/abundances.tsv
files<-as.character(read.table("file_list", header=FALSE)$V1)
#Import a file called samples with the sample names corresponding to each file in the file_list
#eg below:
#sample1
#sample2
#sample3
#sample4
#look at the data structures
files

samples<-as.character(read.table("samples",header=FALSE)$V1)
names(files)<-samples
#look at the data structures
samples
files

#Import a file called sampletable which is a tab-delimited file that contains each samplename along with the condition
#eg below:
#samples        condition
#sample1        alc
#sample2        alc
#sample3        con
#sample4        con
sampleTable <-read.table("sampleTable",header=TRUE, row.names=1)
#look at the data structure
head(sampleTable)


#IMPORTANT: MAKE SURE THE SAMPLES AND FILE_LIST ARE IN THE SAME ORDER- SAMPLES SHOULD MATCH UP WITH FILES
samples==rownames(sampleTable) #should return TRUE for all


#Import a file called tx2gene.csv which a csv file that contains the transcript id to gene id mapping
#For Drosophila, this is located at: tx2gene.csv
tx2gene <- read.csv("tx2gene.csv")
#look at this data structure
#read in kallisto abundance files, summarizing by gene
txi <- tximport(files, type = "kallisto", tx2gene = tx2gene)
names(txi)


#make a deseq2 object from the kallisto summarized counts
ddsMatrix <- DESeqDataSetFromTximport(txi, sampleTable, ~condition)
ddsMatrix


#Optionally, if you want to save this count matrix as a file
write.csv(assay(ddsMatrix), file="genecounts.raw.csv", quote=FALSE)

#Optionally, if you want to save normalized counts matrix as a file
ddsMatrix<- estimateSizeFactors(ddsMatrix)
normalized_counts <- counts(ddsMatrix, normalized=TRUE)
write.tablecsv(normalized_counts, file="genecounts.normalized.csv", quote=FALSE)

#Optionally, if you want to do variance stabilizing transformation or regularized log transformation on this count matrix and then save as a file: This can become input to things like wgcna, pca
vsd<- vst(ddsMatrix, blind=TRUE)
write.csv(assay(vsd), file="genecounts.variancestabilized.csv", quote=FALSE)
 
#estimate size factors, dispersion, normalize and perform negative binomial test to compare across conditions
dds<-DESeq(ddsMatrix)
#collect results from the statistical testing, order by adjusted pvalue and write into an output file
res<-results(dds)
resOrdered <- res[order(res$padj),]
summary(res)
write.csv(resOrdered, "deseq2_kallisto_C1_vs_C2.csv")
 
#generate MA plot
pdf('MAPlot.pdf')
plotMA(dds,ylim=c(-2,2),main="DESeq2")
dev.off()



#save the R data and history
save.image(file="deseq2.kallisto.Rdata")
savehistory(file="deseq2.kallisto.Rhistory")

Code Block

title	Run the R script

R CMD BATCH deseq2.kallisto.R

2. DESeq2 script to work with Htseq count output

Code Block

title	DESEQ2 HTSEQ Script

library("DESeq2")
 
#GET HTSEQ COUNTS AND SET UP SAMPLE TABLE
directory<-(getwd())
samples <- c("C1_count1.gff", "C1_count2.gff", "C1_count3.gff", "C2_count4.gff", "C2_count5.gff", "C2_count6.gff")
conditions <- c("control", "control", "control", "treated","treated","treated")
sampleTable<-data.frame(sampleName=samples, fileName=samples, condition=conditions)
sampleTable


#BUILD A DESEQ2 OBJECT FROM THE HTSEQ COUNTS DATA
ddsHTSeq<-DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, directory=directory, design=~condition)
colData(ddsHTSeq)$condition<-factor(colData(ddsHTSeq)$condition, levels=c("control", "treated"))
 
 
#LOOK AT THE DESEQ2 OBJECT WE'VE CREATED BY READING IN HTSEQ COUNT FILES
ddsHTSeq

#Optionally, if you want to do variance stabilizing transformation or regularized log transformation on this count matrix and t$
vsd<- vst(ddsHTSeq)
write.csv(assay(vsd), file="genecounts.variancestabilized.csv")
 
#RUN THE STATISTICAL TEST IN ONE GO- NORMALIZATION, ESTIMATE DISPERSION/VARIANCE AND DO TEST FOR DIFFERENTIAL EXPRESSION 
dds<-DESeq(ddsHTSeq)
res<-results(dds)
res<-res[order(res$padj),]
mcols(res,use.names=TRUE)
summary(res)
 
#GENERATE MA PLOT
png('MAPlot_htseq.png')
plotMA(dds,ylim=c(-2,2),main="DESeq2") 
dev.off()
 
#WRITE RESULTS INTO FILE
write.csv(as.data.frame(res),file="deseq2_htseq_C1_vs_C2.csv")

Code Block

title	Run the R script

#IF YOU DIDNT WANT TO RUN THE SCRIPT INTERACTIVELY
R CMD BATCH deseq2.htseq.R

DESeq2 Output:

...

Space shortcuts

Page tree

Versions Compared

Old Version 73

New Version 74

Key