Idev session and getting data
Setting up
ssh <username>@lonestar.tacc.utexas.edu cds cp -r /corral-repl/utexas/BioITeam/short_rnaseq_course/ my_short_rnaseq_course cd my_short_rnaseq_course/
Step 1: Evaluate and manipulate raw data
Count and Fastqc
head data/Sample1_R1.fastq wc -l data/Sample1_R1.fastq grep -c '^@HWI' data/Sample1_R1.fastq module spider fastqc module load fastqc fastqc -h #DONT RUN ON HEAD NODE: fastqc data/Sample1_R1.fastq
Look at Fastqc results:
Ideal dataset: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/good_sequence_short_fastqc.html
Our dataset: http://web.corral.tacc.utexas.edu/BioITeam/rnaseq_course/fastqc_exercise/Sample1_R1_fastqc/fastqc_report.html
manipulate fastq files- quality trimming and filtering
module spider fastx module load fastx_toolkit #DONT RUN ON HEAD NODE: fastx_trimmer -i data/Sample1_R1.fastq -l 90 -Q 33 -o Sample1_R1.trimmed.fastq #DONT RUN ON HEAD NODE: fastq_quality_filter -q 20 -p 80 -i data/Sample1_R1.fastq -Q 33 -o Sample1_R1.filtered.fastq grep -c '^@HWI' results/Sample1_R1.trimmed.fastq grep -c '^@HWI' results/Sample1_R1.filtered.fastq
manipulate fastq files- adaptor trimming
fastx_clipper -h #DONT RUN ON HEAD NODE: /corral-repl/utexas/BioITeam/bin/cutadapt-1.3/bin/cutadapt -m 22 -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC Sample1_R1.fastq /corral-repl/utexas/BioITeam/bin/cutadapt-1.3/bin/cutadapt -m 22 -a TGATCGTCGGACTGTAGAACTCTGAACGTGTAGA Sample1_R1.fastq
Step 2: Map Raw Data to Reference, Assess results
Map with BWA
module spider bwa module load bwa/0.7.7 #DONT RUN ON HEAD NODE: bwa index -a bwtsw reference/genome.fa bwa mem #DONT RUN ON HEAD NODE: bwa mem reference/genome.fa data/GSM794483_C1_R1_1.fq data/GSM794483_C1_R1_2.fq > C1_R1.mem.sam
Map with Tophat
module spider tophat module load tophat/2.0.10 #DONT RUN ON HEAD NODE tophat -p 2 -G reference/genes.gtf -o C1_R1_thout reference/genome.fa data/GSM794483_C1_R1_1.fq data/GSM794483_C1_R1_2.fq
Assess Mapping Results
module load samtools #SYNTAX: samtools view -b -S samfile > bamfile samtools sort bamfile sortedbamfile samtools index sortedbamfile #BWA RESULTS samtools flagstat bwa_results/C1_R1.mem.bam samtools idxstats bwa_results/C1_R1.mem.bam #SAMTOOLS RESULTS samtools flagstat tophat_results/C1_R1_thout/accepted_hits.bam #How is tophat result different from bwa head -20 tophat_results/C1_R1_thout/accepted_hits.sam cut -f 6 tophat_results/C1_R1_thout/accepted_hits.sam|grep 'N'|head cut -f 6 bwa_results/C1_R1.mem.sam|grep 'N'|head #DONT RUN ON HEAD NODE cut -f 6 tophat_results/C1_R1_thout/accepted_hits.sam|grep 'N'|wc -l
Step 3 and Step 4 : Quantify transcripts and ID DEGs
Look at bedtools and DESeq results
cd deg_exercise/ module swap intel gcc/4.7.1 module load bedtools bedtools bedtools multicov #DONT RUN ON HEAD NODE bedtools multicov -bams C1_R1_thout/accepted_hits.bam C1_R2_thout/accepted_hits.bam C1_R3_thout/accepted_hits.bam C2_R1_thout/accepted_hits.bam C2_R2_thout/accepted_hits.bam C2_R3_thout/accepted_hits.bam -bed reference/genes.formatted.gtf > gene_counts.gff head gene_counts.gff