Hi,
I would you to revise my edgeR code since it's possible I missed something important because being quite new on this.
Thanks, Bernardo
P.S. This questions has also been posted in Biostar forum:
I would you to revise my edgeR code since it's possible I missed something important because being quite new on this.
Thanks, Bernardo
P.S. This questions has also been posted in Biostar forum:
Code:
############################################################ #htseq-count stats ############################################################ # rRNA and tRNA will be discarded in counts file because the arbitrary mapped reads to these regions # NOTE: minimun alignment quality is set to 10 # '-t CDS -i ID' will exclude rRNA and tRNA. Also 'Parent' will give the correct locus tag name to each 'feature' in count table. python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 14.sam HS372.gff > 14.counts python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 15.sam HS372.gff > 15.counts python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 16.sam HS372.gff > 16.counts #Last lines from .counts files #14.counts __no_feature 271363 __ambiguous 6 __too_low_aQual 137308 __not_aligned 133836 __alignment_not_unique 0 #15.counts __no_feature 346247 __ambiguous 3 __too_low_aQual 148014 __not_aligned 135920 __alignment_not_unique 0 #16.counts __no_feature 1067474 __ambiguous 0 __too_low_aQual 136484 __not_aligned 110488 __alignment_not_unique 0 ############################################################ #edgeR ############################################################ #R code library(edgeR) library(WriteXLS) dir () # The tag counts for the two individual libraries are stored in two separate plain text files. In each file, the tag IDs and counts for each tag are provided in a table targets <- read.delim("targets.txt", stringsAsFactors = FALSE) targets # files group description #1 14.counts biofilm biofilm F9 #2 15.counts planktonic planktonic F9 #3 16.counts stationary stationary F9 RG <- readDGE(targets) colnames(RG) <- c("biofilm","planktonic","stationary") RG dim(RG) #filter low expressed transcripts keep <- rowSums(cpm(RG)>1) > 1 #we keep genes that achieve at least one count per million (cpm) in at least TWO samples RG <- RG[keep,] dim(RG) RG$samples$lib.size <- colSums(RG$counts) # After filtering, it is a good idea to reset the library sizes: RG$samples #Normalization RG <- calcNormFactors(RG) # Apply TMM normalization RG$samples # se manual RG ############################################################ #Bio_vs_Plank ############################################################ bcv <- 0.2 # Assume a low BCV value of 0.2. The BCV (square root of the common dispersion) here is 20%, stilgthly higher than a typical size for a laboratory experiment with a cell line or a model organism. et <- exactTest(RG, pair=c("planktonic","biofilm"),dispersion=bcv^2) # exactTest. dispersion = 0.04 et class(et) top <- topTags(et) # Top ten differentially expressed tags with their p-values top class(top) cpm(RG)[rownames(top), ] # Check the individual cpm values for the top ten genes summary(de <- decideTestsDGE(et, p=0.05, adjust="BH")) # The total number of differentiallly expressed genes at FDR< 0.05 # Here the entries for -1, 0 and 1 are for down-regulated, non-differentially expressed and up-regulated tags respectively. #-1 54 #0 2153 #1 52 detags <- rownames(RG)[as.logical(de)] # detags contains the DE genes at 5% FDR head (detags) plotSmear(et, de.tags=detags, ylim=c(-7,7), xlim=c(0,15), cex.lab=1.4, cex.axis=1) # plot all genes and highlight DE genes at 5% FDR abline(h=c(-1, 1), col="blue") # The blue lines indicate 2-fold changes. title("Biofilm vs planktonic") dev.copy2pdf(file = "Figure_1.pdf") #Save as .pdf## # NOTE -> adjust 'n' depending on the available number of genes Bio_vs_Plank <- topTags(et, n=2259, adjust.method="BH", sort.by="logFC") # export excel file x <- Bio_vs_Plank$table WriteXLS("x","Bio_vs_Plank.xls", row.names = TRUE, col.names = TRUE) # export genes for TopGO. DEG_up_pvalue_0.05 x.df <- Bio_vs_Plank$table x.sub <- subset(x.df, logFC > 0 & PValue < 0.05) x.sub DEG_up_Pvalue_005 <- rownames(x.sub) write.table(DEG_up_Pvalue_005, "Bio_vs_Plank_DEG_up_pvalue_005.txt", quote = FALSE, row.names = FALSE, col.names = FALSE) # export genes for TopGO. DEG_down_pvalue_0.05 x.df <- Bio_vs_Plank$table x.sub <- subset(x.df, logFC < 0 & PValue < 0.05) x.sub DEG_down_Pvalue_005 <- rownames(x.sub) write.table(DEG_down_Pvalue_005, "Bio_vs_Plank_DEG_down_pvalue_005.txt", quote = FALSE, row.names = FALSE, col.names = FALSE) # export reference gene set reference_set <- rownames(RG$counts) write.table(reference_set, "reference_set.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)
Comment