Code:
# Extract the loop region of human miRNA hairpins from the miRBase 20 GFF3
# annotation and write the loop sequences (hg19) to "mirbase20loops.fa".
#
# The loop of a hairpin is taken to be the stretch between its annotated
# -5p and -3p mature arms; hairpins lacking either arm are skipped.
# Side effects: downloads the GFF3 file, draws a histogram of loop widths,
# writes a FASTA file into the working directory and prints its content.

# library() stops immediately if a package is missing; require() would only
# warn and return FALSE, letting the script fail later with an obscure error.
library(rtracklayer)
library(BSgenome.Hsapiens.UCSC.hg19)
library(ShortRead)

mirTrack = import('ftp://mirbase.org/pub/mirbase/20/genomes/hsa.gff3')

# split into hairpin, 5arm and 3arm
hp = mirTrack[mirTrack$type != "miRNA"]
arm5 = mirTrack[grepl("-5p", mirTrack$Name)]
arm3 = mirTrack[grepl("-3p", mirTrack$Name)]

# make key for matching hairpin with arm5 and arm3
# (index of the first arm whose Derives_from equals the hairpin ID, else NA)
hp$arm5 = match(hp$ID, arm5$Derives_from)
hp$arm3 = match(hp$ID, arm3$Derives_from)

# discard hairpins without annotated 5p and 3p arms
hp = hp[!is.na(hp$arm3) & !is.na(hp$arm5)]

# prepare
hp$loopStart = 0
hp$loopEnd = 0

# miRNAs on pos and neg strand have to be parsed separately
## posStrand: the loop runs from the end of the 5p arm to the start of the 3p arm
isPos = as.vector(strand(hp) == "+")
hp$loopStart[isPos] = end(arm5[hp$arm5[isPos]])
hp$loopEnd[isPos] = start(arm3[hp$arm3[isPos]])

## negStrand: genomic order is reversed, so the 3p arm lies at the lower
## coordinates and the loop runs from the end of 3p to the start of 5p
hp$loopEnd[!isPos] = start(arm5[hp$arm5[!isPos]])
hp$loopStart[!isPos] = end(arm3[hp$arm3[!isPos]])

# The coordinates above still include one nt of each bounding arm, so a loop
# needs a width of at least 3 to keep any sequence once those are trimmed.
# Narrower (adjacent or overlapping arms) would make IRanges() / the trimming
# step fail or yield an empty FASTA record, so drop them explicitly.
loopWidth = hp$loopEnd - hp$loopStart + 1
tooShort = loopWidth < 3
if (any(tooShort)) {
  warning(sum(tooShort), " hairpin(s) dropped: no loop sequence between the 5p and 3p arms")
  hp = hp[!tooShort]
}

# GRanges for miRNA loops
loops = GRanges(seqnames = seqnames(hp),
                IRanges(hp$loopStart, hp$loopEnd),
                strand = strand(hp),
                MI = hp$ID,
                Name = hp$Name)

# sanity check
hist(width(loops), breaks = 50)

# we don't want the first/last nt of the bounding mature miRs:
# subtracting 1 narrows every range by one position on each side
loops = loops - 1
loops$seq = getSeq(Hsapiens, loops)

# export loops
loopsFasta = loops$seq
names(loopsFasta) = paste("loop", loops$MI, loops$Name, sep = "_")
writeFasta(loopsFasta, file = "mirbase20loops.fa")

# print the written file for a quick visual check
readLines("mirbase20loops.fa")
Leave a comment: