I'm writing a mitochondrial haplotyping script in R, and one of the things I needed to do was left-normalisation of INDELs. Because of the way the variants were originally defined, I'm using the notation <ref><pos><alt> for variants (e.g. ACCCCCTCTA8280A to describe a 9bp deletion at location 8280). Here's my script:
Here's a little test, using the example from here:
Note that the code uses vectorisable functions, so should work equally well on single values and vectors. A bit more tweaking is needed to get it to do the right thing on matrices.
Code:
## left-normalise INDELs ## derived from pseudocode from http://genome.sph.umich.edu/wiki/Variant_Normalization leftNorm <- function(ref.seq, ref, pos = NULL, alt = NULL){ if(any(grepl("[0-9]",ref))){ leftNorm(ref.seq=ref.seq, ref=sub("^(.*?)([0-9]+)(.*)$","\\1",ref), pos=as.numeric(sub("^(.*?)([0-9]+)(.*)$","\\2",ref)), alt=sub("^(.*?)([0-9]+)(.*)$","\\3",ref)); } else { equalRights <- (substring(ref,nchar(ref)) == substring(alt,nchar(alt))); while(sum(equalRights) > 0){ substring(ref,nchar(ref))[equalRights] <- ""; substring(alt,nchar(alt))[equalRights] <- ""; emptyAlleles <- ((nchar(ref) == 0) | (nchar(alt) == 0)); pos[emptyAlleles] <- pos[emptyAlleles] - 1; ref[emptyAlleles] <- paste0(substring(as.character(ref.seq),pos[emptyAlleles],pos[emptyAlleles]), ref[emptyAlleles]); alt[emptyAlleles] <- paste0(substring(as.character(ref.seq),pos[emptyAlleles],pos[emptyAlleles]), alt[emptyAlleles]); equalRights <- (substring(ref,nchar(ref)) == substring(alt,nchar(alt))); } leftChangeable <- (substring(ref,1,1) == substring(alt,1,1)) & (nchar(ref) >= 2) & (nchar(alt) >= 2); while(sum(leftChangeable) > 0){ substring(ref[leftChangeable],1,1) <- ""; substring(alt[leftChangeable],1,1) <- ""; pos[leftChangeable] <- pos[leftChangeable] + 1; leftChangeable <- (substring(ref,1,1) == substring(alt,1,1)) & (nchar(ref) >= 2) & (nchar(alt) >= 2); } paste0(ref,pos,alt); } }
Code:
> all(leftNorm("GGGCACACACAGGG",c("CA8","CAC6C","GCACA3GCA","GGCA2GG","GCA3G")) == "GCA3G"); [1] TRUE