#!/usr/bin/perl -w
#
# Converts bismark mapping output to SAM format. Currently functions for single-end mapping
# Usage: bismark_to_SAM.pl -c [chrom sizes file] -i [bismark mapping output] -o [SAM output]

use strict;
use warnings;
use Getopt::Std;
use Carp;

### Program variables ###

my $chrom_sizes_file; # Contains chromosome name used in mapping (column 1) and its length (column 2)
my $bismark_infile;
my $sam_outfile;

### Main program ###

parse_cmd_line();
generate_SAM_header();
convert_bismark_to_SAM();

### Subroutines ###

sub parse_cmd_line{
  my %opts;
  getopts('c:i:o:', \%opts);
  $chrom_sizes_file = $opts{'c'};
  warn "no chrom_sizes file indicated\n" unless defined $chrom_sizes_file;
  $bismark_infile = $opts{'i'};
  warn "no input file specified\n" unless defined $bismark_infile;
  if(!defined $chrom_sizes_file || !defined $bismark_infile){
    die usage();
  }
  $sam_outfile = $opts{'o'};
  $sam_outfile = $bismark_infile . ".sam" unless defined $sam_outfile;
}
  
sub usage{
  print <<EOF

Usage:
   bismark_to_SAM.pl -c [chrom sizes file] -i [bismark mapping output] -o [SAM output]

  -c [chrom sizes file]        -  file containing length of chromosomes/sequences used for Bowtie mapping
  -i [bismark mapping output]  -  file containing bismark mapping output
  -o [SAM output]              -  name for output file in SAM format (default: [input].sam)

EOF
    ;
  exit 1;
}

sub generate_SAM_header{
  open my $ifh, "<", $chrom_sizes_file or die "Cannot open chrom_sizes file ($chrom_sizes_file). $!";
  open my $ofh, ">", $sam_outfile or die "Cannot open output file ($sam_outfile). $!";
  print {$ofh} "\@" . "HD\tVN:1.0\tSO:unsorted\n";          # @HD = header, VN = version, SO = sort order [technically sorted by chromosome and start coordinate]
  while(my $line = <$ifh>){
    chomp $line;
    my ($chr, $length, @rest) = split "\t", $line;
    die "Chromosome length is not numeric in file ($chrom_sizes_file) on line $.\n" if $length =~ /\D/;
    print {$ofh} "\@" . "SQ\tSN:$chr\tLN:$length\n";        # @SQ = sequence, SN = seq name, LN = length
  }
}

sub convert_bismark_to_SAM{
  open my $ifh, "<", $bismark_infile or die "Cannot open input file ($bismark_infile). $!";
  open my $ofh, "|-", "sort -k3,3 -k4,4n >>$sam_outfile" or die "Cannot sort or append to output file ($sam_outfile). $!";
  while(my $line = <$ifh>){
    next if $line =~ /^Bismark version/;
    chomp $line;
    my ($id, $strand, $chr, $start, $stop, $actual_seq, $ref_seq, @rest) = split "\t", $line;

    ### Input validations ###
    die "Strand information ($strand) is incorrect in file ($bismark_infile) on line $.\n" if $strand =~ /[^\+\-]/;
    die "Start position ($start) is not numeric in file ($bismark_infile) on line $.\n" if $start =~ /\D/;
    die "End position ($stop) is not numeric in file ($bismark_infile) on line $.\n" if $stop =~ /\D/;

    # Assumes bisulfite sequences has A,C,T,G and N only
    die "Bisulfite sequence ($actual_seq) contains invalid nucleotides in file ($bismark_infile) on line $.\n" if $actual_seq =~ /[^ACTGNactgn]/;
    
    # Allows all degenerate nucleotide sequences in reference genome
    die "Reference sequence ($ref_seq) contains invalid nucleotides in file ($bismark_infile) on line $.\n" if $ref_seq =~ /[^ACTGNRYMKSWBDHVactgnrymkswbdhv]/;

    my $flag;                                        # FLAG variable used for SAM format.
    ($strand eq "+") ? ($flag = 0) : ($flag = 16);   # Currently, this only handles single-end mapping, with 0 for "+" strand and 16 for "-" strand
#    $start ++;                                      # Uncomment if using 0-based mapping
    my $mapq = 255;                                  # Assume mapping quality is unavailable
    my $cigar = length($actual_seq) . "M";           # Assume no indel during mapping (only matches and mismatches)
    my $mrnm = "*";                                  # Paired-end variable
    my $mpos = 0;                                    # Paired-end variable
    my $isize = 0;                                   # Paired-end variable
    $actual_seq = revcomp($actual_seq) if $strand eq "-"; # Sequence represented on the forward genomic strand 
    $ref_seq = substr($ref_seq, 0, length($ref_seq) - 1); # Removes additional nucleotide at the end
    $ref_seq = revcomp($ref_seq) if $strand eq "-";       # Required for comparison with actual sequence
    my $qual = "I" x length($ref_seq);               # Assume highest quality, since quality information is unavailable
    my $hemming_dist = hemming_dist($actual_seq, $ref_seq);
    my $NM_tag = "NM:i:$hemming_dist";               # Optional tag: edit distance based on nucleotide differences 
    my $MD_tag = make_mismatch_string($actual_seq, $ref_seq);  # Optional tag: String to provide mismatch and indel information. Expect only mismatches

# SAM format: QNAME, FLAG, RNAME, 1-based START, MAPQ, CIGAR, MRNM, MPOS, ISIZE, SEQ, QUAL
    print {$ofh} join("\t", ($id, $flag, $chr, $start, $mapq, $cigar, $mrnm, $mpos, $isize, $actual_seq, $qual, $NM_tag, $MD_tag)), "\n";
  }
}

sub revcomp{
  my $seq = shift or croak "Missing seq to reverse complement";
  $seq = reverse $seq;
  $seq =~ tr/ACTGactg/TGACtgac/;
  return $seq;
}

sub hemming_dist{
  my $string1 = shift or croak "Missing string 1";
  my $string2 = shift or croak "Missing string 2";
  return my $hd = length($string1) - (($string1 ^ $string2) =~ tr/[\0]/[\0]/);
}

sub make_mismatch_string{
  my $actual_seq = shift or croak "Missing actual sequence";
  my $ref_seq = shift or croak "Missing reference sequence";
  my $MD_tag = "MD:Z:";
  my $tmp = ($actual_seq ^ $ref_seq);              # Bitwise comparison
  my $prev_mm_pos = 0;
  while($tmp =~ /[^\0]/g){                         # Where bitwise comparison showed a difference
    my $nuc_match = pos($tmp) - $prev_mm_pos - 1;  # Generate number of nucleotide that matches since last mismatch
    my $nuc_mm = substr($actual_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($actual_seq);  # Obtain nucleotide that was different from reference
    $MD_tag .= "$nuc_match" if $nuc_match > 0;     # Ignore if mismatches are adjacent to each other
    $MD_tag .= "$nuc_mm" if defined $nuc_mm;       # Ignore if there is no mismatch (prevents uninitialized string concatenation)
    $prev_mm_pos = pos($tmp);                      # Position of last mismatch
  }
  my $end_matches = length($actual_seq) - $prev_mm_pos;  # Provides number of matches from last mismatch till end of sequence
  $MD_tag .= "$end_matches" if $end_matches > 0;   # Ignore if mismatch is at the end of sequence
  return $MD_tag;
}


