#!/usr/bin/perl -w
#
# Count methylation of cytosine genomewide from Bismark methylation caller output and generates a BedGraph
# Usage: genome_methylation_count.pl (--cutoff [threshold]) [Bismark methylation caller output] > [Output]

use strict;
use warnings;
use Carp;
use Getopt::Long;

my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status

GetOptions("cutoff=i" => \$coverage_threshold,
	  );

if(scalar @ARGV < 1){
  warn "Missing input file\n";
  die usage();
}

my $infile = shift;
open my $ifh, "sort -k3,3 -k4,4n $infile |" or die "Input file could not be sorted. $!";

# print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n";

my $last_pos = 0;
my $last_chr;
my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total

while(my $line = <$ifh>){
  next if $line =~ /^Bismark/;
  chomp $line;
  my ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
  $last_chr = $chr unless defined $last_chr;
  $last_pos = $pos unless defined $last_pos;
  if(($last_pos ne $pos) || ($last_chr ne $chr)){
    generate_output() if $methylcalls[2] > 0;
    @methylcalls = qw (0 0 0);
    $last_chr = $chr;
    $last_pos = $pos;
  }
  my $validated = validate_methylation_call($meth_state, $meth_state2);
  unless($validated){
    warn "Methylation state of sequence ($name) in file ($infile) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
    next;
  }
  if($meth_state eq "+"){
    $methylcalls[0] ++;
    $methylcalls[2] ++;
  }
  else{
    $methylcalls[1] ++;
    $methylcalls[2] ++;
  }
}

generate_output() if $methylcalls[2] > 0;

close $ifh or die $!;


sub generate_output{
  my $methcount = $methylcalls[0];
  my $nonmethcount = $methylcalls[1];
  my $totalcount = $methylcalls[2];
  croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0;
  croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount);
  my $end_pos = $last_pos + 1;
  my $meth_percentage;
  ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
#  $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
  print "$last_chr\t$last_pos\t$end_pos\t$meth_percentage\n" if defined $meth_percentage;
}

sub validate_methylation_call{
  my $meth_state = shift;
  croak "Missing (+/-) methylation call" unless defined $meth_state;
  my $meth_state2 = shift;
  croak "Missing alphabetical methylation call" unless defined $meth_state2;
  my $is_consistent;
  ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2)) 
                          : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
  return 1 if $is_consistent;
  return 0;
}

sub check_CpG_methylation_call{
  my $meth1 = shift;
  my $meth2 = shift;
  return 1 if($meth1 eq "+" && $meth2 eq "Z");
  return 1 if($meth1 eq "-" && $meth2 eq "z");
  return 0;
}

sub check_nonCpG_methylation_call{
  my $meth1 = shift;
  my $meth2 = shift;
  return 1 if($meth1 eq "+" && $meth2 eq "C");
  return 1 if($meth1 eq "+" && $meth2 eq "X");
  return 1 if($meth1 eq "+" && $meth2 eq "H");
  return 1 if($meth1 eq "-" && $meth2 eq "c");
  return 1 if($meth1 eq "-" && $meth2 eq "x");
  return 1 if($meth1 eq "-" && $meth2 eq "H");
  return 0;
} 

sub usage{
  print <<EOF

  Usage: genome_methylation_count.pl (--cutoff [threshold] ) [Bismark methylation caller output] > [output]

  --cutoff [threshold]  -  The minimum number of times a methylation state was
                           seen for that nucleotide before its methylation 
                           percentage is reported.
                           Default is no threshold

  The output file is a tab-delimited BedGraph file with the following information:
  
  <Chromosome> <Start Position> <End Position> <Methylation Percentage>

  Bismark methylation caller (v0.2.0 or later) should produce three output files
    (CpG, CHG and CHH) when using the "--comprehensive" option 
    (Two files if using the "--merge_non_CpG" option).
    To count both CpG and Non-CpG, combine the output files.

  Bismark methylation caller (v0.1.5 or earlier) should produce two output files 
    (CpG and Non-CpG) when using the "--comprehensive" option. 
    To count both CpG and Non-CpG, combine the two output files.

EOF
    ;
  exit 1;
}

