#!/usr/bin/perl -w
#===============================================================================
#
# TopHat (at least v1.0.13) can accept a file containing a list of raw
# junctions. The file is provided to TopHat via the '--raw-juncs' option (see
# the TopHat manual for additional details).
#
# This script generates a raw junction file that contains the junctions in a
# human UCSC Known Gene annotation (e.g. hg19). The UCSC annotation must be
# provided as a 'knownGene' file. Note that the script only lists junctions for
# the standard chromosomes (1-22, X, Y, M).
#
#
# USAGE
# -----
# create_top_raw_junction_file <ucsc-knowngene-file> <output-junction-file>
#
# <ucsc-knowngene-file> should be a 'knownGene' file downloaded from UCSC.
# <output-junction-file> is the file that will be created.
#
#
# OUTPUT FILE FORMAT
# ------------------
# According to the online version of the manual (as of 2010-05-27), the file
# should contain one junction per line in a tab-delimited format as follows:
# <chrom> <left> <right> <+/->.
# The left and right fields are zero-based coordinates, with the <left> field
# representing the final base of the left exon of the junction, and the <right>
# field representing the first base of the right exon of the junction.
# 
#
# USE AT YOUR OWN RISK
# --------------------
# This code is released to the public domain without any implied fitness for
# purpose...
#
#===============================================================================

my $argumentCount = $#ARGV + 1;
if ($argumentCount != 2) {
   print("Arguments: <ucsc-knowngene-file> <output-junction-file>\n");
   exit();
}

my $ucscKnowngeneFilename = $ARGV[0];
my $outputJunctionFilename = $ARGV[1];

my $line;
open (KNOWNGENE_FILE, "<$ucscKnowngeneFilename") || die();
open (JUNCTION_FILE, ">$outputJunctionFilename") || die();

my %validChroms = (
   "chr1" => 1,
   "chr2" => 1,
   "chr3" => 1,
   "chr4" => 1,
   "chr5" => 1,
   "chr6" => 1,
   "chr7" => 1,
   "chr8" => 1,
   "chr9" => 1,
   "chr10" => 1,
   "chr11" => 1,
   "chr12" => 1,
   "chr13" => 1,
   "chr14" => 1,
   "chr15" => 1,
   "chr16" => 1,
   "chr17" => 1,
   "chr18" => 1,
   "chr19" => 1,
   "chr20" => 1,
   "chr21" => 1,
   "chr22" => 1,
   "chrX" => 1,
   "chrY" => 1,
   "chrM" => 1
);


while (my $line = <KNOWNGENE_FILE>) {

   my @parts = split(/\t/, $line);

   my $transcript = $parts[0];
   my $chrom = $parts[1];

   if ($validChroms{$chrom}) {

      my $strand = $parts[2];
      my $exonCount = $parts[7];

      my $exonStartText = $parts[8];
      my $exonEndText = $parts[9];

      my @exonStarts = split(/,/, $exonStartText);
      my @exonEnds = split(/,/, $exonEndText);

      for my $i (0 .. ($exonCount - 2)) {
         print JUNCTION_FILE $chrom . "\t" . ($exonEnds[$i] - 1) . "\t" . $exonStarts[$i + 1] . "\t" . $strand . "\n"; 
      }

   }

}

close (KNOWNGENE_FILE) || die();
close (JUNCTION_FILE) || die();

print "Done.\n";
