#!/usr/bin/perl

use warnings;
use strict;
use Getopt::Long;

# subSetFasta.pl
# Select a subset of a FASTA file based on a provided list file.
# The list file may be an include list or exclude list.

my $fastaFile;
my $listFile;
my $mode = "i";

my %list;

GetOptions("fasta=s"	=> \$fastaFile,
		   "list=s"		=> \$listFile,
		   "mode=s"		=> \$mode);

$mode = lc($mode);
if (($mode ne 'i') && ($mode ne 'e')) {
 	die "The mode argument, $mode, was not recognized.  It should be 'i' or 'e'.\n";
}

if (!defined $fastaFile) {
	die "You must enter a FASTA filename withe the -f/--fasta option.\n";
}

if (!defined $listFile) {
	die "You must provide a file contianing a list of the reads you wish to include/exclude using the -l/--list option.\n";
}

open (LIST, "<$listFile") || die "The list file $listFile can not be opened; $!\n";

while (my $id = <LIST>) {
	chomp $id;
	$list{$id}++;
}

close LIST;

# Set the record separator to the FASTA new record indicator ">"
# but only to remove the very first one from the file.

$/ = ">";

open (FASTA, "<$fastaFile") || die "The FASTA file $fastaFile can not be opened; $!\n";

my $discardFirstOne =<FASTA>;

# Now change the record separator to "\n>" which will only consider ">" characters
# at the begining of a line.  This is to account for deflines which may improperly
# include a ">" in them (e.g. 5' -> 3' which I have seen a lot, even from NCBI).

$/ = "\n>";

while (<FASTA>) {
	chomp;
	my ($defline,@seqlines) = split /\n/, $_;
	my ($id) = $defline =~ m/^(\S+)/; # for generic fasta
	
	if (($mode eq 'i') && (defined $list{$id})) {
		&outputSeq($defline,@seqlines);
	} elsif (($mode eq 'e') && (! defined $list{$id})) {
		&outputSeq($defline,@seqlines);
	}
}

sub outputSeq {
	my ($defline,@seqlines) = @_;
	print ">$defline\n";
	foreach (@seqlines) {
		print "$_\n";
	}
}
