#!/usr/bin/perl use warnings; use strict; use Getopt::Long; # subSetFasta.pl # Select a subset of a FASTA file based on a provided list file. # The list file may be an include list or exclude list. my $fastaFile; my $listFile; my $mode = "i"; my %list; GetOptions("fasta=s" => \$fastaFile, "list=s" => \$listFile, "mode=s" => \$mode); $mode = lc($mode); if (($mode ne 'i') && ($mode ne 'e')) { die "The mode argument, $mode, was not recognized. It should be 'i' or 'e'.\n"; } if (!defined $fastaFile) { die "You must enter a FASTA filename withe the -f/--fasta option.\n"; } if (!defined $listFile) { die "You must provide a file contianing a list of the reads you wish to include/exclude using the -l/--list option.\n"; } open (LIST, "<$listFile") || die "The list file $listFile can not be opened; $!\n"; while (my $id = ) { chomp $id; $list{$id}++; } close LIST; # Set the record separator to the FASTA new record indicator ">" # but only to remove the very first one from the file. $/ = ">"; open (FASTA, "<$fastaFile") || die "The FASTA file $fastaFile can not be opened; $!\n"; my $discardFirstOne =; # Now change the record separator to "\n>" which will only consider ">" characters # at the begining of a line. This is to account for deflines which may improperly # include a ">" in them (e.g. 5' -> 3' which I have seen a lot, even from NCBI). $/ = "\n>"; while () { chomp; my ($defline,@seqlines) = split /\n/, $_; my ($id) = $defline =~ m/^(\S+)/; # for generic fasta if (($mode eq 'i') && (defined $list{$id})) { &outputSeq($defline,@seqlines); } elsif (($mode eq 'e') && (! defined $list{$id})) { &outputSeq($defline,@seqlines); } } sub outputSeq { my ($defline,@seqlines) = @_; print ">$defline\n"; foreach (@seqlines) { print "$_\n"; } }