Originally posted by Richard Finney
View Post
Improved version ... now called bampe2fqworphans
supports regions, must provide output file names as arguments.
I should really open a github account.
supports regions, must provide output file names as arguments.
Code:
/* requires : http://piumarta.com/software/tree/ how to make: Download an install Ian Piumarta's tree library from http://piumarta.com/software/tree/ (it has been known to go off line for a few days). Edit line below to point to samtools and tree-1.0 for headers and libraries gcc -Wall -O2 -I/h1/finneyr/samtools-0.1.18/ -I./tree-1.0 bampe2fqworphans.c -o bampe2fqworphans -lm /h1/finneyr/samtools-0.1.18/libbam.a /h1/finneyr//zlib-1.2.5/libz.a usage: bampe2fqworphans in.bam out_pair1.fq out_pair2.fq out_orphans.fq [region] examples ./bampe2fqworphans input.bam 1.fq 2.fq orphans.fq ./bampe2fqworphans cancercure.bam 1.fq 2.fq orphans.fq chr7:10000-20000 */ #include <stdio.h> #include <stdlib.h> off_t ftello(FILE *stream); // shuts up messages in old compilers #include "sam.h" #include <sys/types.h> #include <unistd.h> #include <tree.h> // piumarta's AVL tree routines static char compli(char ch) // compliment sequence { if (ch == 'A') return 'T'; else if (ch == 'T') return 'A'; else if (ch == 'C') return 'G'; else if (ch == 'G') return 'C'; else if (ch == 'N') return 'N'; else if (ch == 'a') return 't'; else if (ch == 't') return 'a'; else if (ch == 'c') return 'g'; else if (ch == 'g') return 'c'; return '.'; } static void reverse_and_compliment(char s[], char outs[]) { register int i,j; i = strlen(s); i--; j = 0; while (i >= 0) { outs[j] = compli(s[i]); j++; i--; } outs[j] = (char)0; return; } static void reverse(char s[], char outs[]) { register int i,j; i = strlen(s); i--; j = 0; while (i >= 0) { outs[j] = compli(s[i]); j++; i--; } outs[j] = (char)0; return; } static int pid; static FILE *fp1; // first mate pair fastq output file static FILE *fp2; // second mate pair fastq output file struct data_type { char *name; int flag; char *seq; char *qual; }; static long int incnt = 0; static long int outcnt = 0; static long int live = 0; static void info(char *name) { // you can turn on next line with "if 1" to turn OFF output , if you can't stand the progress "entertainment" output #if 0 return; #endif fprintf(stderr,"output %ld in=%ld live=%ld fq pairs , %s \n",outcnt,incnt,live,name); fflush(stderr); #if 0 // turn this on to view memory statistics - see the "pmap" routine available in linux/unix // how much memory do we use ? char m[512]; sprintf(m,"pmap %d | grep total",pid); system(m); fflush(stdout); #endif } typedef struct _Node { char *name; char *seq; char *qual; int flag; TREE_ENTRY(_Node) linkage; } Node; typedef TREE_HEAD(_Tree, _Node) Tree; TREE_DEFINE(_Node, linkage); int fence = 0; char global_seq[500]; char global_qual[500]; int flag; #if 0 // debug - testing for if strings are not null terminated void *mystrdup(void *src_ptr, int len) { char *dest; int SIZE; SIZE = len + 2; dest = (void *)malloc(SIZE); if (!dest) {fprintf(stderr,"ERROR - no more memory\n"); fflush(stderr); exit(0); } ; memset(dest,0,SIZE); strcpy(dest,src_ptr); return (void *)dest; } #endif Node *Node_new(char *name, int len) { Node *self = (Node *)calloc(1, sizeof(Node)); #if 0 self->name = (void *)mystrdup(name,strlen(name)); self->seq = (void *)mystrdup(global_seq,len); self->qual = (void *)mystrdup(global_qual,len); #else self->name = (void *)strdup(name); self->seq = (void *)strdup(global_seq); self->qual = (void *)strdup(global_qual); #endif self->flag = flag; return self; } int Node_compare(Node *lhs, Node *rhs) { return strcmp(lhs->name, rhs->name); } #if 0 // debug for dumping the tree void Node_printer(Node *self, void *stream) { if (fence) { fprintf((FILE *)stream, "%s", self->name); --fence; } } #endif int firsttime = 1; static Tree tree; int orphcnt = 0; // count of orphans void orphan_node_printer(Node *self, void *stream) { fprintf(stream,"@%s\n%s\n+\n%s\n",self->name, self->seq,self->qual); orphcnt++; } void dump_orphans(char *orphan_file_name) { FILE *fpo; // orphan fastqs fpo = fopen(orphan_file_name,"w"); TREE_FORWARD_APPLY(&tree, _Node, linkage, orphan_node_printer, fpo); fclose(fpo); fprintf(stderr,"Note: %d orphans flushed\n",orphcnt); fflush(stderr); } #if 0 // debug routine for checking if sequence got messed up void seqsanity(char *s, int len, char *msg1, char *msg2) { int i; for (i = 0; i < len; i++) { if ((s[i] == 'C') || (s[i] == 'A') || (s[i] == 'G') || (s[i] == 'T') || (s[i] == 'N') ) { } else { fflush(stdout); fprintf(stderr,"ERROR: at %d [%s][%s][%s]\n",i,s,msg1,msg2); fprintf(stderr,"ERROR: found char as decimal [%d] \n", s[i]); fflush(stderr); exit(0); } } } #endif void static add(char *name,int flag_arg, char *seqarg, char *qualarg, int len) { char *z1,*z2,*z3; flag = flag_arg; if (firsttime == 1) { static Tree treep = TREE_INITIALIZER(Node_compare); tree = treep; firsttime = 0; pid = getpid(); } incnt++; if ((incnt % 10000000) == 0) info(name); Node test = { name }; Node *ptr = TREE_FIND(&tree, _Node, linkage, &test); if (ptr) { outcnt++; if (flag_arg == 1) { fprintf(fp1,"@%s/1\n%s\n+\n%s\n",ptr->name, ptr->seq,ptr->qual); fprintf(fp2,"@%s/2\n%s\n+\n%s\n",name, seqarg,qualarg); } else { fprintf(fp1,"@%s/1\n%s\n+\n%s\n",name, seqarg,qualarg); fprintf(fp2,"@%s/2\n%s\n+\n%s\n",ptr->name, ptr->seq,ptr->qual); } z1 = ptr->name; z2 = ptr->seq; z3 = ptr->qual; TREE_REMOVE(&tree, _Node, linkage, ptr); if (z1) free(z1); if (z2) free(z2); if (z3) free(z3); free(ptr); live--; return; } TREE_INSERT(&tree, _Node, linkage, Node_new(name,len)); live++; return; } static int fetch_func(const bam1_t *b, void *data) { uint32_t *cigar = bam1_cigar(b); const bam1_core_t *c = &b->core; int i, l; // NO!!!!!!!!!if (b->core.tid < 0) return 0; for (i = l = 0; i < c->n_cigar; ++i) { int op = cigar[i]&0xf; if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) l += cigar[i]>>4; } // l_qseq = length of the query sequence // l_qname = length of the query name uint8_t *s = bam1_seq(b); uint8_t *t = bam1_qual(b); global_qual[0] = (char)0; global_seq[0] = (char)0; for (i = 0; i < c->l_qseq; i++) global_seq[i] = bam_nt16_rev_table[bam1_seqi(s, i)]; global_seq[i] = (char)0; i = 0; for (i = 0; i < c->l_qseq; ++i) global_qual[i] = t[i] + 33; global_qual[i] = (char)0; if (c->flag&BAM_FREVERSE) { char tmps[512]; reverse_and_compliment(global_seq, tmps); strcpy(global_seq,tmps); reverse(global_qual, tmps); strcpy(global_qual,tmps); } if (c->flag&BAM_FREAD1) add(bam1_qname(b),0,global_seq,global_qual,c->l_qseq); else add(bam1_qname(b),1,global_seq,global_qual,c->l_qseq); /* printf("%s\t%d\t%d\t%s\t%d\t%c\n", fp->header->target_name[c->tid], c->pos, c->pos + l, bam1_qname(b), c->qual, (c->flag&BAM_FREVERSE)? '-' : '+'); */ return 0; } int main(int argc, char *argv[]) { samfile_t *fp; if ( (argc < 5) || (argc > 6) ) { fprintf(stderr, "Usage: bampe2fqworphans in.bam pair1.fq pair2.fq orphans.fq [region]\n"); fprintf(stderr, "Example without region: ./bampe2fqworphans cure2cancer.bam pair1.fq pair2.fq orphans.fq\n"); fprintf(stderr, "Example with region: ./bampe2fqworphans cure2cancer.bam pair1.fq pair2.fq orphans.fq chr19:chr7:55054219-55242525\n"); fprintf(stderr, "Example: ./bampe2fqworphans /tcga_next_gen/NG_buckets/bucket11/bam/TCGA-AA-3858-01A-01W-0900-09_IlluminaGA-DNASeq_exome.bam 1.fq 2.fq orphans.fq chr19:chr7:55054219-55242525\n"); return 1; } if ((fp = samopen(argv[1], "rb", 0)) == 0) { fprintf(stderr, "bampe2fqworphans: Fail to open BAM file %s\n", argv[1]); return 1; } fp1 = fopen(argv[2],"w"); fp2 = fopen(argv[3],"w"); if (argc == 5) { bam1_t *b = bam_init1(); while (samread(fp, b) >= 0) fetch_func(b, fp); bam_destroy1(b); fclose(fp1); fclose(fp2); dump_orphans(argv[4]); // test info("done"); printf("DONE\n"); } else { int ref, beg, end; bam_index_t *idx; if ((idx = bam_index_load(argv[1])) == 0) { fprintf(stderr, "bampe2fqworphans: BAM indexing file is not available.\n"); return 1; } bam_parse_region(fp->header, argv[5], &ref, &beg, &end); if (ref < 0) { fprintf(stderr, "bampe2fqworphans: Invalid region %s\n", argv[2]); return 1; } bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func); fclose(fp1); fclose(fp2); dump_orphans(argv[4]); // test info("done"); printf("DONE\n"); bam_index_destroy(idx); } samclose(fp); return 0; }
Comment