Seqanswers Leaderboard Ad

Collapse

Announcement

Collapse
No announcement yet.
X
 
  • Filter
  • Time
  • Show
Clear All
new posts

  • #31
    Originally posted by Richard Finney View Post
    Improved version ... now called bampe2fqworphans

    supports regions, must provide output file names as arguments.
    Code:
    /*
    requires :
        http://piumarta.com/software/tree/
    
    how to make:
        Download an install Ian Piumarta's tree library from http://piumarta.com/software/tree/  (it has been known to go off line for a few days).
        Edit line below to point to samtools and tree-1.0 for headers and libraries
    
        gcc -Wall -O2 -I/h1/finneyr/samtools-0.1.18/ -I./tree-1.0 bampe2fqworphans.c -o bampe2fqworphans -lm /h1/finneyr/samtools-0.1.18/libbam.a /h1/finneyr//zlib-1.2.5/libz.a
    
    usage:
         bampe2fqworphans in.bam out_pair1.fq out_pair2.fq out_orphans.fq [region]
    
    examples
        ./bampe2fqworphans input.bam 1.fq 2.fq orphans.fq
        ./bampe2fqworphans cancercure.bam 1.fq 2.fq orphans.fq chr7:10000-20000
    */
    
    #include <stdio.h>
    #include <stdlib.h>
    off_t ftello(FILE *stream); // shuts up messages in old compilers
    
    #include "sam.h"
    #include <sys/types.h>
    #include <unistd.h>
    #include <tree.h>  // piumarta's AVL tree routines
    
    static char compli(char ch) // compliment sequence
    {
        if (ch == 'A') return 'T';
        else if (ch == 'T') return 'A';
        else if (ch == 'C') return 'G';
        else if (ch == 'G') return 'C';
        else if (ch == 'N') return 'N';
        else if (ch == 'a') return 't';
        else if (ch == 't') return 'a';
        else if (ch == 'c') return 'g';
        else if (ch == 'g') return 'c';
        return '.';
    }
    
    static void reverse_and_compliment(char s[], char outs[])
    {
        register int i,j;
    
        i = strlen(s);
        i--;
        j = 0;
        while (i >= 0)
        {
            outs[j] = compli(s[i]);
            j++;
            i--;
        }
        outs[j] = (char)0;
        return;
    }
    
    static void reverse(char s[], char outs[])
    {
        register int i,j;
    
        i = strlen(s);
        i--;
        j = 0;
        while (i >= 0)
        {
            outs[j] = compli(s[i]);
            j++;
            i--;
        }
        outs[j] = (char)0;
        return;
    }
    
    static int pid;
    static FILE *fp1;  // first mate pair fastq output file
    static FILE *fp2;  // second mate pair fastq output file
    
    struct data_type
    {
        char *name;
        int flag;
        char *seq;
        char *qual;
    };
    
    static long int incnt = 0;
    static long int outcnt = 0;
    static long int live = 0;
    
    static void info(char *name)
    {
    // you can turn on next line with "if 1" to turn OFF output , if you can't stand the progress "entertainment" output
    #if 0
    return;
    #endif
    
    fprintf(stderr,"output %ld in=%ld live=%ld fq pairs , %s \n",outcnt,incnt,live,name);
    fflush(stderr);
    
    #if 0 // turn this on to view memory statistics - see the "pmap" routine available in linux/unix
    // how much memory do we use ?
    char m[512];
    sprintf(m,"pmap %d | grep total",pid);
    system(m);
    fflush(stdout);
    #endif
    }
    
    
    typedef struct _Node
    {
        char *name;
        char *seq;
        char *qual;
        int flag;
        TREE_ENTRY(_Node)      linkage;
    } Node;
    
    typedef TREE_HEAD(_Tree, _Node) Tree;
    
    TREE_DEFINE(_Node, linkage);
    
    int fence = 0;
    
    char global_seq[500];
    char global_qual[500];
    int flag;
    
    #if 0
    // debug - testing for if strings are not null terminated
    void *mystrdup(void *src_ptr, int len)
    {
        char *dest;
        int SIZE;
    
        SIZE = len + 2;
        dest = (void *)malloc(SIZE);
        if (!dest) {fprintf(stderr,"ERROR - no more memory\n");  fflush(stderr);  exit(0); } ;
        memset(dest,0,SIZE);
        strcpy(dest,src_ptr);
        return (void *)dest;
    }
    #endif
    
    Node *Node_new(char *name, int len)
    {
           Node *self = (Node *)calloc(1, sizeof(Node));
    #if 0
           self->name = (void *)mystrdup(name,strlen(name));
           self->seq = (void *)mystrdup(global_seq,len);
           self->qual = (void *)mystrdup(global_qual,len);
    #else
           self->name = (void *)strdup(name);
           self->seq = (void *)strdup(global_seq);
           self->qual = (void *)strdup(global_qual);
    #endif
           self->flag = flag;
           return self;
    }
    
    int Node_compare(Node *lhs, Node *rhs)
    {
        return strcmp(lhs->name, rhs->name);
    }
    
    #if 0
    // debug for dumping the tree
    void Node_printer(Node *self, void *stream)
    {
        if (fence)
        {
            fprintf((FILE *)stream, "%s", self->name);
            --fence;
        }
    }
    #endif
    
    int firsttime = 1;
    static Tree tree;
    
    int orphcnt = 0; // count of orphans
    
    void orphan_node_printer(Node *self, void *stream)
    {
        fprintf(stream,"@%s\n%s\n+\n%s\n",self->name, self->seq,self->qual);
        orphcnt++;
    }
    
    void dump_orphans(char *orphan_file_name)
    {
        FILE *fpo;  // orphan fastqs
        fpo = fopen(orphan_file_name,"w");
        TREE_FORWARD_APPLY(&tree, _Node, linkage, orphan_node_printer, fpo);
        fclose(fpo);
        fprintf(stderr,"Note: %d orphans flushed\n",orphcnt);
        fflush(stderr);
    }
    
    #if 0
    // debug routine for checking if sequence got messed up
    void seqsanity(char *s, int len, char *msg1, char *msg2)
    {
        int i;
    
            for (i = 0; i < len; i++)
            {
               if ((s[i] == 'C')
                || (s[i] == 'A')
                || (s[i] == 'G')
                || (s[i] == 'T')
                || (s[i] == 'N')
                  )
               {
               }
               else
               {
               fflush(stdout);
               fprintf(stderr,"ERROR: at %d [%s][%s][%s]\n",i,s,msg1,msg2);
               fprintf(stderr,"ERROR: found char as decimal [%d] \n", s[i]);
               fflush(stderr);
               exit(0);
               }
            }
    }
    #endif
    
    void static add(char *name,int flag_arg, char *seqarg, char *qualarg, int len)
    {
        char *z1,*z2,*z3;
        flag = flag_arg;
    
        if (firsttime == 1)
        {
            static Tree treep = TREE_INITIALIZER(Node_compare);
            tree = treep;
            firsttime = 0;
            pid = getpid();
        }
        incnt++;
    
    if ((incnt % 10000000) == 0) info(name);
        Node test = { name };
        Node *ptr = TREE_FIND(&tree, _Node, linkage, &test);
        if (ptr)
        {
            outcnt++;
            if (flag_arg == 1)
            {
                fprintf(fp1,"@%s/1\n%s\n+\n%s\n",ptr->name, ptr->seq,ptr->qual);
                fprintf(fp2,"@%s/2\n%s\n+\n%s\n",name, seqarg,qualarg);
            }
            else
            {
                fprintf(fp1,"@%s/1\n%s\n+\n%s\n",name, seqarg,qualarg);
                fprintf(fp2,"@%s/2\n%s\n+\n%s\n",ptr->name, ptr->seq,ptr->qual);
            }
    
            z1 = ptr->name;
            z2 = ptr->seq;
            z3 = ptr->qual;
            TREE_REMOVE(&tree, _Node, linkage, ptr);
            if (z1) free(z1);
            if (z2) free(z2);
            if (z3) free(z3);
            free(ptr);
    
            live--;
            return;
        }
    
        TREE_INSERT(&tree, _Node, linkage, Node_new(name,len));
        live++;
        return;
    }
    
    
    static int fetch_func(const bam1_t *b, void *data)
    {
        uint32_t *cigar = bam1_cigar(b);
        const bam1_core_t *c = &b->core;
        int i, l;
    // NO!!!!!!!!!if (b->core.tid < 0) return 0;
        for (i = l = 0; i < c->n_cigar; ++i) {
            int op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP)
                l += cigar[i]>>4;
        }
    
    // l_qseq = length of the query sequence
    // l_qname = length of the query name
    
            uint8_t *s = bam1_seq(b);
            uint8_t *t = bam1_qual(b);
            global_qual[0] = (char)0;
            global_seq[0] = (char)0;
            for (i = 0; i < c->l_qseq; i++) global_seq[i] = bam_nt16_rev_table[bam1_seqi(s, i)];
            global_seq[i] = (char)0;
            i = 0;
        for (i = 0; i < c->l_qseq; ++i) global_qual[i] = t[i] + 33;
            global_qual[i] = (char)0;
    if (c->flag&BAM_FREVERSE)
    {
        char tmps[512];
    
        reverse_and_compliment(global_seq, tmps);
        strcpy(global_seq,tmps);
        reverse(global_qual, tmps);
        strcpy(global_qual,tmps);
    }
    
            if (c->flag&BAM_FREAD1) add(bam1_qname(b),0,global_seq,global_qual,c->l_qseq);
            else                    add(bam1_qname(b),1,global_seq,global_qual,c->l_qseq);
    /*
        printf("%s\t%d\t%d\t%s\t%d\t%c\n", fp->header->target_name[c->tid],
            c->pos, c->pos + l, bam1_qname(b), c->qual, (c->flag&BAM_FREVERSE)? '-' : '+');
    */
        return 0;
    }
    
    int main(int argc, char *argv[])
    {
        samfile_t *fp;
    
        if ( (argc < 5) || (argc > 6) )
        {
            fprintf(stderr, "Usage: bampe2fqworphans in.bam pair1.fq pair2.fq orphans.fq [region]\n");
            fprintf(stderr, "Example without region: ./bampe2fqworphans cure2cancer.bam pair1.fq pair2.fq orphans.fq\n");
            fprintf(stderr, "Example with region: ./bampe2fqworphans cure2cancer.bam pair1.fq pair2.fq orphans.fq chr19:chr7:55054219-55242525\n");
            fprintf(stderr, "Example: ./bampe2fqworphans /tcga_next_gen/NG_buckets/bucket11/bam/TCGA-AA-3858-01A-01W-0900-09_IlluminaGA-DNASeq_exome.bam  1.fq 2.fq orphans.fq  chr19:chr7:55054219-55242525\n");
            return 1;
        }
        if ((fp = samopen(argv[1], "rb", 0)) == 0)
        {
            fprintf(stderr, "bampe2fqworphans: Fail to open BAM file %s\n", argv[1]);
            return 1;
        }
    
        fp1 = fopen(argv[2],"w");
        fp2 = fopen(argv[3],"w");
        if (argc == 5)
        {
            bam1_t *b = bam_init1();
            while (samread(fp, b) >= 0) fetch_func(b, fp);
            bam_destroy1(b);
            fclose(fp1);
            fclose(fp2);
            dump_orphans(argv[4]);  // test
            info("done");
            printf("DONE\n");
        } else {
            int ref, beg, end;
            bam_index_t *idx;
            if ((idx = bam_index_load(argv[1])) == 0) {
                fprintf(stderr, "bampe2fqworphans: BAM indexing file is not available.\n");
                return 1;
            }
    
            bam_parse_region(fp->header, argv[5], &ref, &beg, &end);
            if (ref < 0) {
                fprintf(stderr, "bampe2fqworphans: Invalid region %s\n", argv[2]);
                return 1;
            }
            bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func);
            fclose(fp1);
            fclose(fp2);
            dump_orphans(argv[4]);  // test
            info("done");
            printf("DONE\n");
            bam_index_destroy(idx);
        }
        samclose(fp);
        return 0;
    }
    I should really open a github account.
    Trying this code I have noticed that it applies a reverse complement to the quality string for alignments which have the reverse flag set, i.e. about half of the produced quality strings will be nonsense for mapped paired end reads. I assume the reverse() function should not be calling compli().

    Comment


    • #32
      Originally posted by gt1 View Post
      Trying this code I have noticed that it applies a reverse complement to the quality string for alignments which have the reverse flag set, i.e. about half of the produced quality strings will be nonsense for mapped paired end reads. I assume the reverse() function should not be calling compli().
      That is quite troubling...is it hard to fix this? I've been using it for my extractions...

      Comment


      • #33
        fixed for now; I'll test fix.

        Comment


        • #34
          Originally posted by Richard Finney View Post
          fixed for now; I'll test fix.
          Thanks Richard. Let me know when you have a stable release as I would definitely download it and re-extract the libraries I had previously re-extracted.

          Comment


          • #35
            bampe2fqs.c

            --Hi

            i don't know where the file bampe2fqs.c is located or from which package ?
            Do you have a link to download it ?

            thank you --

            Comment

            Latest Articles

            Collapse

            • seqadmin
              Recent Advances in Sequencing Analysis Tools
              by seqadmin


              The sequencing world is rapidly changing due to declining costs, enhanced accuracies, and the advent of newer, cutting-edge instruments. Equally important to these developments are improvements in sequencing analysis, a process that converts vast amounts of raw data into a comprehensible and meaningful form. This complex task requires expertise and the right analysis tools. In this article, we highlight the progress and innovation in sequencing analysis by reviewing several of the...
              05-06-2024, 07:48 AM
            • seqadmin
              Essential Discoveries and Tools in Epitranscriptomics
              by seqadmin




              The field of epigenetics has traditionally concentrated more on DNA and how changes like methylation and phosphorylation of histones impact gene expression and regulation. However, our increased understanding of RNA modifications and their importance in cellular processes has led to a rise in epitranscriptomics research. “Epitranscriptomics brings together the concepts of epigenetics and gene expression,” explained Adrien Leger, PhD, Principal Research Scientist...
              04-22-2024, 07:01 AM

            ad_right_rmr

            Collapse

            News

            Collapse

            Topics Statistics Last Post
            Started by seqadmin, Yesterday, 07:03 AM
            0 responses
            15 views
            0 likes
            Last Post seqadmin  
            Started by seqadmin, 05-10-2024, 06:35 AM
            0 responses
            37 views
            0 likes
            Last Post seqadmin  
            Started by seqadmin, 05-09-2024, 02:46 PM
            0 responses
            45 views
            0 likes
            Last Post seqadmin  
            Started by seqadmin, 05-07-2024, 06:57 AM
            0 responses
            39 views
            0 likes
            Last Post seqadmin  
            Working...
            X