Seqanswers Leaderboard Ad

Collapse

Announcement

Collapse
No announcement yet.
X
 
  • Filter
  • Time
  • Show
Clear All
new posts

  • #31
    Originally posted by Richard Finney View Post
    Improved version ... now called bampe2fqworphans

    supports regions, must provide output file names as arguments.
    Code:
    /*
    requires :
        http://piumarta.com/software/tree/
    
    how to make:
        Download an install Ian Piumarta's tree library from http://piumarta.com/software/tree/  (it has been known to go off line for a few days).
        Edit line below to point to samtools and tree-1.0 for headers and libraries
    
        gcc -Wall -O2 -I/h1/finneyr/samtools-0.1.18/ -I./tree-1.0 bampe2fqworphans.c -o bampe2fqworphans -lm /h1/finneyr/samtools-0.1.18/libbam.a /h1/finneyr//zlib-1.2.5/libz.a
    
    usage:
         bampe2fqworphans in.bam out_pair1.fq out_pair2.fq out_orphans.fq [region]
    
    examples
        ./bampe2fqworphans input.bam 1.fq 2.fq orphans.fq
        ./bampe2fqworphans cancercure.bam 1.fq 2.fq orphans.fq chr7:10000-20000
    */
    
    #include <stdio.h>
    #include <stdlib.h>
    off_t ftello(FILE *stream); // shuts up messages in old compilers
    
    #include "sam.h"
    #include <sys/types.h>
    #include <unistd.h>
    #include <tree.h>  // piumarta's AVL tree routines
    
    static char compli(char ch) // compliment sequence
    {
        if (ch == 'A') return 'T';
        else if (ch == 'T') return 'A';
        else if (ch == 'C') return 'G';
        else if (ch == 'G') return 'C';
        else if (ch == 'N') return 'N';
        else if (ch == 'a') return 't';
        else if (ch == 't') return 'a';
        else if (ch == 'c') return 'g';
        else if (ch == 'g') return 'c';
        return '.';
    }
    
    static void reverse_and_compliment(char s[], char outs[])
    {
        register int i,j;
    
        i = strlen(s);
        i--;
        j = 0;
        while (i >= 0)
        {
            outs[j] = compli(s[i]);
            j++;
            i--;
        }
        outs[j] = (char)0;
        return;
    }
    
    static void reverse(char s[], char outs[])
    {
        register int i,j;
    
        i = strlen(s);
        i--;
        j = 0;
        while (i >= 0)
        {
            outs[j] = compli(s[i]);
            j++;
            i--;
        }
        outs[j] = (char)0;
        return;
    }
    
    static int pid;
    static FILE *fp1;  // first mate pair fastq output file
    static FILE *fp2;  // second mate pair fastq output file
    
    struct data_type
    {
        char *name;
        int flag;
        char *seq;
        char *qual;
    };
    
    static long int incnt = 0;
    static long int outcnt = 0;
    static long int live = 0;
    
    static void info(char *name)
    {
    // you can turn on next line with "if 1" to turn OFF output , if you can't stand the progress "entertainment" output
    #if 0
    return;
    #endif
    
    fprintf(stderr,"output %ld in=%ld live=%ld fq pairs , %s \n",outcnt,incnt,live,name);
    fflush(stderr);
    
    #if 0 // turn this on to view memory statistics - see the "pmap" routine available in linux/unix
    // how much memory do we use ?
    char m[512];
    sprintf(m,"pmap %d | grep total",pid);
    system(m);
    fflush(stdout);
    #endif
    }
    
    
    typedef struct _Node
    {
        char *name;
        char *seq;
        char *qual;
        int flag;
        TREE_ENTRY(_Node)      linkage;
    } Node;
    
    typedef TREE_HEAD(_Tree, _Node) Tree;
    
    TREE_DEFINE(_Node, linkage);
    
    int fence = 0;
    
    char global_seq[500];
    char global_qual[500];
    int flag;
    
    #if 0
    // debug - testing for if strings are not null terminated
    void *mystrdup(void *src_ptr, int len)
    {
        char *dest;
        int SIZE;
    
        SIZE = len + 2;
        dest = (void *)malloc(SIZE);
        if (!dest) {fprintf(stderr,"ERROR - no more memory\n");  fflush(stderr);  exit(0); } ;
        memset(dest,0,SIZE);
        strcpy(dest,src_ptr);
        return (void *)dest;
    }
    #endif
    
    Node *Node_new(char *name, int len)
    {
           Node *self = (Node *)calloc(1, sizeof(Node));
    #if 0
           self->name = (void *)mystrdup(name,strlen(name));
           self->seq = (void *)mystrdup(global_seq,len);
           self->qual = (void *)mystrdup(global_qual,len);
    #else
           self->name = (void *)strdup(name);
           self->seq = (void *)strdup(global_seq);
           self->qual = (void *)strdup(global_qual);
    #endif
           self->flag = flag;
           return self;
    }
    
    int Node_compare(Node *lhs, Node *rhs)
    {
        return strcmp(lhs->name, rhs->name);
    }
    
    #if 0
    // debug for dumping the tree
    void Node_printer(Node *self, void *stream)
    {
        if (fence)
        {
            fprintf((FILE *)stream, "%s", self->name);
            --fence;
        }
    }
    #endif
    
    int firsttime = 1;
    static Tree tree;
    
    int orphcnt = 0; // count of orphans
    
    void orphan_node_printer(Node *self, void *stream)
    {
        fprintf(stream,"@%s\n%s\n+\n%s\n",self->name, self->seq,self->qual);
        orphcnt++;
    }
    
    void dump_orphans(char *orphan_file_name)
    {
        FILE *fpo;  // orphan fastqs
        fpo = fopen(orphan_file_name,"w");
        TREE_FORWARD_APPLY(&tree, _Node, linkage, orphan_node_printer, fpo);
        fclose(fpo);
        fprintf(stderr,"Note: %d orphans flushed\n",orphcnt);
        fflush(stderr);
    }
    
    #if 0
    // debug routine for checking if sequence got messed up
    void seqsanity(char *s, int len, char *msg1, char *msg2)
    {
        int i;
    
            for (i = 0; i < len; i++)
            {
               if ((s[i] == 'C')
                || (s[i] == 'A')
                || (s[i] == 'G')
                || (s[i] == 'T')
                || (s[i] == 'N')
                  )
               {
               }
               else
               {
               fflush(stdout);
               fprintf(stderr,"ERROR: at %d [%s][%s][%s]\n",i,s,msg1,msg2);
               fprintf(stderr,"ERROR: found char as decimal [%d] \n", s[i]);
               fflush(stderr);
               exit(0);
               }
            }
    }
    #endif
    
    void static add(char *name,int flag_arg, char *seqarg, char *qualarg, int len)
    {
        char *z1,*z2,*z3;
        flag = flag_arg;
    
        if (firsttime == 1)
        {
            static Tree treep = TREE_INITIALIZER(Node_compare);
            tree = treep;
            firsttime = 0;
            pid = getpid();
        }
        incnt++;
    
    if ((incnt % 10000000) == 0) info(name);
        Node test = { name };
        Node *ptr = TREE_FIND(&tree, _Node, linkage, &test);
        if (ptr)
        {
            outcnt++;
            if (flag_arg == 1)
            {
                fprintf(fp1,"@%s/1\n%s\n+\n%s\n",ptr->name, ptr->seq,ptr->qual);
                fprintf(fp2,"@%s/2\n%s\n+\n%s\n",name, seqarg,qualarg);
            }
            else
            {
                fprintf(fp1,"@%s/1\n%s\n+\n%s\n",name, seqarg,qualarg);
                fprintf(fp2,"@%s/2\n%s\n+\n%s\n",ptr->name, ptr->seq,ptr->qual);
            }
    
            z1 = ptr->name;
            z2 = ptr->seq;
            z3 = ptr->qual;
            TREE_REMOVE(&tree, _Node, linkage, ptr);
            if (z1) free(z1);
            if (z2) free(z2);
            if (z3) free(z3);
            free(ptr);
    
            live--;
            return;
        }
    
        TREE_INSERT(&tree, _Node, linkage, Node_new(name,len));
        live++;
        return;
    }
    
    
    static int fetch_func(const bam1_t *b, void *data)
    {
        uint32_t *cigar = bam1_cigar(b);
        const bam1_core_t *c = &b->core;
        int i, l;
    // NO!!!!!!!!!if (b->core.tid < 0) return 0;
        for (i = l = 0; i < c->n_cigar; ++i) {
            int op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP)
                l += cigar[i]>>4;
        }
    
    // l_qseq = length of the query sequence
    // l_qname = length of the query name
    
            uint8_t *s = bam1_seq(b);
            uint8_t *t = bam1_qual(b);
            global_qual[0] = (char)0;
            global_seq[0] = (char)0;
            for (i = 0; i < c->l_qseq; i++) global_seq[i] = bam_nt16_rev_table[bam1_seqi(s, i)];
            global_seq[i] = (char)0;
            i = 0;
        for (i = 0; i < c->l_qseq; ++i) global_qual[i] = t[i] + 33;
            global_qual[i] = (char)0;
    if (c->flag&BAM_FREVERSE)
    {
        char tmps[512];
    
        reverse_and_compliment(global_seq, tmps);
        strcpy(global_seq,tmps);
        reverse(global_qual, tmps);
        strcpy(global_qual,tmps);
    }
    
            if (c->flag&BAM_FREAD1) add(bam1_qname(b),0,global_seq,global_qual,c->l_qseq);
            else                    add(bam1_qname(b),1,global_seq,global_qual,c->l_qseq);
    /*
        printf("%s\t%d\t%d\t%s\t%d\t%c\n", fp->header->target_name[c->tid],
            c->pos, c->pos + l, bam1_qname(b), c->qual, (c->flag&BAM_FREVERSE)? '-' : '+');
    */
        return 0;
    }
    
    int main(int argc, char *argv[])
    {
        samfile_t *fp;
    
        if ( (argc < 5) || (argc > 6) )
        {
            fprintf(stderr, "Usage: bampe2fqworphans in.bam pair1.fq pair2.fq orphans.fq [region]\n");
            fprintf(stderr, "Example without region: ./bampe2fqworphans cure2cancer.bam pair1.fq pair2.fq orphans.fq\n");
            fprintf(stderr, "Example with region: ./bampe2fqworphans cure2cancer.bam pair1.fq pair2.fq orphans.fq chr19:chr7:55054219-55242525\n");
            fprintf(stderr, "Example: ./bampe2fqworphans /tcga_next_gen/NG_buckets/bucket11/bam/TCGA-AA-3858-01A-01W-0900-09_IlluminaGA-DNASeq_exome.bam  1.fq 2.fq orphans.fq  chr19:chr7:55054219-55242525\n");
            return 1;
        }
        if ((fp = samopen(argv[1], "rb", 0)) == 0)
        {
            fprintf(stderr, "bampe2fqworphans: Fail to open BAM file %s\n", argv[1]);
            return 1;
        }
    
        fp1 = fopen(argv[2],"w");
        fp2 = fopen(argv[3],"w");
        if (argc == 5)
        {
            bam1_t *b = bam_init1();
            while (samread(fp, b) >= 0) fetch_func(b, fp);
            bam_destroy1(b);
            fclose(fp1);
            fclose(fp2);
            dump_orphans(argv[4]);  // test
            info("done");
            printf("DONE\n");
        } else {
            int ref, beg, end;
            bam_index_t *idx;
            if ((idx = bam_index_load(argv[1])) == 0) {
                fprintf(stderr, "bampe2fqworphans: BAM indexing file is not available.\n");
                return 1;
            }
    
            bam_parse_region(fp->header, argv[5], &ref, &beg, &end);
            if (ref < 0) {
                fprintf(stderr, "bampe2fqworphans: Invalid region %s\n", argv[2]);
                return 1;
            }
            bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func);
            fclose(fp1);
            fclose(fp2);
            dump_orphans(argv[4]);  // test
            info("done");
            printf("DONE\n");
            bam_index_destroy(idx);
        }
        samclose(fp);
        return 0;
    }
    I should really open a github account.
    Trying this code I have noticed that it applies a reverse complement to the quality string for alignments which have the reverse flag set, i.e. about half of the produced quality strings will be nonsense for mapped paired end reads. I assume the reverse() function should not be calling compli().

    Comment


    • #32
      Originally posted by gt1 View Post
      Trying this code I have noticed that it applies a reverse complement to the quality string for alignments which have the reverse flag set, i.e. about half of the produced quality strings will be nonsense for mapped paired end reads. I assume the reverse() function should not be calling compli().
      That is quite troubling...is it hard to fix this? I've been using it for my extractions...

      Comment


      • #33
        fixed for now; I'll test fix.

        Comment


        • #34
          Originally posted by Richard Finney View Post
          fixed for now; I'll test fix.
          Thanks Richard. Let me know when you have a stable release as I would definitely download it and re-extract the libraries I had previously re-extracted.

          Comment


          • #35
            bampe2fqs.c

            --Hi

            i don't know where the file bampe2fqs.c is located or from which package ?
            Do you have a link to download it ?

            thank you --

            Comment

            Latest Articles

            Collapse

            • seqadmin
              Essential Discoveries and Tools in Epitranscriptomics
              by seqadmin




              The field of epigenetics has traditionally concentrated more on DNA and how changes like methylation and phosphorylation of histones impact gene expression and regulation. However, our increased understanding of RNA modifications and their importance in cellular processes has led to a rise in epitranscriptomics research. “Epitranscriptomics brings together the concepts of epigenetics and gene expression,” explained Adrien Leger, PhD, Principal Research Scientist...
              04-22-2024, 07:01 AM
            • seqadmin
              Current Approaches to Protein Sequencing
              by seqadmin


              Proteins are often described as the workhorses of the cell, and identifying their sequences is key to understanding their role in biological processes and disease. Currently, the most common technique used to determine protein sequences is mass spectrometry. While still a valuable tool, mass spectrometry faces several limitations and requires a highly experienced scientist familiar with the equipment to operate it. Additionally, other proteomic methods, like affinity assays, are constrained...
              04-04-2024, 04:25 PM

            ad_right_rmr

            Collapse

            News

            Collapse

            Topics Statistics Last Post
            Started by seqadmin, Yesterday, 10:49 AM
            0 responses
            17 views
            0 likes
            Last Post seqadmin  
            Started by seqadmin, 04-25-2024, 11:49 AM
            0 responses
            24 views
            0 likes
            Last Post seqadmin  
            Started by seqadmin, 04-24-2024, 08:47 AM
            0 responses
            20 views
            0 likes
            Last Post seqadmin  
            Started by seqadmin, 04-11-2024, 12:08 PM
            0 responses
            62 views
            0 likes
            Last Post seqadmin  
            Working...
            X