Seqanswers Leaderboard Ad

**golharam** · 06-28-2012, 07:50 AM

EMBOSS has tools available to do this

**adaptivegenome** · 06-28-2012, 07:59 AM

samtools can do this too

**tomc** · 06-28-2012, 08:09 AM

NCBI BLAST has start & end parameters to retrieve subsequences

**jwhite** · 06-28-2012, 08:22 AM

Originally posted by genericforms View Post

samtools can do this too

Which tool?

jwhite

**Richard Finney** · 06-28-2012, 08:59 AM

UCSC, of course, has a tool for occasional web based use:

Early Error

http://genome.ucsc.edu/cgi-bin/hgc?o=7512442&g=getDna&db=hg18

Hack the url to hg19 or whatever. You can get to it from the link at the top of a browser page.

If you need something really fast for calling from the command line, compile this file ("fetchdna.c"). (I know, "it's not using a nib file" or the like, I got one! ... but this is simpler).

Code:

/*
compile : gcc -Wall -O2 -o fetchdna fetchdna.c

fetchdna - gets dna from canonical genomic fastas with line len = 50
Usage : fetchdna range directory_path_to_canonical_genomic_fastas
Example : ./fetchdna chr17:15000-16000 /h1/finneyr/amd64/hg18/

Instead of typing the location of your path to the genomic fastas, make a bash file like this ...

echo '~/bin/fetchdna $1 /h1/finneyr/amd64/hg18/' > myfetch
chmod +x myfetch
use like this :  ./myfetch chr17:15000-16000
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Size of the general-purpose string buffers in this file (command-line
// arguments, chromosome name, directory path, parsing scratch space).
#define MAXBUFF 5000
// Size of the dna[] result buffer.  fetchwrap() rejects any range longer
// than MAX_FETCH_SIZE-2 bases.
#define MAX_FETCH_SIZE 1000002
// Directory holding the per-chromosome FASTA files; main() copies argv[2]
// here and fetchwrap() builds "<path>/<chr>.fa" from it.
char path[MAXBUFF];
// Bases retrieved by fetch() when called through fetchwrap(), which then
// prints them.
char dna[MAX_FETCH_SIZE ];

/*
 * comma_gin - remove every ',' from a NUL-terminated string, in place.
 *
 * Lets coordinates typed with thousands separators ("1,234,567") be handed
 * to a numeric parser.  The string is compacted with a read cursor and a
 * write cursor in one pass, so it needs no scratch buffer and works for
 * strings of any length.
 *
 * s : string to modify; a NULL pointer is ignored.
 */
static void comma_gin(char *s) // gets rid of commas in a string  - dang commanists !!!
{
    const char *src;
    char *dst;

    if (s == NULL) return;

    dst = s;
    for (src = s ; *src != '\0' ; src++)
    {
        if (*src != ',') *dst++ = *src;   // keep everything except commas
    }
    *dst = '\0';   // dst never runs ahead of src, so this is always in bounds
    return;
}

/*
 * parse_position - split a "chr:start-end" range into its three parts.
 *
 * argposition : range text, e.g. "chr17:15000-16000" or "chrX:37,301,314-37,347,604".
 * chr         : out, chromosome name.  Must have room for strlen(argposition)+1
 *               bytes (main() passes a MAXBUFF-sized buffer).  The numeric
 *               aliases chr23/chr24/chr25 are rewritten to chrX/chrY/chrM.
 * start, end  : out, the two coordinates exactly as typed (commas removed).
 *               A coordinate that is missing converts to 0.
 *
 * Returns 0 when all three fields were found and both coordinates consist
 * only of digits; returns -1 otherwise (input too long for the MAXBUFF
 * scratch buffers, fewer than three fields, or a non-numeric coordinate).
 * The outputs are always left in a defined state, even on failure.
 *
 * Note: every ':' and '-' is treated as a field separator, so chromosome
 * names that themselves contain those characters cannot be parsed.
 */
static int parse_position(const char argposition[],char chr[],int *start,int *end)
{
    size_t i;
    size_t n;
    int fields;
    char work[MAXBUFF];    // writable copy of the input with separators blanked
    char first[MAXBUFF];   // text of the start coordinate
    char second[MAXBUFF];  // text of the end coordinate

    chr[0] = first[0] = second[0] = (char)0;
    *start = 0;
    *end = 0;

    // Every token is at most n bytes long, so once n fits in MAXBUFF none of
    // the %s conversions below can overrun work/first/second.
    n = strlen(argposition);
    if (n >= sizeof work) return -1;
    memcpy(work,argposition,n+1);
       // un_escape(work);        for when parsing via a URL which often put escape codes in

    for (i=0 ; i<n ; i++)
    {
        if (work[i] == ':' || work[i] == '-') work[i] = ' ';
    }
    fields = sscanf(work,"%s %s %s",chr,first,second);   // EOF (-1) on empty input

    if (strcmp(chr,"chr23") == 0) strcpy(chr,"chrX");
    else if (strcmp(chr,"chr24") == 0) strcpy(chr,"chrY");
    else if (strcmp(chr,"chr25") == 0) strcpy(chr,"chrM");

    comma_gin(first);
    comma_gin(second);
    *start = atoi(first);
    *end = atoi(second);

    if (fields != 3) return -1;
    // After comma removal each coordinate must be a non-empty run of digits.
    if (first[0] == (char)0 || strspn(first,"0123456789") != strlen(first)) return -1;
    if (second[0] == (char)0 || strspn(second,"0123456789") != strlen(second)) return -1;
    return 0;
}

/*
 * fetch - read up to len bases from a single-record FASTA file.
 *
 * The byte offset of a base is computed directly, without an index:
 * header length + pos + one newline byte per complete 50-base line before
 * pos.  This only works for files whose sequence lines are exactly 50 bases
 * long and end in a single '\n'.
 *
 * fn  : path of the FASTA file.
 * pos : 0-based index of the first base wanted.
 * dna : out, receives the bases as a NUL-terminated string; must have room
 *       for len+1 bytes.
 * len : number of bases wanted.
 *
 * dna is always left NUL-terminated.  It holds fewer than len bases when the
 * file ends early, and is empty when pos is negative, len is not positive,
 * or the seek fails.  If the file cannot be opened, a message is written to
 * stderr and the process exits with a non-zero status.
 */
void fetch(char fn[],long int pos,char dna[],int len)
{
    enum { BASES_PER_LINE = 50 };   // line width of the "canonical" FASTA files
    FILE *fp;
    long headlen = 0;
    long spot;
    int c;
    int i = 0;

               // printf("fetch : %s %ld %d\n",fn,pos,len);
    dna[0] = (char)0;
    // Binary mode: seeking to a computed byte offset is only well-defined on
    // binary streams.
    fp = fopen(fn,"rb");
    if (fp == NULL) { fprintf(stderr,"%s not open error.  Invalid filename. \n",fn); exit(1); }

    // Measure the header line (including its newline) without a fixed buffer,
    // so a header of any length still gives the right offset.
    while ((c = fgetc(fp)) != EOF)
    {
        headlen++;
        if (c == '\n') break;
    }

    if (pos < 0 || len <= 0) { fclose(fp); return; }

    spot = headlen + pos + (pos/BASES_PER_LINE);
               // printf("spot = %ld headlen = %ld pos = %ld  len=%d\n",spot,headlen,pos,len);
    if (fseek(fp,spot,SEEK_SET) != 0) { fclose(fp); return; }

    while (i < len)
    {
        c = fgetc(fp);
        if (c == EOF) break;                       // ran off the end of the chromosome
        if (c == '\r' || c == '\n') continue;      // line terminators are not sequence
        dna[i++] = (char)c;
    }
    dna[i] = (char)0;   // terminate directly after the last base stored
    fclose(fp);
    return;
}


int fetchwrap(char *chr,int s, int e)
{
    int k;
    char fn[512]; //  file name to canonical fasta for a chromsome
    if ((e-s)>MAX_FETCH_SIZE-2)
    {
        fprintf(stderr,"ERROR- too big %s %d %d , cant be bigger than %d \n",chr,s,e,MAX_FETCH_SIZE);
        return -1;
    }
    sprintf(fn,"%s/%s.fa",path,chr);
    dna[0] = (char)0;
    fetch(fn,s,dna,e-s);
    for (k=0 ; dna[k] ; k++)
    {
        printf("%c",dna[k]);
        if ((k%50) == 49) printf("\n");
    }
    printf("\n");
    return 0;
}

/*
 * Entry point:  fetchdna <range> <directory>
 *
 * <range>     is "chr:start-end" with 1-based, inclusive coordinates.
 * <directory> holds one "<chr>.fa" file per chromosome.
 *
 * Exit status is 0 when the sequence was printed and 1 on a usage error,
 * an over-long argument, an unparsable or invalid range, or a fetch failure.
 */
int main(int argc,char *argv[])
{
    char position[MAXBUFF];
    char chr[MAXBUFF];
    int start,end;

    if (argc != 3)
    {
        fprintf(stderr,"ERROR: usage example is ./fetchdna chr17:15000-16000 /h1/finneyr/amd64/hg18/ \n");
        fprintf(stderr,"Usage is fetchdna range directory_path_to_canonical_genomic_fastas\n");
        fprintf(stderr,"Note: start of chromosome is 1, not 0\n");
        return 1;
    }

    // Both arguments end up in fixed-size buffers, so reject anything that
    // would not fit before copying.
    if (strlen(argv[1]) >= sizeof position)
    {
        fprintf(stderr,"ERROR: range argument is too long (limit is %d characters)\n",MAXBUFF-1);
        return 1;
    }
    if (strlen(argv[2]) >= sizeof path)
    {
        fprintf(stderr,"ERROR: directory argument is too long (limit is %d characters)\n",MAXBUFF-1);
        return 1;
    }
    strcpy(position,argv[1]);
    strcpy(path,argv[2]);

    chr[0] = (char)0;
    start = end = 0;
    if (parse_position(position,chr,&start,&end) != 0) // eg.: "position=chrX:37301314-37347604"
    {
        fprintf(stderr,"ERROR: can not parse range %s , expected something like chr17:15000-16000\n",position);
        return 1;
    }
    if (start < 1 || end < start)
    {
        fprintf(stderr,"ERROR: invalid range %s , start of chromosome is 1 and end must not be before start\n",position);
        return 1;
    }

    // Convert the 1-based inclusive range to the 0-based half-open range
    // fetchwrap() works with: only the start shifts.
    if (fetchwrap(chr,start-1,end) != 0) return 1;
    return 0;
}

**pbluescript** · 06-28-2012, 09:01 AM

Just to add another to the list, bedtools getfasta will do this, and it is very easy to integrate into a variety of scripts.

**jwhite** · 06-28-2012, 11:15 AM

Thanks for your answers folks.

The C program and the remarks about samtools and bedtools were helpful.

Just by chance I noticed in the samtools docs that faidx not only indexes sequences, but will retrieve sequences if given a region (eg chr2:1234-1266).
This works very nicely:

samtools faidx fasta.fa region

This can be scripted easily.

Just wanted to pass this on for future reference.

Cheers,
Joe

Topics	Statistics	Last Post
Expanding the Horizons of Cellular Research with the Single Cell Atlas by seqadmin Started by seqadmin, 04-25-2024, 11:49 AM	0 responses 19 views 0 likes	Last Post by seqadmin 04-25-2024, 11:49 AM
Genetic Variants and Diabetes Risk in Childhood Cancer Survivors by seqadmin Started by seqadmin, 04-24-2024, 08:47 AM	0 responses 18 views 0 likes	Last Post by seqadmin 04-24-2024, 08:47 AM
Cancer Metastasis: A Deep Dive into Cellular Plasticity by seqadmin Started by seqadmin, 04-11-2024, 12:08 PM	0 responses 62 views 0 likes	Last Post by seqadmin 04-11-2024, 12:08 PM
Proteogenomic Profiles Offer New Clues in Prostate Cancer by seqadmin Started by seqadmin, 04-10-2024, 10:19 PM	0 responses 60 views 0 likes	Last Post by seqadmin 04-10-2024, 10:19 PM

Seqanswers Leaderboard Ad

Announcement

extract subsequence from genomic fasta file

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Latest Articles

ad_right_rmr

News