Hi everyone,
I am a novice at biopython, but have gotten a few things to work so far. Previously, I used biopython to pull nucleotide and protein sequences from a number of gene that were differentially expressed in my RNA-seq analysis. I am now trying to perform GO analysis on my dataset, and am trying to use biopython to gather the Entrez gene IDs (needed for the gene-2-go annotations in the GO analysis R package) from the nucleotide genbank nucleotide IDs.
My script seems to be working fine, but the problem comes after about 10-60s of running. At that point, it appears to stop querying the database and becomes "stuck". I've attempted to put in a "try-except" loop for when it gets stuck, but this doesn't seem to work. I'll post my code below along with the error message after I control-c to exit the program.
NOTE: my output file is correct up to the point where biopython stops querying the database. Every run gets "stuck" at a different point, so I don't think there is anything wrong with my files.
what the file looks like that needs to be parsed:
>ABCA2|NM_001606.4
ATGGGC...TGA
>ABHD15|NM_198147.2
ATGCCG...TAG
etc...
the output file will be identical, but with the additional Entrez IDs after the genbank IDs, e.g.:
>ABCA2|NM_001606.4|20
etc...
my code:
and the error:
any help would be great!
I am a novice at biopython, but have gotten a few things to work so far. Previously, I used biopython to pull nucleotide and protein sequences from a number of gene that were differentially expressed in my RNA-seq analysis. I am now trying to perform GO analysis on my dataset, and am trying to use biopython to gather the Entrez gene IDs (needed for the gene-2-go annotations in the GO analysis R package) from the nucleotide genbank nucleotide IDs.
My script seems to be working fine, but the problem comes after about 10-60s of running. At that point, it appears to stop querying the database and becomes "stuck". I've attempted to put in a "try-except" loop for when it gets stuck, but this doesn't seem to work. I'll post my code below along with the error message after I control-c to exit the program.
NOTE: my output file is correct up to the point where biopython stops querying the database. Every run gets "stuck" at a different point, so I don't think there is anything wrong with my files.
what the file looks like that needs to be parsed:
>ABCA2|NM_001606.4
ATGGGC...TGA
>ABHD15|NM_198147.2
ATGCCG...TAG
etc...
the output file will be identical, but with the additional Entrez IDs after the genbank IDs, e.g.:
>ABCA2|NM_001606.4|20
etc...
my code:
Code:
from Bio import Entrez import glob import re Entrez.email = "[email protected]" filenames = glob.glob("*_cds.fas") for file in filenames: print "working on %s"%file ofile = open(file) wfile = open(file+"_entrez",'w') n=0 for line in ofile: if line.startswith(">"): line = [x.strip() for x in line.split("|")] handle = Entrez.esearch(db="gene",term=line[1].strip()) EntrezID = Entrez.read(handle) EntrezID = EntrezID["IdList"][0]+"\n" wfile.write('|'.join(x for x in line+[EntrezID])) n+=1 if n%100 == 0: print "processed %s sequences"%n else: wfile.write(line) print "finished, processed %s entries"%n ofile.close() wfile.close()
Code:
KeyboardInterrupt Traceback (most recent call last)
/Users/XXX/Desktop/XXX/XXX/XXX/XXX/Add_Entrez_IDs.py in <module>()
21 if line.startswith(">"):
22 line = [x.strip() for x in line.split("|")]
---> 23 handle = Entrez.esearch(db="gene",term=line[1].strip())
24 EntrezID = Entrez.read(handle)
25 EntrezID = EntrezID["IdList"][0]+"\n"
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/Bio/Entrez/__init__.pyc in esearch(db, term, **keywds)
187 'term': term}
188 variables.update(keywds)
--> 189 return _open(cgi, variables)
190
191
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/Bio/Entrez/__init__.pyc in _open(cgi, params, post)
464 # HTTP GET
465 cgi += "?" + options
--> 466 handle = _urlopen(cgi)
467 except _HTTPError as exception:
468 raise exception
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
152 else:
153 opener = _opener
--> 154 return opener.open(url, data, timeout)
155
156 def install_opener(opener):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
429 req = meth(req)
430
--> 431 response = self._open(req, data)
432
433 # post-process response
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in _open(self, req, data)
447 protocol = req.get_type()
448 result = self._call_chain(self.handle_open, protocol, protocol +
--> 449 '_open', req)
450 if result:
451 return result
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
407 func = getattr(handler, meth_name)
408
--> 409 result = func(*args)
410 if result is not None:
411 return result
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in http_open(self, req)
1225
1226 def http_open(self, req):
-> 1227 return self.do_open(httplib.HTTPConnection, req)
1228
1229 http_request = AbstractHTTPHandler.do_request_
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
1192
1193 try:
-> 1194 h.request(req.get_method(), req.get_selector(), req.data, headers)
1195 except socket.error, err: # XXX what error?
1196 h.close()
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
1051 def request(self, method, url, body=None, headers={}):
1052 """Send a complete request to the server."""
-> 1053 self._send_request(method, url, body, headers)
1054
1055 def _set_content_length(self, body, method):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
1091 for hdr, value in headers.iteritems():
1092 self.putheader(hdr, value)
-> 1093 self.endheaders(body)
1094
1095 def getresponse(self, buffering=False):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in endheaders(self, message_body)
1047 else:
1048 raise CannotSendHeader()
-> 1049 self._send_output(message_body)
1050
1051 def request(self, method, url, body=None, headers={}):
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in _send_output(self, message_body)
891 msg += message_body
892 message_body = None
--> 893 self.send(msg)
894 if message_body is not None:
895 #message_body was not a string (i.e. it is a file) and
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in send(self, data)
853 if self.sock is None:
854 if self.auto_open:
--> 855 self.connect()
856 else:
857 raise NotConnected()
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.pyc in connect(self)
830 """Connect to the host and port specified in __init__."""
831 self.sock = self._create_connection((self.host,self.port),
--> 832 self.timeout, self.source_address)
833
834 if self._tunnel_host:
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.pyc in create_connection(address, timeout, source_address)
564 if source_address:
565 sock.bind(source_address)
--> 566 sock.connect(sa)
567 return sock
568
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.pyc in meth(name, self, *args)
226
227 def meth(name,self,*args):
--> 228 return getattr(self._sock,name)(*args)
229
230 for _m in _socketmethods:
KeyboardInterrupt:
Comment