#!/usr/bin/python
#author: Brian Granger
#use: retrieve all the pids for the proteins of a particular taxon id
#usage: python ncbi_pid_retrieve.py taxonid outputfile

import sys
import time
import math
import os
from Bio import Entrez
Entrez.email = "x@y.z" #an email for entrez to contact. You should change it to your email.

plist = []
specid = sys.argv[1]

t0 = time.time()# time the run, for testing for practicality 
out = open(sys.argv[2], 'w') #open the output file

#format our search
try:
	handle = Entrez.esearch(db="protein", term="txid"+specid+"[Organism:exp]", retmax="1")#just return "1" for now, only interested in the count. Could probably do esummary
except IOError as e:
	print "An error occurred while retrieving protein gis"
	print "The error returned was %s" % e

record = Entrez.read(handle)
count = record["Count"]
print count
percent = 0
outc = 0
errors = open('ncbi_pid.err', 'w')
for i in range(0, int(count), 1000):
	record = None
	handle = None
	#return the results 1000 ids at a time
	while handle == None:
		try:
			handle = Entrez.esearch(db="protein", term="txid"+specid+"[Organism:exp]", retmax = "1000", retstart=i)
		except:
			print "failed esearch, trying again"
	try:
		record = Entrez.read(handle)
	except:
		print "An error occurred while retrieving fasta"
#		print "The error returned was %s" % e
		errors.write(str(i) +', '+ str(i+1000) +', '+specid +'\n')
#	print record
	cur_percent = (float(i)/float(count))*100

	cur_percent = math.floor(cur_percent)
	if cur_percent > percent:
		percent = cur_percent
		print str(percent) + '% in ' + str(time.time()-t0) + ' seconds'

	if record:
	        for id in record["IdList"] :
			out.write(str(id)+'\n')
			outc +=1

errors.close()
while os.stat('ncbi_pid.err')[6]!=0:
	print '-------------------------'
	print os.stat('ncbi_pid.err')[6]
	errors = open('ncbi_pid.err', 'r')
	errs = {}
	for line in errors:
		print line
		line = line.strip().split(', ')
		errs[line[0]] = line[2]
	errors.close()
	errors = open('ncbi_pid.err', 'w')
	
	for err in errs:
		record = None
	        #return the results 1000 ids at a time
        	handle = Entrez.esearch(db="protein", term="txid"+errs[err]+"[Organism:exp]", retmax = "1000", retstart=err)

	        try:
        	        record = Entrez.read(handle)
	        except:
        	        print "An error occurred while retrieving fasta :-("
#               	print "The error returned was %s" % e
	                errors.write(str(err) +', '+ str(int(err)+1000) +', '+errs[err] +'\n')

	        if record:
	                for id in record["IdList"] :
        	                out.write(str(id)+'\n')
                	        outc +=1
	errors.close()

out.close()
print outc
print time.time() - t0, ' seconds to retrieve pids'
