Friday 29 May 2015

Parsing large files containing multiple contigs (contigs contain details about genes)

The following Python source code counts the total number of contigs present in a file and, for each contig, extracts the number of genes, coding sequences (CDS), and mRNA features. This program works on GenBank (.gbk) files downloaded from NCBI.

#Nadia Baig
# Load the whole GenBank file and cut it into one text record per contig.
# The mode is passed as its own argument: with "...chromosome.gbk,r" the ",r"
# became part of the file name and open() looked for a non-existent file.
# The `with` block closes the handle as soon as the text has been read.
with open("C:\\Users\\XYZ\\Desktop\\chromosome.gbk", "r") as fr:  # Path and file name
    a = fr.read()

# Every record ends with a line holding only "//".  Splitting on that
# terminator leaves an empty piece after the final record, which would be
# reported as a contig without genes, so blank pieces are discarded.
b = [record for record in a.split("//\n") if record.strip()]
length = len(b)  # number of contigs found in the file

# Count the "gene" feature lines of every contig.
# genes[k] is the number of gene features in the k-th contig of b.
genes = []
for base in b:
    # The counter starts from zero for each contig; carrying it over from the
    # previous contig would store running totals instead of per-contig counts,
    # which inflates the grand total and hides contigs that have no genes.
    gene_count = 0
    for bb in base.split("\n"):
        # A feature line starts with five spaces and the feature key padded
        # to a fixed width, so the padded prefix identifies "gene" lines.
        if bb.startswith("     gene            "):
            gene_count += 1
    genes.append(gene_count)

print(genes)  # printing information about contigs having genes

total = genes.count(0)  # counting contigs having no genes
Tgene = sum(genes)  # gene features summed over all contigs
print("total number of genes present in file",Tgene)   #total number of genes present in file
print("total number of contigs having no genes are",total)

# Report how many CDS and mRNA features are attached to genes.

# CDS features, contig by contig
cds_prefix = "     CDS             "  # padded feature key as it appears at the start of a line
cds = [
    sum(1 for row in record.split("\n") if row.startswith(cds_prefix))
    for record in b
]
i = 0  # counter is left at zero, as after the last contig

print("details of CDS")      
print(cds)  # one CDS count per contig, in file order
nad = sum(cds)  # CDS features over the whole file
print('total number of attached cds',nad)
# mRNA features, contig by contig
mrna_prefix = "     mRNA            "  # padded feature key as it appears at the start of a line
is_mrna_line = lambda row: row.startswith(mrna_prefix)
mrna = [len(list(filter(is_mrna_line, record.split("\n")))) for record in b]
i = 0  # counter is left at zero, as after the last contig

print("details of genes having attached mRNA")      
print(mrna)  # one mRNA count per contig, in file order
msum = sum(mrna)  # mRNA features over the whole file
print('total number of attached mrnas',msum)
        
        
   
      
        

     

No comments:

Post a Comment