3
y^|                 @   s|   d dl mZmZmZmZ d dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZejeZG dd dZdS )    )absolute_importdivisionprint_functionunicode_literalsNc               @   s(   e Zd ZdZdd Zdd Zdd ZdS )	Trinity_fasta_parsera  
    Parses a Trinity.fasta file and stores the transcript name, sequence, and node path info.

    Instance member:

        trinity_gene_to_isoform_seqs : (defaultdict(list)) stores key,val of transcript_name,path_struct

        where path_struct has structure:
             {
                 'transcript_name' : accession,
                 'path' : path_str,
                 'seq' : sequence
             }
    c             C   s  t jt| _t }t|}d}d}x|D ]~}|j }|d dkr|dkr|dkrtj|j	dj
 }||krtjjdj| n| j|| d||< |}d}q*||7 }q*W |dkrtj|j	dj
 }||krtjjdj| n| j|| d||< W d Q R X d S )N r   >zutf-8z-warning, ignoring duplicate sequence entry {}T)collectionsdefaultdictlisttrinity_gene_to_isoform_seqsdictopenrstriphashlibsha224encode	hexdigestsysstderrwriteformatadd_trinity_seq_entry)selfZtrinity_fasta_filenameseenfhheadersequencelineZshaval r   X/broad/hptmp/bhaas/trinityrnaseq/Analysis/SuperTranscripts/pylib/Trinity_fasta_parser.py__init__"   s.    

zTrinity_fasta_parser.__init__c       	      C   s   t jd|}|stdj||jd}t jd|}|sFtdj||jd}t jdd|}||krttdj|| j| }|||d	}|j| d
S )aU  
        entry looks like so:
        >TRINITY_DN16_c0_g1_i2 len=266 path=[1:0-48 27:49-49 28:50-50 27:51-51 28:52-52 27:53-53 28:54-54 27:55-55 28:56-56 27:57-57 28:58-58 27:59-59 28:60-60 27:61-61 29:62-265] [-1, 1, 27, 28, 27, 28, 27, 28, 27, 28, 27, 28, 27, 28, 27, 29, -2]
        CTGTTGTGTGGGGGGTGCGCTTGTTTTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTC
        TCAAGTTGATTCCTCCATGTTGCTTTACAGAGACCTGCCAACTACCCAGGAATGTAAAAG
        CATTCATAGTATTTGTCTAGTAGAGATGCTGTATGAAAAATGCCAAAACCAAAAAGAGAA
        AGAAGGAAAGAGAGATAGATAGATGACATAGATGACGGATGGATGGGTGGGTGGGTGGAT
        GGATGGATGGATGGATGGAGGGGGGC
        z^>(\S+)z-Error, cannot parse accession from header: {}   zpath=\[([^\]]+)\]z5Error, cannot parse path info from header of line: {}z_i\d+$r   z<Error, couldn't remove isoform ID from Trinity accession: {})transcript_namepathseqN)researchRuntimeErrorr   groupsubr   append)	r   r   r   mZ	accessionZpath_strgene_idZisoform_list
iso_structr   r   r    r   H   s     


z*Trinity_fasta_parser.add_trinity_seq_entryc             C   s   | j S )N)r   )r   r   r   r     get_trinity_gene_to_isoform_infom   s    z5Trinity_fasta_parser.get_trinity_gene_to_isoform_infoN)__name__
__module____qualname____doc__r!   r   r/   r   r   r   r    r      s   &%r   )
__future__r   r   r   r   osr   r&   loggingargparser	   numpytimer   	getLoggerr0   loggerr   r   r   r   r    <module>   s   
