3
ëyû^ÚA  ã               @   s„   d dl mZmZmZmZ d dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZejeƒZG dd„ dƒZdS )é    )Úabsolute_importÚdivisionÚprint_functionÚunicode_literalsNc               @   sö   e Zd ZdZdZdd„ Zdd„ Zdd„ Zd	d
„ Zdd„ Z	e
dd„ ƒZe
dd„ ƒZe
dd„ ƒZe
dd„ ƒZdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd „ Zd!d"„ Zd#d$„ Zd%d&„ Zd'd(„ Zd)d*„ Zd+d,„ Zd-d.„ Zd8d0d1„Zd2d3„ Zd4d5„ Zd6d7„ ZdS )9ÚNode_alignmentaë  
    Object has two members:

        transcript_names = [ transA,
                             transB,
                             transC,
                             ...
                             ]

        aligned_nodes = [ [transA_node_1, transA_node_2, ... ],
                          [transB_node_1, transB_node_2, ... ],
                          [ None,         transC_node_1, ... ],  
                        ]

    Note, can have None at node positions to include gaps.

    Nc             C   s   || _ || _|| _d S )N)Úgene_idÚtranscript_namesÚaligned_nodes)Úselfr   Ztranscript_name_listÚnode_obj_matrix© r   úR/broad/hptmp/bhaas/trinityrnaseq/Analysis/SuperTranscripts/pylib/Node_alignment.pyÚ__init__)   s    zNode_alignment.__init__c             C   s   | j S )N)r   )r
   r   r   r   Úget_gene_id.   s    zNode_alignment.get_gene_idc             C   s
   || _ d S )N)r   )r
   r   r   r   r   Úset_gene_id1   s    zNode_alignment.set_gene_idc             C   s   | j S )N)r   )r
   r   r   r   Úget_transcript_names4   s    z#Node_alignment.get_transcript_namesc             C   s   | j S )N)r	   )r
   r   r   r   Úget_aligned_nodes8   s    z Node_alignment.get_aligned_nodesc             C   s>   t ƒ }x| jƒ D ]}|j|ƒ qW | jƒ }t||g|gƒ}|S )zÈ
        Factory method:
           constructs a Node_alignment object from a Node_path object

           mostly just reshaping the info for use with the multiple alignment methods.
        
        )ÚlistÚget_pathÚappendÚget_transcript_namer   )Zpath_objÚ	node_listÚnode_objÚtranscript_namer
   r   r   r   Úget_single_seq_node_alignment<   s    
z,Node_alignment.get_single_seq_node_alignmentc             C   s8   t j| ƒ}t j|ƒ}t j|ƒ}t j|ƒ}tj||ƒ}|S )zT
        given to Node_alignment objects, counts the number of shared nodes
        )r   Úget_node_setÚget_node_loc_idsÚsetÚintersection)Zalign_AZalign_BÚ
node_set_aÚ
node_set_bÚcommon_nodesr   r   r   Úcompute_number_common_nodesP   s    



z*Node_alignment.compute_number_common_nodesc             C   s*   t ƒ }x| D ]}|jƒ }|j|ƒ qW |S )zb
        private static method
        gets the list of loc_id among all nodes in the set
        )r   Ú
get_loc_idÚadd)Únode_setZloc_ids_setÚnodeÚloc_idr   r   r   r   a   s
    
zNode_alignment.get_node_loc_idsc             C   sb   t | ƒ}| jƒ }tƒ }xFtd|ƒD ]8}x2td|ƒD ]$}| j| | }|dk	r2|j|ƒ q2W q"W |S )zW
        extracts a list of unique Node objects from the Node_alignment object
        r   N)ÚlenÚwidthr   Úranger	   r$   )Z	align_objZ	num_transZalignment_widthr%   Z	align_numZ	align_posr   r   r   r   r   p   s    zNode_alignment.get_node_setc             C   sB   t ƒ }x6tdt| ƒƒD ]$}| j| | }|dk	r|j|ƒ qW |S )z\
        At a given column of the Node_alignment, extracts the list of unique nodes
        r   N)r   r*   r(   r	   r$   )r
   Úcol_posÚ	node_objsÚir   r   r   r   Úget_node_set_at_column_pos„   s    	z)Node_alignment.get_node_set_at_column_posc             C   s   t | j|ƒƒ}|d S )Nr   )r   r.   )r
   r+   r   r   r   r   Úget_representative_column_node•   s    z-Node_alignment.get_representative_column_nodec             C   s:   t ƒ }x.tdt| ƒƒD ]}| j| | }|j|ƒ qW |S )Nr   )r   r*   r(   r	   r   )r
   r+   r,   r-   r   r   r   r   Úget_node_LIST_at_column_posœ   s
    z*Node_alignment.get_node_LIST_at_column_posc             C   s@   | j |ƒ}tƒ }x*|D ]"}|d kr.|jdƒ q|jdƒ qW |S )NFT)r0   r   r   )r
   r+   r   Zoccupancy_listr&   r   r   r   Ú get_node_occupancy_at_column_pos¥   s    

z/Node_alignment.get_node_occupancy_at_column_posc             C   s,   x&t dt| ƒƒD ]}| j| j|ƒ qW d S )Nr   )r*   r(   r	   r   )r
   r   r-   r   r   r   Úappend_node_to_each_entry³   s    z(Node_alignment.append_node_to_each_entryc             C   sJ   xDt dt| ƒƒD ]2}|| dkr2| j| j|ƒ q| j| jd ƒ qW d S )Nr   T)r*   r(   r	   r   )r
   r   Zoccupancy_patternr-   r   r   r   Ú*append_node_according_to_occupancy_pattern¸   s    z9Node_alignment.append_node_according_to_occupancy_patternc             C   sH   t | ƒ}t |ƒ|krtdƒ‚x&td|ƒD ]}| j| j|| ƒ q(W d S )Nz.Error, column size differs from num_alignmentsr   )r(   ÚRuntimeErrorr*   r	   r   )r
   Zcolumn_node_listÚnum_alignmentsr-   r   r   r   Ú
add_columnÃ   s
    zNode_alignment.add_columnc             C   s
   t | jƒS )zD
        number of transcripts represented in the alignment
        )r(   r   )r
   r   r   r   Ú__len__Ì   s    zNode_alignment.__len__c             C   s   t | jd ƒS )z<
        width of the alignment (number of columns)
        r   )r(   r	   )r
   r   r   r   r)   Ó   s    zNode_alignment.widthc             C   sì   t | jƒ}dj|dj| jƒƒ}| jƒ }d}xŽtd||ƒD ]~}xptd|ƒD ]b}| j| }| j| }|dj|ƒ7 }x2t||| ƒD ] }	|	|krŒP |dj||	 ƒ7 }q~W |d7 }qHW |d7 }q8W x,td|ƒD ]}| j|ƒ}
||
jƒ d 7 }qÆW |S )Nz/
# Alignment obj contains: {} transcripts: {}

ú,é
   r   z{}z	{}Ú
)	r(   r   ÚformatÚjoinr)   r*   r	   r/   ÚtoString)r
   Únum_transcriptsZret_textÚalign_widthZNODES_PER_LINEr-   Újr   Zaligned_nodes_entryÚxÚ	repr_noder   r   r   Ú__repr__Ú   s&    



zNode_alignment.__repr__c             C   s.  t | ƒ}| jƒ }tƒ }xtd|ƒD ]}|jg ƒ q"W t| jƒ | jƒ |ƒ}g }| jdƒ}x2td|ƒD ]$}| j|ƒ}||kr‚|j|ƒ |}qbW |j|ƒ t	j
dj|ƒƒ tƒ }	x~td|d ƒD ]l}||krd}
t |	ƒdkrætjj|	ƒ}
n|	d }
| j|d ƒ}|j|
|ƒ tƒ }	||k rº|	j| j|ƒƒ qºW |S )z:
        merge unbranched nodes into single nodes
        r   é   zBlock_breakpoints: {}N)r(   r)   r   r*   r   r   r   r   r1   ÚloggerÚdebugr;   ÚTNodeÚmerge_nodesr3   r/   )r
   r>   r)   r   r-   Zsqueezed_alignmentZblock_breakpointsZprev_col_node_setZnode_column_setZblocked_nodesZnode_to_addZblocked_node_occupancyr   r   r   Úsqueeze   s8    




zNode_alignment.squeezec             C   s  | j ƒ }d}tƒ }tƒ }tƒ }tƒ }x.|D ]&}d||< d||< tƒ ||< d||< q*W xbtd| jƒ ƒD ]N}	| j|	ƒ}
|
jƒ }t|ƒdkrštdt	|
ƒ ƒ‚|
j
ƒ }| j|	ƒ}t|ƒd }||
jƒ 7 }t|ƒ}xètdt|ƒƒD ]Ö}|| }|| dkrˆ||  dj|ddt	|ƒt	|ƒd	d
d	dj||ƒg	ƒ7  < ||  |7  < t|| ƒ}|}|t|ƒ d }||  |7  < || jdj|||ƒƒ qÜx(tdt|ƒƒD ]}||  d	7  < q˜W qÜW qfW dj|jƒ ƒ}d}x@|D ]8}|| }|| }dj|ƒ}|dj|t|ƒ||ƒ7 }qÒW ||||fS )NÚ r   z%Error, node seq of length zero: node=rD   Tú	ZTrinity_geneZexonÚ.ú+z!gene_id "{}"; transcript_id "{}"
z{}:{}-{}r:   ú z>{} len={} path=[{}]
{}
)r   Údictr   r*   r)   r/   Úget_seqr(   r4   Ústrr#   r1   r<   r;   r   Úvalues)r
   Ú	gene_namer   Úgene_seqZtranscript_to_gtf_linesZtranscript_to_malignÚtranscript_to_Trinity_fa_seqZtranscript_to_Trinity_fa_pathr   r-   r   Únode_seqÚnode_idÚnode_occupancyZ	pos_startZpos_endr@   Zcdna_seq_lenZrel_node_startZrel_node_endrA   Zgene_gtfZtrinity_fasta_textÚtranscript_seqÚ	path_listZpath_list_textr   r   r   Úto_gene_fasta_and_gtf4  sZ    





z$Node_alignment.to_gene_fasta_and_gtfc             C   s4   x.t d| jƒ ƒD ]}| j|ƒ}|jt|ƒƒ qW d S )Nr   )r*   r)   r/   Ú
set_loc_idrQ   )r
   r-   rB   r   r   r   Ú$reassign_node_loc_ids_by_align_order  s    
z3Node_alignment.reassign_node_loc_ids_by_align_orderFc             C   s  | j ƒ }| jƒ }tj|ƒ}tƒ }xltd|ƒD ]^}| j|ƒ}tjdj|j	ƒ ƒƒ |j
ƒ }	|jƒ }
|rndt|ƒ }
|j|	|
|jƒ ƒ}|j|ƒ q,W xT|D ]L}d }xBtd|ƒD ]4}|| d kr¨|d krÔ|j|g|| gƒ || }q¨W q”W tjdƒ x|D ]}tj|j	ƒ ƒ qôW |S )Nr   zrepr node: {}Zloc_zNew graph node listing:)r   r)   ÚTGraphr   r*   r/   rE   rF   r;   r=   Úget_transcriptsr#   rQ   Úget_noderP   r   Ú	add_edges)r
   rS   Úreset_node_idsr	   r)   Zrefined_tgraphZnew_node_listr-   rB   Útranscriptsr'   Únew_nodeZiso_node_alignmentÚprevr&   r   r   r   Úto_splice_graph‰  s0    




zNode_alignment.to_splice_graphc       
      C   s¼   | j ƒ }tƒ }x|D ]}d||< qW x’td| jƒ ƒD ]€}| j|ƒ}|jƒ }t|ƒdkrftdt|ƒ ƒ‚|j	ƒ }| j
|ƒ}x:tdt|ƒƒD ](}	||	 }||	 dkrˆ||  |7  < qˆW q4W |S )NrJ   r   z%Error, node seq of length zero: node=T)r   rO   r*   r)   r/   rP   r(   r4   rQ   r#   r1   )
r
   r   rU   r   r-   r   rV   rW   rX   r@   r   r   r   Úget_transcript_seqs¸  s     


z"Node_alignment.get_transcript_seqsc             C   sÐ   t ƒ }| jƒ }tƒ }x@|D ]8}|| }||krJtjjdj|ƒƒ |j|ƒ d||< qW |rÈtƒ }tƒ }xHt	dt
| jƒƒD ]4}| j| }	| j| }
|	|krx|j|	ƒ |j|
ƒ qxW || _|| _| jƒ  dS dS d S )NzWwarning, transcript polishing yielded duplicate seq entry... targeting {} for removal.
Tr   F)r   rg   rO   ÚsysÚstderrÚwriter;   r$   r   r*   r(   r   r	   r   Ú!remove_empty_aligned_node_columns)r
   Ztranscripts_removeZtranscript_seqsÚseenZtranscript_accrY   Zrevised_transcript_namesZrevised_aligned_nodesr-   r   r	   r   r   r   Úremove_redundant_sequencesÖ  s.    




z)Node_alignment.remove_redundant_sequencesc             C   s~   t ƒ }x:tdt| ƒƒD ](}t | j|ƒƒ}t|ƒdkr|j|ƒ qW t|ƒdkrz|jƒ  x"|D ]}x| jD ]
}||= qhW q\W d S )Nr   )r   r*   r(   r.   r   Úreverser	   )r
   Zcolumn_indices_to_remover+   r   ÚidxZnode_rowr   r   r   rk   ú  s    
z0Node_alignment.remove_empty_aligned_node_columns)F) Ú__name__Ú
__module__Ú__qualname__Ú__doc__ZGAPr   r   r   r   r   Ústaticmethodr   r"   r   r   r.   r/   r0   r1   r2   r3   r6   r7   r)   rC   rI   r[   r]   rf   rg   rm   rk   r   r   r   r   r      s8   		&4M
/$r   )Ú
__future__r   r   r   r   Úosrh   ÚreÚloggingÚargparseÚcollectionsÚnumpyÚtimerG   r^   Ú	getLoggerrp   rE   r   r   r   r   r   Ú<module>   s   
