3
y^>                 @   s   d dl mZmZmZmZ d dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlZd dlZejeZejej  G dd dZdS )    )absolute_importdivisionprint_functionunicode_literalsN)GraphCycleExceptionc               @   sr   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	e
dd Ze
dd Ze
dd Ze
dddZdS )Gene_splice_modelerz
    Builds supertranscipts.

    object instance members:

        gene_id : str

        alignments : list of Node_alignment objects

    c             C   sP   || _ t | _tjdj| x,|D ]$}|j }tjj|}| jj	| q$W dS )a?  
        initialize alignments list with simple single 'alignment' objects with
        each path as an individual alignment with just its path nodes.

        params:

        gene_id : str

        node_path_obj_list : list of Node_path objects, each Node_path corresponding to an individual Trinity isoform

        zGene_splice_modeler inputs: {}N)
gene_idlist
alignmentsloggerdebugformatZget_transcript_nameNode_alignmentZget_single_seq_node_alignmentappend)selfr   node_path_obj_listZnode_path_objtranscript_nameZalignment_obj r   W/broad/hptmp/bhaas/trinityrnaseq/Analysis/SuperTranscripts/pylib/Gene_splice_modeler.py__init__&   s    
zGene_splice_modeler.__init__c             C   s   | j S )N)r   )r   r   r   r   get_gene_idA   s    zGene_splice_modeler.get_gene_idc             C   s8   | j  s,y| j S  tk
r(   | j S X n| j S dS )a'  
        method to construct the super transcript.

        Tries 2 approaches:
            a.  If there isn't an obvious repetitive node structure and so the graph formas a DAG,
                we build a splice graph and perform topological sorting of the nodes.
            b.  If there is some repetitive structure, we resort to performing a multiple alignment-based method to
                organize relationships among nodes in isoforms, and the multiple alignment produces the linear ordering
                for the supertranscript.

        N)alignment_contains_repeat_nodetopological_order_splice_modelr   multiple_alignment_splice_model)r   r   r   r   build_splice_modelE   s    z&Gene_splice_modeler.build_splice_modelc             C   sZ   xT| j D ]J}t }x>td|j D ],}|j|}|j }||krDdS |j| q"W qW dS )Nr   TF)r
   setrangewidthZget_representative_column_node
get_loc_idadd)r   	alignmentloc_idsinode_objloc_idr   r   r   r   `   s    
z2Gene_splice_modeler.alignment_contains_repeat_nodec             C   s*  t jd | j }tj|}x| jD ]}t jdt|  |j d }|j d }t jdt|  xtdt	|D ]}|| }|j
 }|j|||j }	t jdt|	  |dkr||d  }
|j||
j
 |
j }|	j| |t	|d k rv||d  }|j||j
 |j }|	j| qvW q&W t jdt|  tjj|j }t jdt|  t }x,tdt	|D ]}|| j
 }|||< qpW t }t }xj| jD ]`}|j|j d  d	d
 |D }x.|j d D ]}|j
 }|| }|||< qW |j| qW tj|||}t jdt|  |S )zW
        Build supertranscript using simpler topological sorting of the nodes.
        z 	using topological sort method.
z1topological_order_splice_model, input alignment: r   z+topological_order_splice_model, node list: zgeneric node:    zBefore sorting nodes: zTopologically sorted nodes: c             S   s   g | ]}d qS )Nr   ).0r"   r   r   r   
<listcomp>   s    zFGene_splice_modeler.topological_order_splice_model.<locals>.<listcomp>zSplice graph model: )r   r   r   TGraphr
   strZget_aligned_nodesget_transcript_namesr   lenr   get_nodeget_seqadd_prev_nodeadd_next_nodeTopological_sortZtopologically_sortget_all_nodesdictr	   r   r   )r   r   Zgraphr    	node_listr   r"   r#   r$   Zgeneric_nodeprev_node_objZprev_generic_nodenext_node_objZnext_generic_nodeZtopologically_sorted_nodesZaligned_loc_id_posZnew_alignmentsZtranscript_idsZnew_alignmentnodeZnew_idxZsplice_graph_modelr   r   r   r   n   sR    


z2Gene_splice_modeler.topological_order_splice_modelc             C   s\  t jd | j}t|dkr$|d S tj| j}t jdt|  xt|dkr2x"tdt|D ]}d|| |< qbW tt	j
|}t|}t|| }|| }|| }|| }	tj||	}
t }x0tdt|D ]}|||fkr|j||  qW |j|
 |}t jdt|  tj|}t jdt|  qDW t|dkrTtdjt||d S )z
        Multiple alignment algorithm for dealing with repeat nodes:
        For each best matching pair of transcripts (or aligned transcripts),
        perform alignment, and replace aligned pair with a single alignment object.
        z	using mult alignment method.
r%   r   zSimilarity matrix:
z
Updated alignments:
zHError, should only have one alignment but have {} alignments after merge)r   r   r
   r+   r   compute_similarity_matrixr)   r   intnumpyargmaxmerge_alignmentsr	   r   RuntimeErrorr   )r   r
   Zsimilarity_matrixr"   Zbest_pair_idxnum_alignmentsZbest_pair_idx_1Zbest_pair_idx_2align_aalign_bZalign_mergedZnew_alignment_listr   r   r   r      s8    


z3Gene_splice_modeler.multiple_alignment_splice_modelc       	      C   s   t | }tj||fdd}x`td|d D ]N}| | }x@t|d |D ].}| | }tjj||}t |}||| |< qFW q*W |S )zc
        similarity matrix indicates number of shared nodes between each pair of isoforms.
        int_)dtyper   r%   )r+   r:   zerosr   r   Zcompute_number_common_nodes)	Zalignments_listr>   Z
sim_matrixr"   Zalign_ijZalign_jZcommon_nodesZnum_common_nodesr   r   r   r8      s    z-Gene_splice_modeler.compute_similarity_matrixc             C   s  t jdj| | t| j }t|j }tj||sFtdj||| j }|j }tjj	||}x&t
d|d D ]}d|| d d< qtW x&t
d|d D ]}d|d | d< qW xt
d|d D ]}xt
d|d D ]}tj| |d ||d }	||d  |d  d |	 }
|| |d  d }||d  | d }|	dkr||
|kr||
|kr||
|| | d< d	|| | d< q||kr||| | d< d|| | d< q||| | d< d|| | d< qW qW |}|}|}|}t }x|dks|dkr|| | }| j|d }|j|d }t }|d }|d	krX|d8 }|d8 }|| }n|dkr|d8 }||7 }xtt
dt|D ]}|jd
 qW nP|dkr|d8 }x"t
dt|D ]}|jd
 qW ||7 }ntdj|||j| qW |j  t jdt|  | j |j  }t }xFt
dt|D ]4}t }x|D ]}|j||  qHW |j| q8W t jdt|  tj| j ||}t jdt|  |S )z
        Computes a mismatch-free multiple alignment (just matches and gaps) between two Node_alignment objects

        returns single Node_alignment object containing the contents of aligned align_a and align_b as aligned.
        
        zMerging alignments {} and {}zEError, transcripts in alignments to merge are not disjoint: {} and {}r%   ZDEL_Br   btZDEL_AscoreZDIAGNzbt: ({},{}), bt_dir not definedzMerged alignment nodes list: zmerged alignment node matrix:
zmerged alignment obj:
)r   r   r   r   r*   
isdisjointr=   r   	DP_matrixZbuild_DP_matrixr   r   get_match_scorer	   Zget_node_LIST_at_column_posr+   r   reverser)   r   r   )r?   r@   Ztranscript_names_align_AZtranscript_names_align_BZwidth_aZwidth_bZ	dp_matrixr"   rD   Zscore_cell_matchZ
score_diagZscore_del_aZscore_del_bmax_iZmax_jZall_merged_alignment_nodes_listZscore_structZnodes_align_aZnodes_align_bZalign_nodesZbt_dirxZmerged_transcript_name_listZnode_obj_matrixrowZnode_obj_listZmerged_alignment_objr   r   r   r<      s    	






z$Gene_splice_modeler.merge_alignmentsc             C   sD   | j |}|j |}tjj|}tjj|}tj||r<dS dS dS )z]
        just determines if indices in two transcripts have the same node identifier
        r%   r   N)Zget_node_set_at_column_posr   Zget_node_loc_idsr   intersection)r?   Zidx_ar@   Zidx_bZ
node_set_aZ
node_set_br   r   r   rI     s    

z#Gene_splice_modeler.get_match_scored   c       
      C   s   t |j }t||d  }d}d}xX||k r|x:|D ]2}|| |t|||  }	||d |	 d 7 }q4W |d7 }||7 }q&W |jdj| | dS )zW
        writes the multiply aligned isoform sequences to an output filehandle
        r    	
z
// {}

{}
N)r	   keysr+   minwriter   )
	gene_namemalign_dictofhZalign_widthZtranscript_namesZalignment_lengthZalign_startZ
align_textr   Zalign_regionr   r   r   write_malign  s    

z Gene_splice_modeler.write_malignN)rO   )__name__
__module____qualname____doc__r   r   r   r   r   r   staticmethodr8   r<   rI   rY   r   r   r   r   r      s   >; r   )
__future__r   r   r   r   ossysreloggingargparsecollectionsr:   timer(   TNode	Node_pathr   r   r0   rH   	getLoggerrZ   r   
addHandlerNullHandlerr   r   r   r   r   <module>   s    
