1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
|
/* ***************************************************************************
*
* KisSplice
* de-novo calling alternative splicing events from RNA-seq data.
*
* ***************************************************************************
*
* Copyright INRIA
* contributors : Vincent Lacroix
* Pierre Peterlongo
* Gustavo Sacomoto
* Vincent Miele
* Alice Julien-Laferriere
* David Parsons
*
* pierre.peterlongo@inria.fr
* vincent.lacroix@univ-lyon1.fr
*
* This software is a computer program whose purpose is to detect alternative
* splicing events from RNA-seq data.
*
* This software is governed by the CeCILL license under French law and
* abiding by the rules of distribution of free software. You can use,
* modify and/ or redistribute the software under the terms of the CeCILL
* license as circulated by CEA, CNRS and INRIA at the following URL
* "http://www.cecill.info".
* As a counterpart to the access to the source code and rights to copy,
* modify and redistribute granted by the license, users are provided only
* with a limited warranty and the software's author, the holder of the
* economic rights, and the successive licensors have only limited
* liability.
* In this respect, the user's attention is drawn to the risks associated
* with loading, using, modifying and/or developing or reproducing the
* software by the user in light of its specific status of free software,
* that may mean that it is complicated to manipulate, and that also
* therefore means that it is reserved for developers and experienced
* professionals having in-depth computer knowledge. Users are therefore
* encouraged to load and test the software's suitability as regards their
* requirements in conditions enabling the security of their systems and/or
* data to be ensured and, more generally, to use and operate it in the
* same conditions as regards security.
*
* The fact that you are presently reading this means that you have had
* knowledge of the CeCILL license and that you accept its terms.
*/
// ===========================================================================
// Include Libraries
// ===========================================================================
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <memory>
// ===========================================================================
// Include Project Files
// ===========================================================================
#include "debug.h"
#include "NGraph.h"
#include "CGraph.h"
#include "CycleCompression.h"
#include "SplitBcc.h"
#include "Utils.h"
// ===========================================================================
// Define Miscellaneous Functions
// ===========================================================================
void read_edges_and_nodes( char* edges_fname, char* nodes_fname, const int k,
vector<LabelledCEdge>& allEdges, vector<char*>& seqs )
{
File edge_file = File::open_path(edges_fname, "r");
File node_file = File::open_path(nodes_fname, "r");
read_edge_file( edge_file.get_ptr(), allEdges );
read_node_file( node_file.get_ptr(), seqs, k );
}
/**
 * Entry point of the run_modules tool.
 *
 * Command line: edge_file node_file k_value [path_to_output [--output-context]]
 *
 * Steps performed:
 *   1. read the edge and node files and build the CGraph;
 *   2. split the graph into biconnected components (find_bcc);
 *   3. for each component: build an NGraph, compress linear paths, compress
 *      bubbles, recompress linear paths, then write its edges/nodes to the
 *      current "<base>_all_edges_bcc_<n>" / "<base>_all_nodes_bcc_<n>" files
 *      (only when at least 4 nodes remain after compression).
 *
 * Files created, all prefixed by base_name (default "./bcc/graph"):
 *   _contents_edges_bcc, _contents_nodes_bcc, _all_edges_bcc_<n>,
 *   _all_nodes_bcc_<n>, _all_log_bcc, _info_snp_bcc, _info_bcc.
 *
 * @return EXIT_SUCCESS when all components were processed; EXIT_FAILURE on a
 *         usage error, an invalid k value, or when no biconnected component
 *         was found.
 */
int main( int argc, char** argv )
{
  const char* base_name = "./bcc/graph";
  bool output_context = false;

  if (argc < 4) {
    fprintf( stderr, "Wrong number of arguments!\n" );
    fprintf( stderr, "Usage: ./run_modules edge_file node_file k_value path_to_output [--output-context]\n" );
    // A usage error is a failure: callers must not mistake it for a completed run.
    return EXIT_FAILURE;
  }
  if (argc >= 5) {
    base_name = argv[4];
  }
  if ( argc == 6 && strcmp(argv[5], "--output-context") == 0 )
  {
    output_context = true;
  }

  // atoi() yields 0 on non-numeric input; a k value must be strictly positive.
  const int k_value = atoi( argv[3] );
  if (k_value <= 0) {
    fprintf( stderr, "Invalid k_value '%s': expected a positive integer\n", argv[3] );
    return EXIT_FAILURE;
  }

  // Read input files
  std::vector<char*> seqs;
  std::vector<LabelledCEdge> allEdges;
  read_edges_and_nodes( argv[1], argv[2], k_value, allEdges, seqs );

  // The sequences are released with delete[] on every exit path past this point.
  const auto release_sequences = [&seqs]() {
    for (char* seq : seqs) {
      delete[] seq;
    }
    seqs.clear();
  };

  // Creating & Initializing the graph with the edges reads
  CGraph graph {static_cast<int>(seqs.size()), allEdges, k_value};

  // Decompose graph into BCCs
  fprintf(stdout, "Searching biconnected components...\n");
  std::vector<std::vector<CEdge>> bcc = find_bcc(graph);
  fprintf(stdout, "Number of biconnected components found: %zu\n\n", bcc.size());
  graph.destroy_adj_list();
  const int bcc_count = static_cast<int>(bcc.size());

  // P1 - descriptor files
  File contents_edge_file = File::open_path_sprintf("%s_contents_edges_bcc", "w", base_name);
  File contents_node_file = File::open_path_sprintf("%s_contents_nodes_bcc", "w", base_name);
  int lines_written_edges = 0;
  int lines_written_nodes = 0;
  // Each contents file starts with the initial counter value (0).
  contents_edge_file.fprintf("%d\n", lines_written_edges);
  contents_node_file.fprintf("%d\n", lines_written_nodes);

  // P2 - data files
  // Optimization : write one bcc data per file, unless this would exceed NUMBEROFFILES in which case multiple are written per file.
  // FIXME find a less convoluted way to deal with that
  int output_bcc_file_count;
  int records_per_bcc_file;
  if (bcc_count == 0) {
    std::fprintf(stderr, "Warning: No BCC to handle, stopping\n");
    release_sequences();
    return EXIT_FAILURE;
  } else if (bcc_count == 1) {
    output_bcc_file_count = 1;
    records_per_bcc_file = 1;
  } else {
    output_bcc_file_count = std::min(bcc_count, NUMBEROFFILES);
    if (output_bcc_file_count <= 1) {
      // Only one output file allowed: everything goes into it. This also
      // avoids dividing by (output_bcc_file_count-1) == 0 below.
      output_bcc_file_count = 1;
      records_per_bcc_file = bcc_count;
    } else {
      // this division may have a remainder, then extra file (+1) required
      records_per_bcc_file = bcc_count / (output_bcc_file_count-1);
    }
  }

  int current_bcc_file_id = 0;
  File current_total_edge_file = File::open_path_sprintf("%s_all_edges_bcc_%d", "w", base_name, current_bcc_file_id+1);
  File current_total_node_file = File::open_path_sprintf("%s_all_nodes_bcc_%d", "w", base_name, current_bcc_file_id+1);
  File total_log_file = File::open_path_sprintf("%s_all_log_bcc", "w", base_name);
  File info_snp_file = File::open_path_sprintf("%s_info_snp_bcc", "w", base_name);

  // P3 - ascii info file to get all parameters to read the big files
  File info_file = File::open_path_sprintf("%s_info_bcc", "w", base_name);
  info_file.fprintf("%zu\n", bcc.size());
  info_file.fprintf("%d\n", records_per_bcc_file);

  // For each BCC, ...
  for ( int i = 0 ; i < bcc_count ; i++ )
  {
    // Components are numbered from 1 in every output and log message.
    const int component_id = i + 1;
    fprintf(stdout, "Processing component %d...\n", component_id);

    // Build uncompact graph corresponding to current BCC
    auto component = NGraph(graph, seqs, allEdges, bcc[i]);
    fprintf(stdout, "Initial size: %d nodes.\n", component.getNbNodes() );

    // Compress linear paths of size > 2
    fprintf(stdout, "Compressing linear paths...\n");
    int original_size = component.getNbNodes();
    component.compress_all_paths();
    fprintf( stdout, "Number of compressed nodes: %d.\n", original_size - component.getNbNodes() );

    // Compress bubbles
    fprintf( stdout, "Compressing simple bubbles...\n" );
    int n_compressed_bubbles = 0;
    component.compress_all_bubbles( &n_compressed_bubbles, total_log_file.get_ptr(), component_id, output_context );
    fprintf( stdout, "Number of compressed bubbles: %d.\n", n_compressed_bubbles );

    // Record the component's output count in the snp info file, only when non-zero.
    const int nbsnps = component.getNbOutput();
    if (nbsnps) {
      info_snp_file.fprintf("%d\t%d\n", component_id, nbsnps);
    }

    // Recompress linear paths of size > 2
    fprintf(stdout, "Recompressing linear paths...\n");
    original_size = component.getNbNodes();
    component.compress_all_paths();
    fprintf( stdout, "Number of compressed nodes: %d.\n", original_size - component.getNbNodes() );

    if (component.getNbNodes() >= 4) {
      // IO optimization
      component.print_graph_edges_new( &lines_written_edges, contents_edge_file.get_ptr(),current_total_edge_file.get_ptr(), NULL, NULL, false );
      component.print_graph_nodes_new( &lines_written_nodes, contents_node_file.get_ptr(),current_total_node_file.get_ptr(), NULL, NULL, false );
    } else {
      // Component too small to be written: only repeat the current counters
      // so that the contents files still get one line for this component.
      contents_edge_file.fprintf("%d\n", lines_written_edges);
      contents_node_file.fprintf("%d\n", lines_written_nodes);
    }

    fprintf(stdout, "Final size: %d nodes.\n", component.getNbNodes());
    info_file.fprintf("%d %d\n", component_id, component.getNbNodes()); // output size of bcc for further use
    fprintf(stdout, "Done!\n\n");

    // IO optimization: check if output bcc file has to be changed - considering (i+1) and not i
    const bool current_file_is_full = (component_id % records_per_bcc_file == 0);
    const bool more_files_available = ((current_bcc_file_id+1) < output_bcc_file_count);
    if (current_file_is_full && more_files_available) {
      current_bcc_file_id += 1;
      current_total_edge_file = File::open_path_sprintf("%s_all_edges_bcc_%d", "w", base_name, current_bcc_file_id+1);
      current_total_node_file = File::open_path_sprintf("%s_all_nodes_bcc_%d", "w", base_name, current_bcc_file_id+1);
      lines_written_edges = 0;
      lines_written_nodes = 0;
      // write an additional 0 in the first place
      contents_edge_file.fprintf("%d\n", lines_written_edges);
      contents_node_file.fprintf("%d\n", lines_written_nodes);
    }
  }

  release_sequences();
  return EXIT_SUCCESS;
}
|