File: BubbleEnumeration.cpp

package info (click to toggle)
kissplice 2.6.7-2
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 16,752 kB
sloc: cpp: 8,783; python: 1,618; perl: 389; sh: 72; makefile: 18
file content (856 lines) | stat: -rw-r--r-- 28,878 bytes
parent folder | download | duplicates (5)
/* ***************************************************************************
 *
 *                              KisSplice
 *      de-novo calling alternative splicing events from RNA-seq data.
 *
 * ***************************************************************************
 *
 * Copyright INRIA
 *  contributors :  Vincent Lacroix
 *                  Pierre Peterlongo
 *                  Gustavo Sacomoto
 *                  Vincent Miele
 *                  Alice Julien-Laferriere
 *                  David Parsons
 *
 * pierre.peterlongo@inria.fr
 * vincent.lacroix@univ-lyon1.fr
 *
 * This software is a computer program whose purpose is to detect alternative
 * splicing events from RNA-seq data.
 *
 * This software is governed by the CeCILL license under French law and
 * abiding by the rules of distribution of free software. You can  use,
 * modify and/ or redistribute the software under the terms of the CeCILL
 * license as circulated by CEA, CNRS and INRIA at the following URL
 * "http://www.cecill.info".

 * As a counterpart to the access to the source code and  rights to copy,
 * modify and redistribute granted by the license, users are provided only
 * with a limited warranty  and the software's author,  the holder of the
 * economic rights,  and the successive licensors  have only  limited
 * liability.

 * In this respect, the user's attention is drawn to the risks associated
 * with loading,  using,  modifying and/or developing or reproducing the
 * software by the user in light of its specific status of free software,
 * that may mean  that it is complicated to manipulate,  and  that  also
 * therefore means  that it is reserved for developers  and  experienced
 * professionals having in-depth computer knowledge. Users are therefore
 * encouraged to load and test the software's suitability as regards their
 * requirements in conditions enabling the security of their systems and/or
 * data to be ensured and,  more generally, to use and operate it in the
 * same conditions as regards security.
 *
 * The fact that you are presently reading this means that you have had
 * knowledge of the CeCILL license and that you accept its terms.
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/resource.h>

#include <algorithm>
#include <climits>
#include <cstring>
#include <functional>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <vector>

#include "LabelledCEdge.h"
#include "WeightedDigraph.h"
#include "CGraph.h"
#include "BubbleEnumeration.h"
#include "Utils.h"
#define MAX_DIST 10000000
#define MAX 1024
using namespace std;

// Number of bubbles processed so far. Incremented by processBubbleFound() for
// every bubble it receives and printed by printSummary() at the end of main().
int nbBubbles = 0;

// beta, a1 and a2 of the enumeration algorithm. main() sets them to
// LL_MIN - (k_value - 1), LL_MAX - (k_value - 1) and UL_MAX - (k_value - 1)
// respectively. The "Global" suffix keeps a1Global and a2Global apart from
// the local variables a1 and a2 used in several functions.
int beta = 0;
int a1Global = 0;
int a2Global = 0;

// Run parameters. main() assigns them from built-in defaults, positional
// arguments (MIN_DIST, BUBBLE_COUNT_OFFSET) and command-line options.
int MAX_BUBBLES;
int UL_MAX;
int LL_MAX;
int LL_MIN;
int MIN_DIST;
int BUBBLE_COUNT_OFFSET;
int MAX_BRANCHES;
int MAX_MEMORY;
// 0: no Type 0 bubble is output, 1: Type0a, 2: Type0a and Type0b (see the
// messages printed by main() for option -s).
int OUTPUT_SNPS;

// Output switches, enabled by the -c, -p, -v and -e options respectively.
bool OUTPUT_CONTEXT;
bool OUTPUT_PATH;
bool OUTPUT_BRANCH;
bool EXPERIMENTAL_ALG;

// Sequence output files: one for Type 0 bubbles (SNPs), one for every other
// bubble type. Both are opened in main().
FILE *seq_output_file_type0;
FILE *seq_output_file_type1234;

// Node-id output file, opened in main() only when option -p is given.
FILE *path_output_file;
// Text placed right after '>' in every header line written by the output functions.
string comment = "";

// Node sequences, handed to NodeLoader in main(), read by getSeq() and
// released one by one with delete[] at the end of main().
vector<char*> seqs;
// k-mer size (7th positional argument).
int k_value;
// Number of labels loaded in main(); the graph is built with 2 * nb_nodes nodes.
int nb_nodes;

////////////////////////////////////////////////////////////////////////////////////////////////////////////////


/* Returns the sequence of node "i". Only one strand of each node is
   kept in the seqs vector: an index smaller than seqs.size() is read
   directly, any other index designates the complementary node and is
   answered with the reverse complement of the stored sequence found
   at index (i modulo seqs.size()). */
string getSeq(int i)
{
  const int stored = (int)seqs.size();

  if (i < stored)
    return string(seqs[i]);

  return reverse_complement(string(seqs[i % stored]));
}



/*!
 * \brief Return the sequence spelled by a path, with or without its context
 * \param path: node ids of the path (must contain at least one node)
 * \param all_nodes: when true, the whole concatenation is returned (context
 * included). When false, the first node is trimmed down to its last k_value
 * characters and the last node to its first k_value characters.
 * \param output: kept for interface compatibility. The two branches that used
 * to depend on it were strictly identical, so it has no effect on the result.
 * \return the sequence of the first node followed, for every other node, by
 * its sequence minus the first (k_value - 1) characters
 */
string path2seq(vector<int>& path, bool all_nodes, bool output)
{
  (void)output; // see the parameter documentation above

  // The first node is needed both for the concatenation and for the
  // trimming: fetch it once (getSeq may have to build a reverse complement).
  const string first_seq = getSeq( path[0] );

  string pseq = first_seq;
  for (int i = 1; i < (int)path.size(); i++)
  {
    // consecutive nodes overlap by k_value-1 characters
    pseq += getSeq(path[i]).substr(k_value-1);
  }

  if (all_nodes)
    return pseq;

  // Number of context characters to drop at each end of the concatenation.
  const int len_first = first_seq.size() - (k_value);
  const int len_last = getSeq( path[(int)path.size()-1] ).size() - (k_value);

  return pseq.substr(len_first, (int) pseq.size() - (len_first + len_last) );
}

/* Writes the node ids of "path" on one line of "stream": every id is
   preceded by a single space and the line ends with a newline. */
void print_path(FILE* stream, vector<int>& path)
{
  for (vector<int>::const_iterator node = path.begin(); node != path.end(); ++node)
    fprintf(stream, " %d", *node);

  fputc('\n', stream);
}

/* Writes one path as two lines on "stream": a header line
   ">bcc|Cycle_<cycle_num>|Type_<type>|<path_name>_length_<len><bcount>"
   followed by the sequence itself. */
void print_formated_path(FILE* stream, string& bcc, int cycle_num, string& type, string path_name, int len, string& bcount, string& seq)
{
  const char* const prefix = bcc.c_str();
  const char* const kind = type.c_str();
  const char* const label = path_name.c_str();
  const char* const suffix = bcount.c_str();

  // header line
  fprintf(stream, ">%s|Cycle_%d|Type_%s|%s_length_%d%s\n", prefix, cycle_num, kind, label, len, suffix);
  // sequence line
  fprintf(stream, "%s\n", seq.c_str());
}

/* Writes the two sequences of a bubble on "stream".
   - upper, lower: sequences of the two paths. When a context size is
     non-zero both strings are replaced by the result of toLowerContext(),
     so the caller's strings are modified.
   - upper_b, lower_b: branching node counts, appended to the headers
     only when OUTPUT_BRANCH is set.
   - type, num: bubble type and cycle number written in the headers.
   - contextFirst, contextLast: context sizes; their sum is subtracted
     from the sequence sizes to obtain the lengths written in the headers. */
void output_bubble(FILE* stream, string& upper, int upper_b, string& lower, int lower_b, string type, int num, int contextFirst, int contextLast)
{
  const bool noContext = (contextFirst == 0 && contextLast == 0);
  if (!noContext)
  {
    // passing the context in lower case
    upper = toLowerContext( upper, contextFirst, contextLast);
    lower = toLowerContext( lower, contextFirst, contextLast);
  }

  string upper_bcount;
  string lower_bcount;
  if (OUTPUT_BRANCH)
  {
    upper_bcount = string("_branches_") + to_string(upper_b);
    lower_bcount = string("_branches_") + to_string(lower_b);
  }

  const int contextTotal = contextFirst + contextLast;

  print_formated_path(stream, comment, num, type, "upper_path", (int)upper.size() - contextTotal, upper_bcount, upper);
  print_formated_path(stream, comment, num, type, "lower_path", (int)lower.size() - contextTotal, lower_bcount, lower);
}

/*
 *  Node-ids variant, used for the output node-ids option: for each of the
 *  two paths, writes a header line holding the number of nodes of the path
 *  followed by the list of its node ids. upper_b and lower_b are accepted
 *  but not used by this variant.
 */
void output_bubble(FILE* stream, vector<int>& upper, int upper_b, vector<int>& lower, int lower_b, string type, int num)
{
  const char* const prefix = comment.c_str();
  const char* const kind = type.c_str();

  const int upper_count = (int)upper.size();
  fprintf(stream, ">%s|Cycle_%d|Type_%s|upper_path_length_%d\n", prefix, num, kind, upper_count);
  print_path(stream, upper);

  const int lower_count = (int)lower.size();
  fprintf(stream, ">%s|Cycle_%d|Type_%s|lower_path_length_%d\n", prefix, num, kind, lower_count);
  print_path(stream, lower);
}

/*!
 * \brief Classify a bubble from the sequences of its two paths
 * \param p1_nodes: node ids of the first path
 * \param p2_nodes: node ids of the second path
 * \return the bubble type:
 *   "3"  indel, "2" tandem repeat, "1" splicing (lower path of at most 2k-2),
 *   "0a" / "0b" SNP(s) (both paths of the same length),
 *   "4"  other bubbles whose lower path is at most LL_MAX long,
 *   "undefined" for everything else.
 */
string classify_bubble(vector<int>& p1_nodes, vector<int>& p2_nodes) {
    string upper_seq = path2seq(p1_nodes, false, false);
    string lower_seq = path2seq(p2_nodes, false, false);
    /* Drop 1 nt on the left and 1 nt on the right: these two characters only
       serve to distinguish the switching nodes, they have nothing to do with
       quantification. */
    upper_seq = upper_seq.substr(1, upper_seq.size() - 2);
    lower_seq = lower_seq.substr(1, lower_seq.size() - 2);

    // From here on "upper" is the longer sequence of the two.
    if (upper_seq.size() < lower_seq.size())
        swap(upper_seq, lower_seq);

    const int u_len = (int) upper_seq.size(), l_len = (int) lower_seq.size();
    const int len_diff = u_len - l_len;

    if (l_len <= (2 * k_value - 2)) // lower path of at most 2k-2: splicing, repeats, and indels
    {
        if (len_diff > 0 && (len_diff <= 2 || len_diff == 4 || len_diff == 5)) // indel
            return "3";

        // The lower sequence is compared with the prefix, then (only if the
        // prefix is not close enough) with the suffix of the upper sequence.
        if (static_cast<int>(edit_distance(upper_seq.c_str(), l_len, lower_seq.c_str(), l_len, sizeof (char), comp)) <= MIN_DIST ||
            static_cast<int>(edit_distance(upper_seq.c_str() + len_diff, l_len, lower_seq.c_str(), l_len, sizeof (char), comp)) <= MIN_DIST) // tandem repeat
            return "2";

        return "1"; // splicing
    }

    // Size of the region compared for Type_0b (the sequence minus k-1 characters at each end).
    const int variable_len = l_len - (2 * k_value) + 2;

    if (u_len == l_len && // SNP
            // specific condition for Type_0a
            ((u_len == 2 * k_value - 1) ||
            // specific condition for Type_0b, at most 10% differences in the variable region. Arbitrary TODO: check this
            (hamming_distance(upper_seq.c_str() + (k_value - 1), variable_len,
                              lower_seq.c_str() + (k_value - 1), variable_len,
                              sizeof (char), comp) <= (0.1 * variable_len)))) {
        if (u_len > 2 * k_value - 1) // multiple SNPs
            return "0b";
        return "0a";
    }

    if (l_len <= LL_MAX) // others with lower path of length bigger than 2k-2 and smaller or equal to LL_MAX
        return "4";

    // Here the lower path is bigger than LL_MAX and the bubble is not a Type0b.
    // By default this can only happen when Type0b bubbles were searched for
    // and the bubble found is not one of them.
    return "undefined";
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////

// Nodes of the subgraph explored from the current source node. It is cleared
// and filled again by bfs() for every source in listAllBubbles(); dijkstra()
// and listPairCompatible() ignore every node that is not in it.
set<int> in_sub;

/* Tells whether "elem" is a member of the set "s". */
bool contains(set<int> &s, int elem)
{
  return s.count(elem) != 0;
}


/* Used by the heap, as the pair <priority, key>. */
typedef pair<int, int> ii;

/* Dijkstra's algorithm. Computes the distance (shortest path) from
   "source" to every node that can be reached through edges and nodes
   that are not removed and that belong to in_sub. The results are
   added to "dist", which gets one entry per node examined (entries
   already present are kept and only lowered).

   max_dist is accepted for interface compatibility but is not used to
   prune the search: callers compare the distances with their own
   bounds afterwards. */
void dijkstra(int max_dist, int source, map<int,int>& dist, WeightedDigraph& G)
{
  priority_queue<ii,vector<ii>, greater<ii> > Q;

  dist[source] = 0;
  Q.push(ii(0, source));

  while (!Q.empty())
  {
    const ii top = Q.top();
    Q.pop();
    const int v = top.second;

    // v was given a distance before being pushed, so the entry exists.
    const map<int,int>::iterator v_entry = dist.find(v);

    // Lazy deletion: a node is pushed again each time its distance is
    // lowered, which leaves outdated entries in the heap. The edges of v
    // have been (or will be) relaxed through its most recent entry, so
    // expanding an outdated one would only repeat work.
    if (top.first > v_entry->second)
      continue;

    for (int i = 0; i < (int)G.adj_list[v].size(); i++)
    {
      if (G.adj_list[v][i].removed)
        continue;

      const int w = G.adj_list[v][i].node, cost = G.adj_list[v][i].cost;
      if (G.removed[w] || !contains(in_sub, w))
        continue;

      // Nodes seen for the first time start at MAX_DIST; an existing
      // entry is left untouched by insert().
      const map<int,int>::iterator w_entry = dist.insert(make_pair(w, MAX_DIST)).first;

      if (w_entry->second > v_entry->second + cost)
      {
        w_entry->second = v_entry->second + cost;
        Q.push(ii(w_entry->second, w));
      }
    }
  }
}

/* Checks whether the distance from s to t, as computed by dijkstra(),
   is at most max_dist plus the cost of node t. */
bool dist_st(int max_dist, int s, int t, WeightedDigraph& G)
{
  map<int,int> reach;

  // Make sure t has an entry even when dijkstra() never gets to it.
  reach[t] = MAX_DIST;
  dijkstra(max_dist, s, reach, G);

  const map<int,int>::const_iterator target = reach.find(t);
  return target->second <= max_dist + G.node_cost[t];
}

/* Checks whether there exists a pair of compatible paths from s1 and s2
   respecting a1 and a2 (see the paper for the precise definition of a
   pair of compatible paths). In practice: runs dijkstra() from s1 and
   from s2 and looks for a node present in both results whose distance
   from s1 is at most a1 plus its node cost and whose distance from s2
   is at most a2 plus its node cost. */
bool testPairCompatible(int s1, int a1, int s2, int a2, WeightedDigraph& G)
{
  map<int,int> from_s1, from_s2;

  dijkstra(a1, s1, from_s1, G);
  dijkstra(a2, s2, from_s2, G);

  for (map<int,int>::const_iterator first = from_s1.begin(); first != from_s1.end(); ++first)
  {
    const map<int,int>::const_iterator second = from_s2.find(first->first);
    if (second == from_s2.end())
      continue; // not seen from s2

    const int node = first->first;
    if (first->second <= (a1 + G.node_cost[node]) && second->second <= (a2 + G.node_cost[node]))
      return true;
  }

  return false;
}

/* Because this is a de Bruijn graph, the in-degree of a node is equal
   to the out-degree of its complementary node. The complementary node
   is found nb_nodes further (or nb_nodes before, for the nodes whose
   index is at least nb_nodes). */
int inDegree(int node, WeightedDigraph& G)
{
  const int complement = (node < nb_nodes) ? node + nb_nodes : node - nb_nodes;
  return G.outDegree(complement);
}

/* Counts the branching nodes of a path, i.e. the nodes whose out-degree
   or in-degree differs from 1. The initial and final nodes of the path
   are disregarded. */
int nbBranchingNodes(vector<int>& p_nodes, WeightedDigraph& G)
{
  const int last = (int)p_nodes.size() - 1;
  int count = 0;

  for (int pos = 1; pos < last; pos++)
  {
    const int node = p_nodes[pos];
    const bool linear = (G.outDegree(node) == 1 && inDegree(node, G) == 1);
    if (!linear)
      count++;
  }
  return count;
}

/* Classifies the bubble made of the two given paths and writes it out.
   - Bubbles of type "undefined" are never written; Type 0 bubbles are
     written only when OUTPUT_SNPS is non-zero.
   - The longer sequence is always written as the upper path.
   - Type 0 bubbles go to seq_output_file_type0, the others to
     seq_output_file_type1234; node ids additionally go to
     path_output_file when OUTPUT_PATH is set.
   Every call increments nbBubbles, written or not, and the program exits
   with status 15 as soon as the counter exceeds MAX_BUBBLES. */
void processBubbleFound(vector<int> &p1_nodes, vector<int> &p2_nodes, WeightedDigraph &G) {
    const string type = classify_bubble(p1_nodes, p2_nodes);

    if (type != "undefined" && (type[0] != '0' || OUTPUT_SNPS)) {
        const int cycle_num = BUBBLE_COUNT_OFFSET + nbBubbles;

        string upper_seq = path2seq(p1_nodes, OUTPUT_CONTEXT, true);
        string lower_seq = path2seq(p2_nodes, OUTPUT_CONTEXT, true);
        int upper_b = nbBranchingNodes(p1_nodes, G);
        int lower_b = nbBranchingNodes(p2_nodes, G);

        // The upper path must be the longer one: exchange the roles if needed.
        const bool swp = upper_seq.size() < lower_seq.size();
        if (swp) {
            swap(upper_seq, lower_seq);
            swap(upper_b, lower_b);
        }

        // Writing in different files if type 0 or else
        FILE *selection = seq_output_file_type1234;
        if (type[0] == '0')
            selection = seq_output_file_type0;

        // Size of the context on each side: size of the first (or last)
        // node minus k_value, or zero when the context is not output.
        int contextLeft = 0;
        int contextRight = 0;
        if (OUTPUT_CONTEXT) {
            contextLeft = getSeq(p1_nodes[0]).size() - (k_value);
            contextRight = getSeq(p1_nodes[(int) p1_nodes.size() - 1]).size() - (k_value);
        }
        output_bubble(selection, upper_seq, upper_b, lower_seq, lower_b, type, cycle_num, contextLeft, contextRight);

        if (OUTPUT_PATH) {
            vector<int> &upper_nodes = swp ? p2_nodes : p1_nodes;
            vector<int> &lower_nodes = swp ? p1_nodes : p2_nodes;
            output_bubble(path_output_file, upper_nodes, upper_b, lower_nodes, lower_b, type, cycle_num);
        }
    }

    nbBubbles++;
    if (nbBubbles > MAX_BUBBLES)
        exit(15);
}



/*!
 * \brief List all pairs of compatible paths (see paper), classify the events
 * found and output them
 * \param s1 current last node of the first path
 * \param a1 length budget left for the first path (reduced by the edge cost
 * at each extension)
 * \param p1 length accumulated so far by the first path (sum of the edge costs)
 * \param p1_nodes nodes of the first path; extended before and restored after
 * each recursive call
 * \param s2 current last node of the second path
 * \param a2 length budget left for the second path
 * \param p2 length accumulated so far by the second path
 * \param p2_nodes nodes of the second path; same handling as p1_nodes
 * \param G the graph; G.removed and G.adj_list are modified during the search
 * and put back in their previous state before returning
 *
 * Lists all pairs of compatible paths from s1,s2 respecting a1,a2. It is a
 * recursive algorithm based on the bipartition method. Check the paper for a
 * full description.
 */
void listPairCompatible(int s1, int a1, int p1, vector<int>& p1_nodes, int s2, int a2, int p2, vector<int>& p2_nodes, WeightedDigraph& G)
{
  // Give up as soon as one of the two paths has too many branching nodes.
  if (nbBranchingNodes(p1_nodes, G) > MAX_BRANCHES || nbBranchingNodes(p2_nodes, G) > MAX_BRANCHES)
    return;

  // Both paths end on the same node and at least one of them has been
  // extended: the two paths close a bubble.
  if (s1 == s2 && (p1_nodes.size() != 1 || p2_nodes.size() != 1))
  {
    // The bubble is reported only when both paths are long enough.
    if ( p1 >= (beta + G.node_cost[s1]) && p2 >= (beta + G.node_cost[s2]) )
    {
        processBubbleFound(p1_nodes, p2_nodes, G);
    }
    return;
  }

  // Neither path can be extended.
  if (G.adjListSz(s1) == 0 && G.adjListSz(s2) == 0)
    return;

  // u is the end that gets extended: s1 when G.adjListSz(s1) is non-zero, s2 otherwise.
  int u = (G.adjListSz(s1) != 0) ? s1 : s2;

  // First part: the solutions that extend the path ending on u through one
  // of its edges. u is marked as removed while its extensions are explored.
  G.removed[u] = true;
  for (int i = 0; i < (int)G.adj_list[u].size(); i++)
  {
    int v = G.adj_list[u][i].node, cost = G.adj_list[u][i].cost;
    if (!G.removed[v] && !G.adj_list[u][i].removed && contains(in_sub, v))
    {
      // Recurse only if a pair of compatible paths still exists after the extension.
      if (u == s1 && testPairCompatible(v, a1 - cost, s2, a2, G))
      {
        p1_nodes.push_back(v);
        listPairCompatible(v, a1 - cost, p1 + cost, p1_nodes, s2, a2, p2, p2_nodes, G);
        p1_nodes.pop_back();
      }
      else if (u == s2 && testPairCompatible(s1, a1, v, a2 - cost, G))
      {
        p2_nodes.push_back(v);
        listPairCompatible(s1, a1, p1, p1_nodes, v, a2 - cost, p2 + cost, p2_nodes, G);
        p2_nodes.pop_back();
      }
    }
  }
  G.removed[u] = false;

  // Second part: the solutions in which the path ending on u is not extended
  // any further. Only considered when that path is already long enough.
  if ((u == s1 && p1 >= (beta + G.node_cost[s1])) || (u == s2 && p2 >= (beta + G.node_cost[s2])))
  {
    // The adjacency list of u is emptied for the duration of the call (and
    // put back afterwards).
    vector<WeightedEdge> adj_u;
    adj_u.swap(G.adj_list[u]);
    // Recurse only if the other end can still reach u within its own budget.
    if ((u == s1 && dist_st(a2, s2, s1, G)) || (u == s2 && dist_st(a1, s1, s2, G)))
      listPairCompatible(s1, a1, p1, p1_nodes, s2, a2, p2, p2_nodes, G);
    G.adj_list[u].swap(adj_u);
  }
}


/* Breadth-first search from "source": adds to in_sub the nodes visited
   until a node deeper than 2*(MAX_BRANCHES+1) edges is taken out of the
   queue (that node is still added, then the search stops).

   The depth of a node is recorded when it is discovered for the first
   time and never changed afterwards. Recording it at every encounter
   (as long as the node had not been taken out of the queue) used to
   overwrite the depth with larger values and to queue the same node
   several times, which made the cut-off below trigger too early. */
void bfs(WeightedDigraph &G, int source, set<int> &in_sub)
{
  queue<int> Q;
  map<int, int> dist; // depth (in edges) of every discovered node

  Q.push(source);
  dist[source] = 0;
  while (!Q.empty())
  {
    const int u = Q.front();
    Q.pop();

    in_sub.insert(u);
    const int d = dist[u];

    for (int i = 0; i < (int)G.adj_list[u].size(); i++)
    {
      const int v = G.adj_list[u][i].node;
      if (contains(in_sub, v))
        continue;

      // insert() fails when v is already known: in that case v is in
      // the queue with its (smaller or equal) first depth.
      if (dist.insert(make_pair(v, d + 1)).second)
        Q.push(v);
    }
    // We don't compress non-branching paths to an edge, but to a
    // vertex, this means that each non-branching vertex maybe
    // followed by a branching vertex. Implying that a path with
    // MAX_BRANCHES contains at most 2*MAX_BRANCHES+1
    // vertices. However, we don't count the first and last vertices
    // (that are always branching), so the length is 2 *
    // (MAX_BRANCHES+1).
    if (d > 2*(MAX_BRANCHES+1))
      return;
  }
}

/* Lists all bubbles satisfying the path constraints, using the
   listPairCompatible function from every node of the graph in turn.
   The search from a node is restricted to the nodes gathered by bfs()
   in in_sub. Also sets the global beta from LL_MIN and k_value. */
void listAllBubbles(WeightedDigraph& G, int k_value, int UL_MAX, int LL_MAX, int LL_MIN)
{
  const int overlap = k_value - 1;

  //local a1 and a2
  const int a1 = LL_MAX - overlap;
  const int a2 = UL_MAX - overlap;
  beta = LL_MIN - overlap;

  for (int v = 0; v < (int)G.adj_list.size(); v++)
  {
    in_sub.clear();
    bfs(G, v, in_sub);

    // both paths start on v
    vector<int> p1_nodes(1, v);
    vector<int> p2_nodes(1, v);

    if (OUTPUT_SNPS == 2)
    {
      // Type0b bubbles are wanted: with this algorithm, maybe the only way
      // to get them is to give the first path the same bound (a2) as the
      // second one and to filter afterwards the bubbles whose lower path
      // is longer than a1.
      listPairCompatible(v, a2, 0, p1_nodes, v, a2, 0, p2_nodes, G);
    }
    else
    {
      // normal call
      listPairCompatible(v, a1, 0, p1_nodes, v, a2, 0, p2_nodes, G);
    }
  }
}

/////////////////////////////////////////////////////////////////////////////////////////////////////

/* Writes the final summary (number of bubbles) on "stream", framed by
   separator lines. */
void printSummary( FILE* stream, int num_bubbles )
{
  static const char separator[] = "============================================================================\n";

  fputs( separator, stream );
  fputs( "Summary of results\n", stream );
  fputs( separator, stream );

  fprintf( stream, "No of bubbles: %d\n", num_bubbles );

  fputs( separator, stream );
}

/* Prints the command-line usage on stderr and terminates the program
   with EXIT_FAILURE. "name" is the program name shown in the synopsis.
   The text mirrors the option string and the defaults used in main(). */
void printUsageAndExit( char * name )
{
  fprintf( stderr, "Usage: %s infofile contents_file_edges contents_file_nodes basename_edges basename_nodes number_to_read k_value output_prefix edit_distance_threshold comment numbering_offset [-u UL_MAX] [-L LL_MAX] [-l LL_MIN] [-M MAX_BUBBLES] [-b MAX_BRANCHES] [-s OUTPUT_SNPS] [-c] [-p] [-v] [-e MAX_MEMORY]\n", name );
  fprintf( stderr, "\t [-u UL_MAX] Maximal length of the upper path of each bubble. Default: 1000000\n" );
  fprintf( stderr, "\t [-L LL_MAX] Maximal length of the lower path of each bubble. Default: 2k-1\n" );
  fprintf( stderr, "\t [-l LL_MIN] Minimal length of the lower path of each bubble. Default: 2k-10\n" );
  fprintf( stderr, "\t [-M MAX_BUBBLES] Stop the process after this number of bubbles. Default: 10000\n" );
  fprintf( stderr, "\t [-b MAX_BRANCHES] Maximum number of branches for each bubble. Default: 5\n");
  fprintf( stderr, "\t [-s OUTPUT_SNPS] 0: do not output SNPs and sequencing errors, 1: output Type0a SNPs, 2: output Type0a and Type0b SNPs. Default: 0\n");
  fprintf( stderr, "\t [-c] Outputs the context of each bubble\n");
  fprintf( stderr, "\t [-p] Outputs the node ids of the paths of each bubble in output_prefix.path\n");
  fprintf( stderr, "\t [-v] Outputs the number of branching nodes in each path\n");
  fprintf( stderr, "\t [-e MAX_MEMORY] Use an experimental algorithm that find bubbles by listing paths. You must provide the maximum size of the process's virtual memory (address space) in megabytes, or \"unlimited\".\n");
  exit( EXIT_FAILURE );
}

//Path-enumeration algorithm to list bubbles - forward declaration
void listAllBubblesUsingPath(WeightedDigraph& G);

/* Opens "path" for writing. On failure, reports the system error on
   stderr and terminates the program with EXIT_FAILURE. */
static FILE* openOutputOrExit( const string& path )
{
  FILE* file = fopen( path.c_str(), "w" );
  if ( file == NULL )
  {
    perror( path.c_str() );
    exit( EXIT_FAILURE );
  }
  return file;
}

/* Entry point. Expects 11 positional arguments (see printUsageAndExit)
   followed by the options; loads the graph, enumerates the bubbles with
   the default or the experimental algorithm, writes them in
   <output_prefix>_type0.fa and <output_prefix>_type1234.fa (plus
   <output_prefix>.path with -p) and prints a summary on stdout. */
int main( int argc, char** argv )
{
  if ( argc < 12 )
    printUsageAndExit( argv[0] );

  vector<int> label;
  vector<LabelledCEdge> allEdges;

  string output_prefix = argv[8];
  k_value = atoi(argv[7]);
  // A missing or unwritable output location is reported here instead of
  // crashing at the first write.
  seq_output_file_type0 = openOutputOrExit( output_prefix + "_type0.fa" );
  seq_output_file_type1234 = openOutputOrExit( output_prefix + "_type1234.fa" );
  MIN_DIST = atoi(argv[9]);
  comment = argv[10];
  BUBBLE_COUNT_OFFSET = atoi(argv[11]);

  LL_MAX = 2 * k_value - 1;
  LL_MIN = 2 * k_value - 10;
  UL_MAX = 1000000;
  MAX_BUBBLES = 10000;
  MAX_BRANCHES = 5;
  OUTPUT_CONTEXT = false;
  OUTPUT_SNPS = 0;
  OUTPUT_PATH = false;
  OUTPUT_BRANCH = false;
  EXPERIMENTAL_ALG = false;
  MAX_MEMORY = 0;

  int required_sequence = atoi( argv[6] );

  // The options start after the 11 positional arguments: argv[11] plays
  // the role of the program name for getopt.
  int temoin;
  while ( (temoin = getopt ( argc-11, &argv[11], "u:L:l:M:b:e:s:cpv" )) != -1 )
  {
    switch ( temoin )
    {
    case 'u' :
    {
      UL_MAX = atoi( optarg );
      break;
    }
    case 'L' :
    {
      LL_MAX = atoi( optarg );
      break;
    }
    case 'l' :
    {
      LL_MIN = atoi( optarg );
      break;
    }
    case 'M':
    {
      MAX_BUBBLES = atoi( optarg );
      break;
    }
    case 'b':
    {
      MAX_BRANCHES = atoi( optarg );
      break;
    }
    case 's':
    {
      OUTPUT_SNPS = atoi( optarg );
      switch (OUTPUT_SNPS) {
          case 0:
            fprintf(stderr, "Will not output SNPs and sequencing errors\n" );
            break;
          case 1:
            fprintf(stderr, "Will output Type0a-SNPs\n" );
            break;
          case 2:
            fprintf(stderr, "Will output Type0a and Type0b SNPs\n" );
            break;
      }
      break;
    }
    case 'c':
    {
      OUTPUT_CONTEXT = true;
      fprintf(stderr, "Will output bubble contexts!\n");
      break;
    }
    case 'p':
    {
      OUTPUT_PATH = true;
      // Open the file only once, even if the option is repeated.
      if (path_output_file == NULL)
        path_output_file = openOutputOrExit( output_prefix + ".path" );
      break;
    }
    case 'v':
    {
      OUTPUT_BRANCH = true;
      break;
    }
    case 'e':
    {
      EXPERIMENTAL_ALG = true;

      if (strcmp(optarg, "unlimited") == 0)
        MAX_MEMORY = -1;
      else
        MAX_MEMORY = atoi( optarg );
      break;
    }
    default:
    {
      printUsageAndExit( argv[0] );
    }
    }
  }
  fprintf(stdout, "\t Enumerating bubbles with at most %d branching nodes in each path!\n", MAX_BRANCHES);

  if (EXPERIMENTAL_ALG) {
    fprintf(stdout, "\t Using the experimental algorithm with maximum memory = ");
    if (MAX_MEMORY==-1) printf("unlimited.\n"); else printf("%d MB.\n", MAX_MEMORY);

    struct rlimit vmLimit;
    // Convert to rlim_t before multiplying: computed as int, the number of
    // bytes overflows for any limit of 2048 MB or more.
    vmLimit.rlim_cur = vmLimit.rlim_max = (MAX_MEMORY==-1 ? RLIM_INFINITY : static_cast<rlim_t>(MAX_MEMORY) * 1024 * 1024);
    if (setrlimit(RLIMIT_AS, &vmLimit) != 0)
      perror("setrlimit"); // the run goes on, without the memory limit
  }

  EdgeLoader edgeloader(allEdges);
  NodeLoader nodeloader(seqs, label);
  bool atleast4nodes = read_edges_and_nodes_withoptimIO<EdgeLoader,NodeLoader>
    (argv[1],argv[2],argv[3],argv[4],argv[5],&required_sequence, edgeloader, nodeloader);
  if (atleast4nodes){
    nb_nodes = (int)label.size();
    map<int, int> label_to_node;

    // Set the label mapping, necessary only to read the edges
    for (int i = 0; i < nb_nodes; i++)
      label_to_node[label[i]] = i;

    // This is a edge-weighted directed graph. The forward and reverse
    // node in the bidirected de Bruijn graph are split in two nodes.
    WeightedDigraph G(2 * nb_nodes);

    for (int i = 0; i < (int)allEdges.size(); i++)
    {
      int u = label_to_node[allEdges[i].getFirst()]  +  (allEdges[i].label[0] == 'F' ? 0 : nb_nodes);
      int v = label_to_node[allEdges[i].getSecond()] +  (allEdges[i].label[1] == 'F' ? 0 : nb_nodes);
      int cost = strlen(seqs[ label_to_node[allEdges[i].getSecond()] ]) - (k_value - 1);

      G.adj_list[u].push_back(WeightedEdge(v, cost));
    }

    // We add weights for the nodes only for convenience. All algorithms
    // work under the assumption of a edge-weighted directed graph.
    for (int i = 0; i < (int)G.adj_list.size(); i++)
      G.node_cost[i] = strlen(seqs[i % nb_nodes]) - (k_value - 1);

    a1Global = LL_MAX - (k_value - 1);
    a2Global = UL_MAX - (k_value - 1);
    beta = LL_MIN - (k_value - 1);


    if (!EXPERIMENTAL_ALG) {
        listAllBubbles(G, k_value, UL_MAX, LL_MAX, LL_MIN);
    }
    else {
        listAllBubblesUsingPath(G);
    }
  }
  printSummary(stdout, nbBubbles);

  for (unsigned int i=0; i<seqs.size(); i++) delete[] seqs[i]; // cleaning
  if (OUTPUT_PATH)
    fclose(path_output_file);
  fclose(seq_output_file_type0);
  fclose(seq_output_file_type1234);

  return 0;
}









//////////////////////////////////////////////////////////////////////////////////////////////
//Warning:
//The following are functions related to a new experimental path-enumeration algorithm to list bubbles
//They may not be in a final state yet
//////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////
// Global state of the path-enumeration algorithm, kept global just for the
// sake of simplicity.
// allPaths: paths recorded by DFSEnum() from the current source; a path is
// recorded when its distance is at least beta. Cleared for every new source.
vector<Path> allPaths;
// explored[v] is true while node v belongs to currentPath.
vector<bool> explored;
// Path currently being extended by DFSEnum().
Path currentPath;
//////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////
//helper functions
//returns 1 when the node is branching (out-degree or in-degree different from 1), 0 otherwise
int isBranching (int node, WeightedDigraph& G)
{
    if (G.outDegree(node) != 1)
        return 1;
    if (inDegree(node, G) != 1)
        return 1;
    return 0;
}

//check if both paths are vertex-disjoint: no node of p1, its first and last nodes excepted, may appear anywhere in p2
bool vertexDisjoint (const vector<int> &p1, const vector<int> &p2) {
    //a path of less than 3 nodes has no inner node to check
    if (p1.size() < 3)
        return true;

    const vector<int>::const_iterator stop = p1.end() - 1;
    for (vector<int>::const_iterator inner = p1.begin() + 1; inner != stop; ++inner) {
        const bool shared = find(p2.begin(), p2.end(), *inner) != p2.end();
        if (shared)
            return false;
    }
    return true;
}

//get the number of branching nodes if a new vertex is added to this path:
//the current last node becomes an inner node and is counted if it is branching.
//A path of at most one node always gives 0.
int getNbBranchingNodesIfAddAVertex(const Path &path, WeightedDigraph& G) {
    if (path.nodes.size() > 1)
        return path.branchingNodes + isBranching(path.nodes.back(), G);
    return 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////



//Recursive function that enumerates the simple paths starting with currentPath and going through s,
//such that each path respects MAX_BRANCHES and a2Global. Every path whose distance is at least beta
//is recorded in allPaths. currentPath and explored are put back in their previous state on return.
void DFSEnum(WeightedDigraph& G, int s) {
    //s is now part of the current path
    explored[s] = true;

    //record the current path
    if (currentPath.distance >= beta)
        allPaths.push_back(currentPath);

    //try to extend the path through every edge of s
    for (int e = 0; e < (int)G.adj_list[s].size(); e++) {
        const int next = G.adj_list[s][e].node;

        //what the path would look like with one more vertex
        const int stepCost = G.node_cost[s];
        const int branchesAfter = getNbBranchingNodesIfAddAVertex(currentPath, G);
        const int distanceAfter = (currentPath.nodes.size() > 1 ? currentPath.distance + stepCost : 0);

        //skip next if it is already in the path or if a bound would be exceeded
        const bool extend = !explored[next] &&
                            branchesAfter <= MAX_BRANCHES && distanceAfter <= a2Global;
        if (!extend)
            continue;

        //configure the current path for the recursive call
        const int savedDistance = currentPath.distance;
        const int savedBranches = currentPath.branchingNodes;
        currentPath.distance = distanceAfter;
        currentPath.branchingNodes = branchesAfter;
        currentPath.nodes.push_back(next);

        DFSEnum(G, next);

        //undo the configuration
        currentPath.distance = savedDistance;
        currentPath.branchingNodes = savedBranches;
        currentPath.nodes.pop_back();
    }

    //all paths with this prefix were explored: s leaves the current path
    explored[s] = false;
}

//list all bubbles starting on source by listing all simple paths from it:
//the paths recorded by DFSEnum are grouped by their last node, and every pair of paths
//of a group that passes the length, size and vertex-disjointness checks is reported
void findAllBubblesUsingSimpleQueueDFS(WeightedDigraph& G, int source) {
    //initialize the global variables
    allPaths.clear();
    fill(explored.begin(), explored.end(), false);
    currentPath.branchingNodes=0;
    currentPath.distance=0;
    currentPath.nodes.clear();

    //find all paths
    currentPath.nodes.push_back(source);
    DFSEnum(G, source);

    //group the paths by target node
    map<int, vector<Path*> > pathsByTarget;
    for (size_t p = 0; p < allPaths.size(); ++p)
        pathsByTarget[allPaths[p].nodes.back()].push_back(&allPaths[p]);

    //now list all bubbles: every pair of paths sharing the same target
    for (map<int, vector<Path*> >::iterator group = pathsByTarget.begin(); group != pathsByTarget.end(); ++group) {
        const vector<Path*>& sameTarget = group->second;

        for (size_t i = 0; i < sameTarget.size(); ++i) {
            for (size_t j = i + 1; j < sameTarget.size(); ++j) {
                //the lower path is the strictly shorter one (the second of the pair in case of a tie)
                const bool firstIsShorter = sameTarget[i]->distance < sameTarget[j]->distance;
                Path* lowerPath = firstIsShorter ? sameTarget[i] : sameTarget[j];
                Path* upperPath = firstIsShorter ? sameTarget[j] : sameTarget[i];

                //either we respect LL_MAX (this is for type 0a, 1, 2 and 3), or we should output
                //all Type0b and both paths have the same length (this is only for type0b)
                const bool lengthOk = lowerPath->distance <= a1Global ||
                                      (OUTPUT_SNPS==2 && lowerPath->distance==upperPath->distance);
                if (!lengthOk)
                    continue;

                //at least one of the two paths must have an inner node
                if (upperPath->nodes.size() < 3 && lowerPath->nodes.size() < 3)
                    continue;

                if (!vertexDisjoint(lowerPath->nodes, upperPath->nodes))
                    continue;

                processBubbleFound(upperPath->nodes, lowerPath->nodes, G);
            }
        }
    }
}

//Path-enumeration algorithm to list bubbles: runs the path-based search from every node
//that has at least two outgoing edges
void listAllBubblesUsingPath(WeightedDigraph& G) {
    explored.assign(G.adj_list.size(), false);

    for (int v = 0; v < (int) G.adj_list.size(); v++) {
        //pruning: a bubble cannot start on a node with less than 2 successors
        if (G.outDegree(v) < 2)
            continue;
        findAllBubblesUsingSimpleQueueDFS(G, v);
    }
}