1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493
|
/***************************************************************************
* PHAST: PHylogenetic Analysis with Space/Time models
* Copyright (c) 2002-2005 University of California, 2006-2010 Cornell
* University. All rights reserved.
*
* This source code is distributed under a BSD-style license. See the
* file LICENSE.txt for details.
***************************************************************************/
/* $Id: maf.h,v 1.7 2009-01-09 22:01:00 mt269 Exp $ */
/** \file maf.h
Reading of alignments from MAF ("Multiple Alignment Format")
files to generate Sufficient Statistics.
(See http://www.bx.psu.edu/miller_lab.) These functions are primarily
concerned with extracting sufficient statistics from a MAF file
(see sufficient_stats.c). They avoid representing the alignment
explicitly, and as a result allow large MAF files (e.g., spanning
whole mammalian chromosomes) to be read and stored fairly
efficiently. A "reference sequence" alignment is currently
assumed, with the reference sequence appearing first in each
alignment block, and always in the positive (rather than reverse
complemented) orientation (this is the convention with MULTIZ).
\ingroup msa
*/
#ifndef MAF_H
#define MAF_H
#include "stdio.h"
#include "msa.h"
#include "hashtable.h"
#include "gff.h"
/** Hold data for a single block within a MAF file */
typedef struct {
String *text; /**< Maf Block contents, text[i] contains line i of the block */
int numline, /**< Number of lines in Maf Block */
seqlen; /**< Sequence length of sequence in Maf Block */
int *specmap; /**< Maps Maf Block contents (by line) to species.
specmap[i]=j if line[i] of block contains information
about species j. If line i isn't species-specific
(ie, the first line), specmap[i] = -1 */
char **seq; /**< Sequence data for all species within Maf Block.
seq[i] points to sequence for species i (pointer to
position in text array) */
char **quality; /**< Quality scores for all species within Maf Block.
quality[i] points to string of quality scores for
species[i], or NULL if no data */
char **qline, /**< Pointer to lines starting with 'q' for each species */
**sline, /**< Pointer to lines starting with 's' for each species */
**iline, /**< Pointer to lines starting with 'i' for each species */
**eline; /**< Pointer to lines starting with 'e' for each species */
} MAF_BLOCK;
/** @name MAF File reading
\{ */
/** Read subset of an Alignment from a MAF file; subset selected by sequence and feature names.
@pre The MAF file must be sorted with respect to the reference sequence.
@param[in] F MAF file
@param[in] REFSEQ (Optional) reference sequence. If
non-NULL, the indicated file will
be used to define bases in regions
of no alignment (not represented in
the MAF). File format is expected
to be FASTA. Ignored if
store_order == FALSE. If NULL and
store_order == TRUE, then bases in
reference seq not present in MAF
are represented as Ns
@param[in] tuple_size tuple size for sufficient statistics
@param[in] alphabet (Optional) alphabet for alignment; if NULL, DEFAULT_ALPABET is assumed
@param[in] gff (Optional) GFF_Set. If non-NULL,
category-specific counts will be
collected (cm must be non-NULL
also). The gff is assumed to use
the indexing system of the
reference sequence (sequence 1).
Currently, a non-NULL gff implies
gap_strip_mode == 1 (projection
onto reference sequence).
@param[in] cm Used for category-specific counts, ignored otherwise
@param[in] cycle_size Label site categories
12...<cycle_size>...12...<cycle_size>
instead of using gff and cm.
Useful when stats are to be
collected for non-overlapping
tuples. Use -1 to ignore.
@param[in] store_order Whether to store order in which
tuples occur. Storing order
requires more memory and larger
files, and is not necessary in many
cases.
@param[in] reverse_groups Tag defining groups in gff;
indicates groups on negative strand
should be reverse complemented.
Ignored if NULL. Useful when
collecting counts for
strand-specific categories. Can't
be used if store_order == TRUE
@param[in] gap_strip_mode Gap stripping mode. Currently,
if store_order == 1, may only have
value NO_STRIP or 1 (indicating
projection onto sequence 1, the
reference sequence). This is
simply to avoid some complexity in
coordinate mapping.
@param[in] keep_overlapping If TRUE, keep overlapping blocks,
otherwise keep only first instance.
Must be FALSE if store_order ==
TRUE or gff != NULL or
cycle_size != -1
@param[in] cats_to_do If non-NULL, only loads elements of the
alignment with features in this list.
Requires gff != NULL and cm != NULL
@param[in] seqnames If non-NULL, this is the list of sequence names
to keep (if seq_keep==TRUE) or discard
(if seq_keep==FALSE)
@param[in] seq_keep Only used if seqnames != NULL, determines whether
to keep only or discard seqnames
@note NOTE: for now, if a GFF is defined, then all blocks are projected
onto the reference seq (gff != NULL -> gap_strip_mode == 1). This
simplifies things somewhat, and it's rare that you want
category-specific counts without projecting (because gaps in the
reference sequence make it difficult to assign sites to categories
rationally).
@note The alignment won't be constructed explicitly; instead, a sufficient-statistics representation will be extracted directly from the MAF.
@warning Any blocks falling out of order, or which are redundant with previous blocks, will be discarded.
*/
MSA *maf_read_cats_subset(FILE *F, FILE *REFSEQF, int tuple_size, char *alphabet,
GFF_Set *gff, CategoryMap *cm, int cycle_size, int store_order,
char *reverse_groups, int gap_strip_mode, int keep_overlapping,
List* cats_to_do, List *seqnames, int seq_keep);
/** Read a subset of an Alignment from a MAF file subset selected by feature names.
@pre The MAF file must be sorted with respect to the reference sequence.
@param[in] F MAF file
@param[in] REFSEQ (Optional) reference sequence. If
non-NULL, the indicated file will
be used to define bases in regions
of no alignment (not represented in
the MAF). File format is expected
to be FASTA. Ignored if
store_order == FALSE. If NULL and
store_order == TRUE, then bases in
reference seq not present in MAF
are represented as Ns
@param[in] tuple_size tuple size for sufficient statistics
@param[in] alphabet (Optional) alphabet for alignment; if NULL, DEFAULT_ALPABET is assumed
@param[in] gff (Optional) GFF_Set. If non-NULL,
category-specific counts will be
collected (cm must be non-NULL
also). The gff is assumed to use
the indexing system of the
reference sequence (sequence 1).
Currently, a non-NULL gff implies
gap_strip_mode == 1 (projection
onto reference sequence).
@param[in] cm Used for category-specific counts, ignored otherwise
@param[in] cycle_size Label site categories
12...<cycle_size>...12...<cycle_size>
instead of using gff and cm.
Useful when stats are to be
collected for non-overlapping
tuples. Use -1 to ignore.
@param[in] store_order Whether to store order in which
tuples occur. Storing order
requires more memory and larger
files, and is not necessary in many
cases.
@param[in] reverse_groups Tag defining groups in gff;
indicates groups on negative strand
should be reverse complemented.
Ignored if NULL. Useful when
collecting counts for
strand-specific categories. Can't
be used if store_order == TRUE
@param[in] gap_strip_mode Gap stripping mode. Currently,
if store_order == 1, may only have
value NO_STRIP or 1 (indicating
projection onto sequence 1, the
reference sequence). This is
simply to avoid some complexity in
coordinate mapping.
@param[in] keep_overlapping If TRUE, keep overlapping blocks,
otherwise keep only first instance.
Must be FALSE if store_order ==
TRUE or gff != NULL or
cycle_size != -1
@param[in] cats_to_do If non-NULL, only loads elements of the
alignment with features in this list.
Requires gff != NULL and cm != NULL
@note NOTE: for now, if a GFF is defined, then all blocks are projected
onto the reference seq (gff != NULL -> gap_strip_mode == 1). This
simplifies things somewhat, and it's rare that you want
category-specific counts without projecting (because gaps in the
reference sequence make it difficult to assign sites to categories
rationally).
@note The alignment won't be constructed explicitly; instead, a sufficient-statistics representation will be extracted directly from the MAF.
@warning Any blocks falling out of order, or which are redundant with previous blocks, will be discarded.
*/
MSA *maf_read_cats(FILE *F, FILE *REFSEQF, int tuple_size, char *alphabet,
GFF_Set *gff, CategoryMap *cm, int cycle_size, int store_order,
char *reverse_groups, int gap_strip_mode, int keep_overlapping,
List* cats_to_do);
/** Read An Alignment from a MAF file.
@pre The MAF file must be sorted with respect to the reference sequence.
@param[in] F MAF file
@param[in] REFSEQ (Optional) reference sequence. If
non-NULL, the indicated file will
be used to define bases in regions
of no alignment (not represented in
the MAF). File format is expected
to be FASTA. Ignored if
store_order == FALSE. If NULL and
store_order == TRUE, then bases in
reference seq not present in MAF
are represented as Ns
@param[in] tuple_size tuple size for sufficient statistics
@param[in] alphabet (Optional) alphabet for alignment; if NULL, DEFAULT_ALPABET is assumed
@param[in] gff (Optional) GFF_Set. If non-NULL,
category-specific counts will be
collected (cm must be non-NULL
also). The gff is assumed to use
the indexing system of the
reference sequence (sequence 1).
Currently, a non-NULL gff implies
gap_strip_mode == 1 (projection
onto reference sequence).
@param[in] cm Used for category-specific counts, ignored otherwise
@param[in] cycle_size Label site categories
12...<cycle_size>...12...<cycle_size>
instead of using gff and cm.
Useful when stats are to be
collected for non-overlapping
tuples. Use -1 to ignore.
@param[in] store_order Whether to store order in which
tuples occur. Storing order
requires more memory and larger
files, and is not necessary in many
cases.
@param[in] reverse_groups Tag defining groups in gff;
indicates groups on negative strand
should be reverse complemented.
Ignored if NULL. Useful when
collecting counts for
strand-specific categories. Can't
be used if store_order == TRUE
@param[in] gap_strip_mode Gap stripping mode. Currently,
if store_order == 1, may only have
value NO_STRIP or 1 (indicating
projection onto sequence 1, the
reference sequence). This is
simply to avoid some complexity in
coordinate mapping.
@param[in] keep_overlapping If TRUE, keep overlapping blocks,
otherwise keep only first instance.
Must be FALSE if store_order ==
TRUE or gff != NULL or
cycle_size != -1
@note NOTE: for now, if a GFF is defined, then all blocks are projected
onto the reference seq (gff != NULL -> gap_strip_mode == 1). This
simplifies things somewhat, and it's rare that you want
category-specific counts without projecting (because gaps in the
reference sequence make it difficult to assign sites to categories
rationally).
@note The alignment won't be constructed explicitly; instead, a sufficient-statistics representation will be extracted directly from the MAF.
@warning Any blocks falling out of order, or which are redundant with previous blocks, will be discarded.
*/
MSA *maf_read(FILE *F, FILE *REFSEQF, int tuple_size, char *alphabet,
GFF_Set *gff, CategoryMap *cm, int cycle_size, int store_order,
char *reverse_groups, int gap_strip_mode, int keep_overlapping);
/** Read a subset Alignment from a MAF file; subset selected by feature name; not necessarily read in order wrt reference sequence.
@pre The MAF file must be sorted with respect to the reference sequence.
@param[in] F MAF file
@param[in] REFSEQ (Optional) reference sequence. If
non-NULL, the indicated file will
be used to define bases in regions
of no alignment (not represented in
the MAF). File format is expected
to be FASTA. Ignored if
store_order == FALSE. If NULL and
store_order == TRUE, then bases in
reference seq not present in MAF
are represented as Ns
@param[in] tuple_size tuple size for sufficient statistics
@param[in] alphabet (Optional) alphabet for alignment; if NULL, DEFAULT_ALPABET is assumed
@param[in] gff (Optional) GFF_Set. If non-NULL,
category-specific counts will be
collected (cm must be non-NULL
also). The gff is assumed to use
the indexing system of the
reference sequence (sequence 1).
Currently, a non-NULL gff implies
gap_strip_mode == 1 (projection
onto reference sequence).
@param[in] cm Used for category-specific counts, ignored otherwise
@param[in] cycle_size Label site categories
12...<cycle_size>...12...<cycle_size>
instead of using gff and cm.
Useful when stats are to be
collected for non-overlapping
tuples. Use -1 to ignore.
@param[in] store_order Whether to store order in which
tuples occur. Storing order
requires more memory and larger
files, and is not necessary in many
cases.
@param[in] reverse_groups Tag defining groups in gff;
indicates groups on negative strand
should be reverse complemented.
Ignored if NULL. Useful when
collecting counts for
strand-specific categories. Can't
be used if store_order == TRUE
@param[in] gap_strip_mode Gap stripping mode. Currently,
if store_order == 1, may only have
value NO_STRIP or 1 (indicating
projection onto sequence 1, the
reference sequence). This is
simply to avoid some complexity in
coordinate mapping.
@param[in] keep_overlapping If TRUE, keep overlapping blocks,
otherwise keep only first instance.
Must be FALSE if store_order ==
TRUE or gff != NULL or
cycle_size != -1
@param[in] cats_to_do If non-NULL, only loads elements of the
alignment with features in this list.
Requires gff != NULL and cm != NULL
@note NOTE: for now, if a GFF is defined, then all blocks are projected
onto the reference seq (gff != NULL -> gap_strip_mode == 1). This
simplifies things somewhat, and it's rare that you want
category-specific counts without projecting (because gaps in the
reference sequence make it difficult to assign sites to categories
rationally).
@note The alignment won't be constructed explicitly; instead, a sufficient-statistics representation will be extracted directly from the MAF.
@warning Any blocks falling out of order, or which are redundant with previous blocks, will be discarded.
*/
MSA *maf_read_unsorted(FILE *f, FILE *REFSEQF, int tuple_size, char *alphabet,
GFF_Set *gff, CategoryMap *cm, int cycle_size, int store_order,
char *reverse_groups, int gap_strip_mode, int keep_overlapping,
List *cats_to_do);
/** Read An Alignment from a MAF file; not necessarily read in order wrt reference sequence.
@pre The MAF file must be sorted with respect to the reference sequence.
@param[in] F MAF file
@param[in] REFSEQ (Optional) reference sequence. If
non-NULL, the indicated file will
be used to define bases in regions
of no alignment (not represented in
the MAF). File format is expected
to be FASTA. Ignored if
store_order == FALSE. If NULL and
store_order == TRUE, then bases in
reference seq not present in MAF
are represented as Ns
@param[in] tuple_size tuple size for sufficient statistics
@param[in] alphabet (Optional) alphabet for alignment; if NULL, DEFAULT_ALPABET is assumed
@param[in] gff (Optional) GFF_Set. If non-NULL,
category-specific counts will be
collected (cm must be non-NULL
also). The gff is assumed to use
the indexing system of the
reference sequence (sequence 1).
Currently, a non-NULL gff implies
gap_strip_mode == 1 (projection
onto reference sequence).
@param[in] cm Used for category-specific counts, ignored otherwise
@param[in] cycle_size Label site categories
12...<cycle_size>...12...<cycle_size>
instead of using gff and cm.
Useful when stats are to be
collected for non-overlapping
tuples. Use -1 to ignore.
@param[in] store_order Whether to store order in which
tuples occur. Storing order
requires more memory and larger
files, and is not necessary in many
cases.
@param[in] reverse_groups Tag defining groups in gff;
indicates groups on negative strand
should be reverse complemented.
Ignored if NULL. Useful when
collecting counts for
strand-specific categories. Can't
be used if store_order == TRUE
@param[in] gap_strip_mode Gap stripping mode. Currently,
if store_order == 1, may only have
value NO_STRIP or 1 (indicating
projection onto sequence 1, the
reference sequence). This is
simply to avoid some complexity in
coordinate mapping.
@param[in] keep_overlapping If TRUE, keep overlapping blocks,
otherwise keep only first instance.
Must be FALSE if store_order ==
TRUE or gff != NULL or
cycle_size != -1
@note NOTE: for now, if a GFF is defined, then all blocks are projected
onto the reference seq (gff != NULL -> gap_strip_mode == 1). This
simplifies things somewhat, and it's rare that you want
category-specific counts without projecting (because gaps in the
reference sequence make it difficult to assign sites to categories
rationally).
@note The alignment won't be constructed explicitly; instead, a sufficient-statistics representation will be extracted directly from the MAF.
@warning Any blocks falling out of order, or which are redundant with previous blocks, will be discarded.
*/
MSA *maf_read_old(FILE *F, FILE *REFSEQF, int tuple_size, char *alphabet,
GFF_Set *gff, CategoryMap *cm, int cycle_size, int store_order,
char *reverse_groups, int gap_strip_mode, int keep_overlapping);
/** Read a block from an MAF file.
@param[in] F File containing MAF data to read
@param[out] mini_msa MAF Block is stored here
@param[out] name_hash Hash table mapping sequence names to sequence indices (prefix of name wrt '.' character)
@param[out] start_idx stating coord of reference sequence
@param[out] length Length of reference sequence
@param[in] do_toupper Make sequences read in all upper case
@note Allocates memory for sequences if they are NULL (as with first block)
@note Reads to next "a" line or EOF
@result 0 if successful, EOF if no more blocks available
*/
int maf_read_block(FILE *F, MSA *mini_msa, Hashtable *name_hash,
int *start_idx, int *length, int do_toupper);
/** Add sequences from a MAF file to an existing MAF block
@param[in] F File containing MAF data to read
@param[in,out] mini_msa MAF Block is stored here
@param[out] name_hash Hash table mapping sequence names to sequence indices (prefix of name wrt '.' character)
@param[out] start_idx stating coord of reference sequence
@param[out] length Length of reference sequence
@param[in] do_toupper Make sequences read in all upper case
@param[in] skip_new_species If mini_msa already contains a given species, don't add it from MAF file
@note Allocates memory for sequences if they are NULL (as with first block)
@note Reads to next "a" line or EOF
@result 0 if successful, EOF if no more blocks available
*/
int maf_read_block_addseq(FILE *f, MSA *mini_msa, Hashtable *name_hash,
int *start_idx, int *length, int do_toupper,
int skip_new_species);
/** Get partial list of sequence names (roots of names only) and length of refseq.
@param[in] F File containing MAF Block contents
@param[out] names List of sequence names
@param[out] name_hash Hash Table mapping names to sequence index
@param[out] nseqs Number of sequences in MAF Block
@param[out] refseqlen Length of Reference Sequence
@param[in] add_seqs Whether to add sequences to names and name_hash
@warning May re-order names to put refseq first even if add_seqs == 0
*/
void maf_quick_peek(FILE *f, char ***names, Hashtable *name_hash,
int *nseqs, int *refseqlen, int add_seqs);
/** Get complete list of sequence names (roots of names only) and length of refseq.
@param[in] F File containing MAF Block contents
@param[out] names List of sequence names
@param[out] name_hash Hash Table mapping names to sequence index (Hash Table must be preallocated)
@param[out] nseqs Number of sequences in MAF Block
@param[out] map (Optional) Coordinate map for the reference sequence (map must be pre-allocated)
@param[out] redundant_blocks List containing indices of blocks that overlap previous blocks in 'redundant_blocks' list
@param[in] keep_overlapping Whether to keep overlapping blocks. If 0 overlapping blocks discarded, otherwise retained
@param[out] refseqlen Length of Reference Sequence
@warning May re-order names to put refseq first even if add_seqs == 0
*/
void maf_peek(FILE *F, char ***names, Hashtable *name_hash,
int *nseqs, msa_coord_map *map, List *redundant_blocks,
int keep_overlapping, int *refseqlen);
/** \} */
/** Extracts features from gff relevant to a specified interval.
@pre sub_gff is allocated, with an empty feature list
@pre gff is sorted
@param[out] sub_gff Subset of features in gff object
@param[in] gff Feature Set take subset from
@param[in] start_idx Starting of interval for which to extract features
@param[in] end_idx Ending of interval for which to extract features
@param[in] gff_idx Feature number to start extracting from in GFF
@param[in] cm Category Map used for category ranges of features
@param[in] reverse_compl Whether to reverse complement '-' strands
@param[in] tuple_size Size of tuples, used in reverse complementing
@note Shifts all coords such that start_idx is position 1
*/
void maf_block_sub_gff(GFF_Set *sub_gff, GFF_Set *gff, int start_idx,
int end_idx, int *gff_idx, CategoryMap *cm,
int reverse_compl, int tuple_size);
#endif
|