1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
|
exception Unknown_format of string
open Ppatteries
(* NOTE: zero is the beginning of the alignment! *)
(* NOTE: strings have maximum length of 16777211 (on stoke) *)
(* alignments are arrays of (name, seq)'s. better than hashtables because
* numbers are useful *)
type t = (string * string) array
type seq_type = Nucleotide_seq | Protein_seq
let nstates_of_seq_type = function | Nucleotide_seq -> 4 | Protein_seq -> 20
let seq_type_to_str = function
| Nucleotide_seq -> "nucleotide" | Protein_seq -> "protein"
(* ***** utils ***** *)
(* string_break: break str in to chunks of length chunk_len *)
let string_break str chunk_len =
let rec aux next_str =
let next_len = String.length next_str in
if next_len <= chunk_len then
[ next_str ] (* terminate *)
else
( String.sub next_str 0 chunk_len ) :: (
aux ( String.sub next_str chunk_len ( next_len - chunk_len ) ) )
in
aux str
(* ***** basics ***** *)
let get_name align i = fst (align.(i))
let get_seq align i = snd (align.(i))
let iteri = Array.iteri
let get_name_arr align = Array.map fst align
let forget_names align = Array.map snd align
let same_lengths_unnamed align =
if align = [||] then true
else (
let lengths = Array.map String.length align in
Array.fold_left (
fun same_so_far len ->
same_so_far && ( len = lengths.(0) )
) true lengths
)
let same_lengths align = same_lengths_unnamed ( forget_names align )
let n_seqs align = Array.length align
let length align =
if n_seqs align = 0 then 0
else if same_lengths align then String.length (get_seq align 0)
else failwith "length: not all same length"
let pair_uppercase (name, seq) = (name, String.uppercase seq)
let uppercase aln = Array.map pair_uppercase aln
let list_of_any_file fname =
let has_suffix suffix = MaybeZipped.check_suffix fname suffix in
if has_suffix ".fasta" || has_suffix ".fa" then
Fasta.of_file fname
else if has_suffix ".sth" || has_suffix ".sto" then
Stockholm.of_file fname
else
failwith ("unfamiliar suffix on " ^ fname)
let uppercase_list l = List.map pair_uppercase l
let upper_list_of_any_file fname = uppercase_list (list_of_any_file fname)
let aln_of_any_file fname = Array.of_list (list_of_any_file fname)
let upper_aln_of_any_file fname = uppercase (aln_of_any_file fname)
let to_map_by_name aln =
Array.fold_right
(fun (name, seq) m ->
if StringMap.mem name m then
failwith ("name "^name^" duplicated in alignment!");
StringMap.add name seq m)
aln
StringMap.empty
(* alternate, for wrapped fasta
List.iter
(fun line -> Printf.fprintf ch "%s\n" line)
(string_break seq 60); *)
let write_fasta_line ch (name, seq) =
Printf.fprintf ch ">%s\n" name;
Printf.fprintf ch "%s\n" seq
let to_fasta align fname =
let ch = open_out fname in
Array.iter (write_fasta_line ch) align;
close_out ch
let to_phylip align fname =
let ch = open_out fname in
Printf.fprintf ch "%d %d\n" (n_seqs align) (length align);
Array.iter (
fun (name, seq) ->
Printf.fprintf ch "%s %s\n" (Bytes.to_string (StringFuns.left_pad 14 ' ' name)) seq;
) align;
close_out ch
let stack align1 align2 =
let a = Array.append align1 align2 in
if same_lengths a then a
else failwith "stack: alignment not rectangular!"
(* string_mask: mask is a bool array whose ith elt tells if we should include
* that char into the output *)
let string_mask mask str =
assert(Array.length mask = String.length str);
StringFuns.of_char_array (
Array.filteri (fun i _ -> mask.(i)) (StringFuns.to_char_array str))
(* mask_align : mask the alignment *)
let mask_align mask align =
Array.map (fun (name, seq) -> (name, string_mask mask seq)) align
(* XXX *)
let check_for_repeats name_arr =
let _ =
Array.fold_left
(fun s name ->
if StringSet.mem name s then
failwith("repeated taxon name in alignment: "^name)
else
StringSet.add name s)
StringSet.empty
name_arr
in
()
(* check to make sure that each site contains a nucleotide type symbol *)
let is_nuc_align aln =
try
Array.iter
(fun (_,seq) ->
String.iter
(fun nuc ->
let _ = CharMap.find nuc Nuc_models.nuc_map in ())
seq)
aln;
true
with
| Not_found -> false
(* make_aln_index_map: make a map which maps from the node number to the row
* number of the alignment. *)
let make_aln_index_map taxon_map aln_name_arr =
let n_tree = IntMap.cardinal taxon_map
and n_aln = Array.length aln_name_arr in
if n_tree <> n_aln then
failwith
(Printf.sprintf "tree has %d taxa, and ref align has %d." n_tree n_aln);
check_for_repeats aln_name_arr;
IntMap.map
(fun tax_name ->
match ArrayFuns.find_all_indices tax_name aln_name_arr with
| [idx] -> idx
| [] -> failwith ("taxon not found in alignment: '"^tax_name^"'")
| _ -> failwith ("taxon in alignment repeatedly: '"^tax_name^"'"))
taxon_map
(* a like_aln is just the corresponding array of likelihood vectors *)
let like_aln_of_align seq_type align =
let like_fun =
match seq_type with
| Nucleotide_seq -> Nuc_models.lv_of_nuc
| Protein_seq -> Prot_models.lv_of_aa
in
Array.map
(fun (_, seq) ->
Array.map like_fun (StringFuns.to_char_array seq))
align
(* getting empirical frequencies from alignments
*)
let emper_freq nstates like_map align =
let no_missing_normed =
CharMap.remove '-' (
CharMap.remove '?' ( (* we don't remove 'X'... *)
CharMap.map (
fun like_vect ->
Linear_utils.alloc_l1_normalize like_vect) like_map)) in
let total = Gsl.Vector.create ~init:0. nstates in
Array.iter (
fun (name, seq) ->
String.iter (
fun base ->
if base <> '-' && base <> '?' then
if CharMap.mem base no_missing_normed then
Gsl.Vector.add total (CharMap.find base no_missing_normed)
else
failwith (Printf.sprintf "'%c' not a known base in %s!" base name)
) seq
) align;
Linear_utils.l1_normalize total;
if !verbosity > 1 then
Format.fprintf
Format.std_formatter
"emper freqs: %a@."
Linear_utils.ppr_gsl_vector
total;
let a = Gsl.Vector.to_array total in
if Array.exists (fun x -> x < 1e-10) a then
dprintf
"\nWarning: zero-indexed state %d is zero in empirical frequency.\n\n"
(Array.findi (fun x -> x < 1e-10) a);
total
let identity s1 s2 =
let s1' = StringFuns.to_char_array s1
and s2' = StringFuns.to_char_array s2 in
let num, denom = ArrayFuns.fold_left2
(fun (num, denom) c1 c2 ->
if c1 = '-' || c2 = '-' then
num, denom
else
num + (if c1 = c2 then 1 else 0), succ denom)
(0, 0)
s1'
s2'
in
(if denom = 0 then 0. else (float_of_int num) /. (float_of_int denom)),
denom
let informative = function '?' | '-' -> false | _ -> true
let ungap = String.filter informative
let span s =
let first = ref None
and last = ref None in
String.iteri
(fun i c ->
if informative c then begin
if Option.is_none !first then
first := Some i;
last := Some i
end)
s;
match !first, !last with
| Some f, Some l -> f, l
| _, _ -> failwith "sequence contains no informative characters"
|