File: alignment.ml

package info (click to toggle)
pplacer 1.1~alpha19-8
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 17,324 kB
  • sloc: ml: 20,927; ansic: 9,002; python: 1,641; makefile: 171; sh: 77; xml: 50
file content (252 lines) | stat: -rw-r--r-- 7,369 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
exception Unknown_format of string

open Ppatteries

(* NOTE: zero is the beginning of the alignment! *)
(* NOTE: strings have maximum length of 16777211 (on stoke) *)

(* alignments are arrays of (name, seq)'s. better than hashtables because
 * numbers are useful *)

type t = (string * string) array

type seq_type = Nucleotide_seq | Protein_seq
let nstates_of_seq_type = function | Nucleotide_seq -> 4 | Protein_seq -> 20
let seq_type_to_str = function
    | Nucleotide_seq -> "nucleotide" | Protein_seq -> "protein"

(* ***** utils ***** *)
  (* string_break: break str in to chunks of length chunk_len *)
let string_break str chunk_len =
  let rec aux next_str =
    let next_len = String.length next_str in
    if next_len <= chunk_len then
      [ next_str ] (* terminate *)
    else
      ( String.sub next_str 0 chunk_len ) :: (
        aux ( String.sub next_str chunk_len ( next_len - chunk_len ) ) )
  in
  aux str

(* ***** basics ***** *)

let get_name align i = fst (align.(i))
let get_seq align i = snd (align.(i))

let iteri = Array.iteri

let get_name_arr align = Array.map fst align
let forget_names align = Array.map snd align

let same_lengths_unnamed align =
  if align = [||] then true
  else (
    let lengths = Array.map String.length align in
    Array.fold_left (
      fun same_so_far len ->
        same_so_far && ( len = lengths.(0) )
    ) true lengths
  )

let same_lengths align = same_lengths_unnamed ( forget_names align )

let n_seqs align = Array.length align

let length align =
  if n_seqs align = 0 then 0
  else if same_lengths align then String.length (get_seq align 0)
  else failwith "length: not all same length"

let pair_uppercase (name, seq) = (name, String.uppercase seq)
let uppercase aln = Array.map pair_uppercase aln

let list_of_any_file fname =
  let has_suffix suffix = MaybeZipped.check_suffix fname suffix in
  if has_suffix ".fasta" || has_suffix ".fa" then
    Fasta.of_file fname
  else if has_suffix ".sth" || has_suffix ".sto" then
    Stockholm.of_file fname
  else
    failwith ("unfamiliar suffix on " ^ fname)

let uppercase_list l = List.map pair_uppercase l
let upper_list_of_any_file fname = uppercase_list (list_of_any_file fname)

let aln_of_any_file fname = Array.of_list (list_of_any_file fname)
let upper_aln_of_any_file fname = uppercase (aln_of_any_file fname)

let to_map_by_name aln =
  Array.fold_right
    (fun (name, seq) m ->
      if StringMap.mem name m then
        failwith ("name "^name^" duplicated in alignment!");
      StringMap.add name seq m)
    aln
    StringMap.empty

(* alternate, for wrapped fasta
   List.iter
     (fun line -> Printf.fprintf ch "%s\n" line)
     (string_break seq 60); *)
let write_fasta_line ch (name, seq) =
  Printf.fprintf ch ">%s\n" name;
  Printf.fprintf ch "%s\n" seq

let to_fasta align fname =
  let ch = open_out fname in
  Array.iter (write_fasta_line ch) align;
  close_out ch

let to_phylip align fname =
  let ch = open_out fname in
  Printf.fprintf ch "%d %d\n" (n_seqs align) (length align);
  Array.iter (
    fun (name, seq) ->
      Printf.fprintf ch "%s %s\n" (Bytes.to_string (StringFuns.left_pad 14 ' ' name)) seq;
  ) align;
  close_out ch

let stack align1 align2 =
  let a = Array.append align1 align2 in
  if same_lengths a then a
  else failwith "stack: alignment not rectangular!"

(* string_mask: mask is a bool array whose ith elt tells if we should include
 * that char into the output *)
let string_mask mask str =
  assert(Array.length mask = String.length str);
  StringFuns.of_char_array (
    Array.filteri (fun i _ -> mask.(i)) (StringFuns.to_char_array str))

(* mask_align : mask the alignment *)
let mask_align mask align =
  Array.map (fun (name, seq) -> (name, string_mask mask seq)) align

(* XXX *)

let check_for_repeats name_arr =
  let _ =
    Array.fold_left
      (fun s name ->
        if StringSet.mem name s then
          failwith("repeated taxon name in alignment: "^name)
        else
          StringSet.add name s)
      StringSet.empty
      name_arr
  in
  ()

(* check to make sure that each site contains a nucleotide type symbol *)
let is_nuc_align aln =
  try
    Array.iter
      (fun (_,seq) ->
        String.iter
          (fun nuc ->
            let _ = CharMap.find nuc Nuc_models.nuc_map in ())
          seq)
    aln;
    true
  with
  | Not_found -> false

(* make_aln_index_map: make a map which maps from the node number to the row
 * number of the alignment. *)
let make_aln_index_map taxon_map aln_name_arr =
  let n_tree = IntMap.cardinal taxon_map
  and n_aln = Array.length aln_name_arr in
  if n_tree <> n_aln then
    failwith
      (Printf.sprintf "tree has %d taxa, and ref align has %d." n_tree n_aln);
  check_for_repeats aln_name_arr;
  IntMap.map
    (fun tax_name ->
      match ArrayFuns.find_all_indices tax_name aln_name_arr with
        | [idx] -> idx
        | [] -> failwith ("taxon not found in alignment: '"^tax_name^"'")
        | _ -> failwith ("taxon in alignment repeatedly: '"^tax_name^"'"))
    taxon_map

(* a like_aln is just the corresponding array of likelihood vectors *)
let like_aln_of_align seq_type align =
  let like_fun =
    match seq_type with
    | Nucleotide_seq -> Nuc_models.lv_of_nuc
    | Protein_seq -> Prot_models.lv_of_aa
  in
  Array.map
    (fun (_, seq) ->
      Array.map like_fun (StringFuns.to_char_array seq))
    align


(* getting empirical frequencies from alignments
 *)
let emper_freq nstates like_map align =
  let no_missing_normed =
    CharMap.remove '-' (
    CharMap.remove '?' ( (* we don't remove 'X'... *)
    CharMap.map (
      fun like_vect ->
        Linear_utils.alloc_l1_normalize like_vect) like_map)) in
  let total = Gsl.Vector.create ~init:0. nstates in
  Array.iter (
    fun (name, seq) ->
      String.iter (
        fun base ->
          if base <> '-' && base <> '?' then
            if CharMap.mem base no_missing_normed then
              Gsl.Vector.add total (CharMap.find base no_missing_normed)
            else
              failwith (Printf.sprintf "'%c' not a known base in %s!" base name)
      ) seq
  ) align;
  Linear_utils.l1_normalize total;
  if !verbosity > 1 then
    Format.fprintf
      Format.std_formatter
      "emper freqs: %a@."
      Linear_utils.ppr_gsl_vector
      total;
  let a = Gsl.Vector.to_array total in
  if Array.exists (fun x -> x < 1e-10) a then
    dprintf
      "\nWarning: zero-indexed state %d is zero in empirical frequency.\n\n"
      (Array.findi (fun x -> x < 1e-10) a);
  total

let identity s1 s2 =
  let s1' = StringFuns.to_char_array s1
  and s2' = StringFuns.to_char_array s2 in
  let num, denom = ArrayFuns.fold_left2
    (fun (num, denom) c1 c2 ->
      if c1 = '-' || c2 = '-' then
        num, denom
      else
        num + (if c1 = c2 then 1 else 0), succ denom)
    (0, 0)
    s1'
    s2'
  in
  (if denom = 0 then 0. else (float_of_int num) /. (float_of_int denom)),
  denom

let informative = function '?' | '-' -> false | _ -> true

let ungap = String.filter informative

let span s =
  let first = ref None
  and last = ref None in
  String.iteri
    (fun i c ->
      if informative c then begin
        if Option.is_none !first then
          first := Some i;
        last := Some i
      end)
    s;
  match !first, !last with
    | Some f, Some l -> f, l
    | _, _ -> failwith "sequence contains no informative characters"