File: sigma.ml

package info (click to toggle)
sigma-align 1.0-1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 72 kB
  • ctags: 80
  • sloc: ml: 844; xml: 135; makefile: 41
file content (1071 lines) | stat: -rw-r--r-- 38,697 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
(***  SiGMA : Simple Greedy Multiple Alignment
 ***  Version 1.0 (March 11, 2006)
 ***  Copyright (C) Rahul Siddharthan, 2006
 ***
 ***  Requires ocaml 3.x (http://caml.inria.fr)
 ***  When compiling, link the str module as follows:
 ***  bytecode compiler    -> ocamlc str.cma sigma.ml -o sigma.bytecode
 ***  native-code compiler -> ocamlopt str.cmxa sigma.ml -o sigma
 ***  
 ***  (native is around 7x-10x faster, but no debugger support)
 ***  
 ***  To produce a static binary on linux (so that it's independent of
 ***  your machine's particular version of libc), use
 ***  ocamlopt -cc "gcc -static" str.cmxa sigma.ml -o sigma
 ***) 

(***  Strategy: find most significant common substring between two strings 
 ***  (mutations may exist but are taken care of in the scoring) 
 ***  and "align" them.  Then next largest common substring among 
 ***  existing fragments that may be "consistently" aligned.   Each time
 ***  a common substring is found it is "fused" into one fragment (with
 ***  multiple associated sequence numbers).  Repeat until nothing left.
 ***  
 ***  Actually, for reasons of efficiency a list of "best matches" is made
 ***  and aligned with consistency checks, rather than doing it one
 ***  match at a time, but that's a detail.
 ***)

module IntSet = Set.Make (struct type t=int
                                 let compare=compare end )

(*** the main data type, the sequence fragment ***)
type neigh_var = None | Frag of seq_frag
and seq_frag = { header : string ; 
                 seq : (int, string) Hashtbl.t ;
                 mutable aseq: string;
		 mutable abgw: float array;
                 mutable seqindex : IntSet.t;
                 seqlabel: (int, string) Hashtbl.t;
                 neighbours: (int * char, neigh_var) Hashtbl.t;
                 limits : (int * char, string) Hashtbl.t } 

(***  Notes:
 ***  
 ***  1. seq is a hashtbl keyed by sequence index n, giving corresponding
 ***  sequence, since after alignment there may be mismatches.
 ***  
 ***  2. aseq is a string giving the consensus of all aligned strings.
 ***  For now, a mismatch will be marked as N and subsequent bases
 ***  aligned to it will get a score zero.
 ***  
 ***  3.  seqindex is the original sequence to which the current sequence
 ***  fragment belongs; a set containing initially a single number, but,
 ***  as pairs are made, possibly multiple numbers
 ***  
 ***  4.  limits is a hashtbl keyed with pairs of sequence numbers and chars
 ***  (n,c='l'/'r') mapping  to the legal start (l) or end (r) position on 
 ***  sequence n for this fragment.
 ***  
 ***  5.  seqlabel is an array whose nth element is the label of that
 ***  fragment on sequence n, a string representation of a float between 0
 ***  and 1 (actually between 0 and 0.3).  Each fragment on each sequence
 ***  will have a unique label.
 ***  
 ***  6.  neighbours is a hashtbl that takes as keys a pair of the
 ***  sequence number and a char 'l'/'r' and outputs a ref to the
 ***  neighbouring frag on that seq in that direction
 ***  
 ***  7. abgw is the background weight, the log of the (conditional)
 ***  background probability for that base divided by 0.25.  
 ***  
 ***  The sequence starts in two datastructures, a list (whose first
 ***  element is a dummy) and a set that contains all the "real" elements.
 ***  At each alignment step two sequences (elements of the set) are
 ***  broken up into 5 sequences (a shared portion and the non-shared
 ***  boundaries, which may be zero-length).  The original list always
 ***  contains the starting element, with pointer to the next fragment.
 ***  It's like a doubly-linked list (actually, a set of lists with 
 ***  crosslinks, if you like) with end fragments pointing to a 
 ***  "dummy" fragment.
 ***)

type fasta_seq = { fheader : string; mutable fseq: Buffer.t}

(*** Global variables ***)
let prand = ref 0.002
let correlbg = ref 2
let bgfilename = ref ""
let auxfilename = ref ""
let singlesite = ref (Array.make 4 0.0)
let dinucleotide = ref (Array.make_matrix 4 4 0.0)

(***                                                              ***)
(***                                                              ***)
(***                                                              ***)

(***  best_match: finds the most significant common substring with
 ***  mismatches.  The significance is basically the background 
 ***  probability of the string multiplied by the probability of
 ***  seeing a match of that quality in two long strings of the given
 ***  lengths.  The probability for fragment size l with string lengths
 ***  l1 and l2 is 1-(1-p)^[(l1-l+1)(l2-l+1)] = approx. p*(l1-l+1)*(l2-l+1)
 ***  where p = bg prob * (l choose m), where m = number of mismatches.
 ***  Lowest answer = most significant.
 ***)

let best_match s1 s2 w1 w2 = 
  let sl1 = String.length s1 in 
  let sl2 = String.length s2 in
  let fmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 1.0 in
  let lmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
  let mmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
  let best = ref 1.0 in
  let bestm = ref 0 in
  let bestn = ref 0 in
  let lbest = ref 0 in
    for m = 1 to sl1  do
      for n = 1 to sl2  do 
        let a1 = s1.[m-1] in
        let b1 = s2.[n-1] in
        let f_old = fmat.(m-1).(n-1) in
        let m_old = mmat.(m-1).(n-1) in
        let l_new = lmat.(m-1).(n-1) + 1 in
        let m_new, f_new =
          if (a1=b1) && (a1 != 'N') 
          then 
            m_old, f_old *. sqrt(w1.(m-1)*.w2.(n-1))
              *. (float l_new) /. (float (l_new-m_old))
          else  m_old + 1, f_old *. (float l_new) /. (float (m_old + 1))
        in 
        let (f_new, l_new, m_new) = (min (f_new,l_new,m_new) (1.0,0,0)) in
          ( fmat.(m).(n) <- f_new;
            lmat.(m).(n) <- l_new; 
            mmat.(m).(n) <- m_new;
	    if (f_new < !best) then
	      ( best := f_new; bestm := m; bestn := n)
          ) 
      done
    done;
    while (fmat.(!bestm - !lbest).(!bestn - !lbest) < 1.0) 
      && (!lbest < !bestm) && (!lbest < !bestn) do
        lbest := !lbest + 1
    done;
    best := !best *. (float (sl1 - !lbest+1)) *. (float (sl2 - !lbest+1));
    if (!best < !prand)
    then  !bestm - !lbest, !bestn - !lbest, !lbest, !best
    else 0,0,0,1.0
;;

(***                                                              ***)
(***                                                              ***)
(***                                                              ***)

let compareSeq seq1 seq2 =
  (***  if seqindex is same in both cases then check fragment number
   ***  from "seqlabel", otherwise IntSet.compare *)
  (***  this is needed to create the module SeqSet, but isn't used,
   ***  except to the extent that this should not return 0 unless
   ***  two frags really are identical, which should not happen *)
    match (IntSet.compare seq1.seqindex seq2.seqindex ) with
        0 -> ( let rec complabel = function
                 |  [] -> 0
                 | a::t -> 
                     match (compare (Hashtbl.find seq1.seqlabel a)
                                    (Hashtbl.find seq2.seqlabel a)) with
                         0 -> complabel t
                       | n -> n 
               in complabel (IntSet.elements seq1.seqindex))
      | n -> n ;;


module SeqSet = Set.Make (struct type t=seq_frag
                                 let compare=compareSeq end );;

let seqset = ref SeqSet.empty ;;

(***                                                              ***)
(***                                                              ***)
(***                                                              ***)

let readlines_noblank ffilename = (* works like python's readlines *)
  let fchan = open_in ffilename in
  let lenstr = in_channel_length fchan in
  let res = String.create lenstr in 
    really_input fchan res 0 lenstr ;
    Str.split (Str.regexp "\\(\r\n\\|\n\\)+" ) res ;;

let newfseq h =   { header = h;
                    seq = Hashtbl.create 5;
		    aseq = "" ;
		    abgw = Array.create 0 0.0;
                    seqindex = IntSet.empty; 
                    seqlabel = Hashtbl.create 5;
                    neighbours = Hashtbl.create 10;
                    limits = Hashtbl.create 5 };;

let readfasta filename = (*   reads a fasta sequence, returns it as a list 
                          ***  with a dummy first element used as a marker *)
  let flines = readlines_noblank filename in
  let rec flines2flist flist flines fcurr ln =
    if List.length flines = 0 then
      fcurr::flist
    else
      match (List.hd flines).[0] with
          '>' -> 
            let fcurr2 = newfseq (List.hd flines)
            in flines2flist (fcurr::flist) (List.tl flines) fcurr2 (ln+1)
        | ';'|'#' -> flines2flist flist (List.tl flines) fcurr ln
        | _ -> 
            let fcurr2 = { header = fcurr.header; seq=fcurr.seq;
                           aseq = fcurr.aseq ^ (String.lowercase 
                                                  (List.hd flines)) ;
			   abgw = Array.create 0 0.0;
                           seqlabel= fcurr.seqlabel ;
                           neighbours = fcurr.neighbours;
                           seqindex = fcurr.seqindex;
                           limits = fcurr.limits
                         } 
            in flines2flist flist (List.tl flines) fcurr2 ln
  in 
    List.rev (flines2flist [] flines (newfseq "" ) (-1));;


let makefastaset seqlist =
  let rec list2set a s =
    match a with 
        [] -> s
      | h::t -> list2set t (SeqSet.add h s)
  in list2set (List.tl seqlist) (SeqSet.empty);;


let basenum  = function
  | 'a' | 'A' -> 0
  | 'c' | 'C' -> 1
  | 'g' | 'G' -> 2
  | 't' | 'T' -> 3
  | _ -> -1
;;


let getbgprobs_from_bgfile bgfile = (* read singlesite and dinucleotide
				       from bgfile *)
  let bglines = readlines_noblank bgfile in
  let line2val line1 =
    let line1s=Str.split (Str.regexp "[ \t]+") line1 in
      if line1s > [] && line1.[0]!= '#' && (String.length (List.hd line1s) < 3)
      then 
	let bs = List.hd line1s in
	  if (String.length bs = 1) then
	    !singlesite.(basenum bs.[0]) <-
	      (float_of_string (List.nth line1s 1))
	else 
	  !dinucleotide.(basenum bs.[0]).(basenum bs.[1]) <-
	    (float_of_string (List.nth line1s 1))
      else ()
  in 
    List.iter line2val bglines;;


let getbgprobs auxseqlist =
  (*** assume !singlesite, !dinucleotide are initialised to zero ***)
  List.iter 
    (fun f -> 
       for n = 0 to (String.length f.aseq) - 1 do
         let c = basenum f.aseq.[n] in
           if c >= 0 then 
             (!singlesite.(c) <- !singlesite.(c) +. 1.0;
              !singlesite.(3-c) <- !singlesite.(3-c) +. 1.0;
              if n>0 then
                let c1 = basenum f.aseq.[n-1] in
                  if c1 >=0 then
                    (!dinucleotide.(c1).(c) <- !dinucleotide.(c1).(c) +. 1.0;
                    )
             )
       done 
    ) auxseqlist 
;;


let normalise_bgprobs () =
  (let tots = ref 0.0 in
     (for n = 0 to 3 do
        tots := !tots +. !singlesite.(n)
      done; 
      for n = 0 to 3 do
        !singlesite.(n) <- !singlesite.(n) /. !tots
      done
     )
  );
  for n = 0 to 3 do 
    let totd = ref 0.0 in
      (for m = 0 to 3 do
         totd := !totd +. !dinucleotide.(n).(m)
       done;
       for m = 0 to 3 do
         !dinucleotide.(n).(m) <- !dinucleotide.(n).(m) /. !totd
       done
      )
  done 
;;

let getwts s =
  let w = Array.make (String.length s) 0.0 in
    ( for n = 0 to (String.length s) - 1 do
        if !correlbg = 0 then w.(n) <- 0.25
        else 
          let cn = basenum s.[n] in
            if cn<0 then w.(n) <- 1.
            else if n=0 then w.(n) <- !singlesite.(cn)
            else
              let cn1 = basenum s.[n-1] in
                if cn1 < 0 || !correlbg = 1 
	        then w.(n) <- !singlesite.(cn)
                else w.(n) <- !dinucleotide.(cn1).(cn)
      done;
      w
    )
;;


let filename2seqlist filenameset gotbgprobs =
  let fastalist1 =
    try 
      readfasta (List.hd filenameset)
    with 
	_ -> ( Printf.fprintf stderr 
                 "Error: unable to read file %s\n" (List.hd filenameset);
               exit 1)
  in     
  let fastalist2l =
    List.map 
      (fun filename ->
         try 
           readfasta filename
         with 
             _ -> ( Printf.fprintf stderr 
                      "Error: unable to read file %s\n" filename;
                    exit 1)
      ) (List.tl filenameset)
  in
  let makeseqlist = function
    | [] -> []
    | h::t ->
        List.fold_left (fun n m -> n @ (List.tl m)) (List.tl h) t
  in
  let fastalist = fastalist1 @ (makeseqlist fastalist2l) in
    ( if gotbgprobs then () else
	( getbgprobs fastalist;
          normalise_bgprobs () 
	);
      List.iter (fun f -> f.abgw <- getwts f.aseq) fastalist;
      fastalist
    )
;;



let rec blockseq seql seqnumset maxl maxw =
  (***  the main routine, recursively forms blocks until no more are
	***  possible *)
  (***  first, make a list of best pairwise matches, removing matches
	***  that conflict with better matches; then pairwise-align those, 
	***  one by one; then call self again.  If null list, quit. *)

  let seqhead = List.hd seql in
  let seqlist = List.tl seql in

  let find_neigh = function
    | None -> seqhead 
    | Frag a -> a
  in 

  let update_limits seqfrag = (* updates limits for consistent alignment *)
    let rec sweep_limits frag l_stop r_start m =
      let lim = seqfrag.limits in
        if frag==seqhead then () 
        else
          let nextfrag = find_neigh (Hashtbl.find frag.neighbours (m,'r')) in
          let thislabel = Hashtbl.find frag.seqlabel m in
          let fix_limits n =
            ( ( if thislabel <= l_stop then
                  let newlim = 
		    try Hashtbl.find lim (n,'r') 
		    with _ -> "0.3"
		  in
		  let oldlim = 
		    try Hashtbl.find frag.limits (n,'r')
		    with _ -> "0.3"
		  in 
                    if oldlim > newlim
                    then Hashtbl.replace frag.limits (n,'r') newlim);
              ( if thislabel >= r_start then
                  let newlim = 
		    try Hashtbl.find lim (n,'l') 
		    with _ -> "0"
		  in
		  let oldlim =
		    try Hashtbl.find frag.limits (n,'l')
		    with _ -> "0"
		  in 
                    if oldlim < newlim
                    then Hashtbl.replace frag.limits (n,'l') newlim))
          in 
            (IntSet.iter fix_limits seqnumset;
             sweep_limits nextfrag l_stop r_start m)

    in (
        IntSet.iter  (* Set "diagonal" limits to self *)
          (fun n -> 
             (Hashtbl.replace seqfrag.limits (n,'l') 
		(Hashtbl.find seqfrag.seqlabel n) ;
              Hashtbl.replace seqfrag.limits (n,'r') 
		(Hashtbl.find seqfrag.seqlabel  n)))
          seqfrag.seqindex;
        IntSet.iter (*   replace each limit with most stringent limit of
                     ***  a neighbouring fragment *)
          (fun n -> 
             ( IntSet.iter 
                 (fun m ->
                    (let fragl = find_neigh 
		       (Hashtbl.find seqfrag.neighbours (m,'l')) in
                     let limn = 
		       try Hashtbl.find fragl.limits (n,'l') 
		       with _ -> "0"
		     in
		     let seqfraglim = 
		       try Hashtbl.find seqfrag.limits (n,'l')
		       with _ -> "0"
		     in 
                       if limn > seqfraglim
                       then Hashtbl.replace seqfrag.limits (n,'l') limn );
                    (let fragr = find_neigh 
		       (Hashtbl.find seqfrag.neighbours (m,'r')) in
                     let limn = 
		       try Hashtbl.find fragr.limits (n,'r') 
		       with _ -> "0.3" 
		     in
		     let seqfraglim = 
		       try Hashtbl.find seqfrag.limits (n,'r') 
		       with _ -> "0.3"
		     in 
                       if limn < seqfraglim
                       then Hashtbl.replace seqfrag.limits (n,'r') limn )
                 ) seqfrag.seqindex )
          ) seqnumset;
        let do_sweep n =
          let initfrag = (List.nth seqlist n) in
	  let llimit =
	    try Hashtbl.find seqfrag.limits (n,'l')
	    with _ -> "0"
	  in 
	  let rlimit = 
	    try Hashtbl.find seqfrag.limits (n,'r')
	    with _ -> "0.3"
	  in 
            sweep_limits initfrag llimit rlimit n
        in 
          IntSet.iter do_sweep seqnumset
      )
  in 

  let inconsistent_frags frag1 frag2  =
    let rec inconsistent_indices indexlist1 indexlist2 =
      (***  check that the frags don't lie outside allowed ranges for
	    ***  one another *)
      let rec inconsistent_indices_onepair index1 indexlist2 =
        match indexlist2 with
            [] -> false
          | h::t -> 
	      let lim2l =
		try Hashtbl.find frag2.limits (index1,'l') 
		with _ -> "0"
	      in 
	      let lim2r =
		try Hashtbl.find frag2.limits (index1,'r') 
		with _ -> "0.3"
	      in 
	      let lim1l = 
		try Hashtbl.find frag1.limits (h,'l')
		with _ -> "0"
	      in 
	      let lim1r =
		try Hashtbl.find frag1.limits (h,'r')
		with _ -> "0.3"
	      in 
                if (compare (Hashtbl.find frag1.seqlabel index1) lim2l <= 0) 
                  || (compare (Hashtbl.find frag1.seqlabel index1) lim2r >= 0)
                  || (compare (Hashtbl.find frag2.seqlabel h ) lim1l <= 0)
                  || (compare (Hashtbl.find frag2.seqlabel h) lim1r >= 0)
                then true
                else inconsistent_indices_onepair index1 t
      in 
        match indexlist1 with
            [] -> false
          | h::t -> 
              if inconsistent_indices_onepair h indexlist2
              then true
              else inconsistent_indices t indexlist2
    in inconsistent_indices (IntSet.elements frag1.seqindex) 
         (IntSet.elements frag2.seqindex) 
  in 


  let rec make_block_list fraglist outlist =
    (***  make a list of best matches, to be traversed by fuse_block_list *)
    (***  note: at least one of the frags being paired must have 
	  ***  "paired"=true *)
    let rec one_block frag rest_of_list outlist = 
      match rest_of_list with
          [] -> outlist
        | a::t -> 
	    if 
	      (frag.aseq != "") && (a.aseq != "")	      
            then 
	      let n1,n2,l,p = 
		best_match frag.aseq a.aseq frag.abgw a.abgw
	      in
                if p<1.0 then
		  let s1 = IntSet.elements frag.seqindex in
		  let s2 = IntSet.elements a.seqindex in
		    one_block frag t ((p,frag,n1,s1,a,n2,s2,l)::outlist)
                else 
                  one_block frag t outlist
	    else 
	      one_block frag t outlist
    in 
      match fraglist with
          [] -> outlist
        | a::t -> 
	    let t1 = 
	      List.filter (
		fun f -> not (inconsistent_frags a f)) t in
	      make_block_list t (one_block a t1 outlist)
  in 


  let fuse_block_list blocklist =

    let rec firstfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
      if n1 >= (String.length frag1.aseq) 
      then
        let frag1b= find_neigh (Hashtbl.find frag1.neighbours (List.hd s1,'r'))
        in
          firstfrag (p,frag1b,n1-(String.length frag1.aseq), s1,
                     frag2,n2,s2,l)
      else if n2 >= (String.length frag2.aseq)
      then
        let frag2b= find_neigh (Hashtbl.find frag2.neighbours (List.hd s2,'r'))
        in
          firstfrag (p,frag1,n1,s1,
                     frag2b,n2-(String.length frag2.aseq), s2,l)
      else (p,frag1,n1,s1,frag2,n2,s2,l)
    in 

    let nextfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
      let f1gap = ((String.length frag1.aseq) - n1) in
      let f2gap = ((String.length frag2.aseq) - n2) in
	if (f1gap < f2gap) then
	  let frag1b = 
	    find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
	  let lb = l - f1gap in
	    (p,frag1,n1,s1,frag2,n2,s2,f1gap), 
	    (p,frag1b,0,s1,frag2,n2+f1gap,s2,lb)
	else if (f1gap > f2gap) then 
	  let frag2b =
	    find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
	  let lb = l - f2gap in
	    (p,frag1,n1,s1,frag2,n2,s2,f2gap), 
	    (p,frag1,n1+f2gap,s1,frag2b,0,s2,lb)
	else
	  let frag1b = 
	    find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
	  let frag2b = 
	    find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
	  let lb = l - f1gap in
            (p,frag1,n1,s1,frag2,n2,s2,f1gap), 
            (p,frag1b,0,s1,frag2b,0,s2,lb)
    in 

    let rec expand_block pair1 outlist =
      let (p,frag1,n1,s1,frag2,n2,s2,l) = pair1 in
        (***  if the block, as annotated, is no longer in one fragment because
              ***  of other fusings, then convert this block into a list of
              ***  single-fragment blocks *)
        if inconsistent_frags frag1 frag2
        then []
        else
          if String.length (frag1.aseq) > n1+l-1 &&
            String.length (frag2.aseq) > n2+l-1 
          then (pair1::outlist)
          else 
            if frag1=seqhead || frag2=seqhead then
              (print_endline "problem";
               outlist)
            else
              let pair1,pair2 = nextfrag pair1 in
                expand_block pair2 (pair1::outlist)
    in 

    let fuse_one_block (p,frag1,n1,s1,frag2,n2,s2,l) =
      let consensus s1 s2 w1 w2 =
        for i = 0 to (String.length s1) -1 do
          if s1.[i] != s2.[i] then ( s1.[i] <- 'N'; w1.(i) <- 0.)
        done;
        s1, w1
      in 
	(***  fuse the two frags; it is assumed that expand_block has
              ***  already been called so n1+l and n2+l won't exceed the length
              ***  of the frag *)
	(***  divide frag1 and frag2 into frag1, frag2, frag3, frag4, frag5
              ***  where frag3 is the "shared piece" so the original frags are
              ***  split into frag1->frag3->frag4 and frag2->frag3->frag5 *)

	(
          let tempstr1=String.copy frag1.aseq in
          let tempstr2=String.copy frag2.aseq in 
	  let tempbgw1=Array.copy frag1.abgw in
	  let tempbgw2=Array.copy frag2.abgw in 
	    (
              frag1.aseq <- String.sub tempstr1 0 n1 ;
              frag2.aseq <- String.sub tempstr2 0 n2 ;
	      frag1.abgw <- Array.sub tempbgw1 0 n1;
	      frag2.abgw <- Array.sub tempbgw2 0 n2;
              let frag3 = 
		let as3, aw3 = 
		  consensus (String.sub tempstr1 n1 l) 
		    (String.sub tempstr2 n2 l) (Array.sub tempbgw1 n1 l) 
		    (Array.sub tempbgw2 n2 l)
		in 
		  { header = ""; 
                    aseq = as3;
		    abgw = aw3;
		    seq = Hashtbl.copy frag1.seq;
                    seqindex=IntSet.union frag1.seqindex frag2.seqindex;
                    seqlabel= Hashtbl.copy frag1.seqlabel;
                    neighbours=Hashtbl.copy frag1.neighbours;
                    limits = Hashtbl.copy frag1.limits
                  } in 
              let frag4 = { header = ""; 
                            aseq = String.sub tempstr1 (n1+l) ((String.length tempstr1)-n1-l);
			    abgw = Array.sub tempbgw1 (n1+l) ((String.length tempstr1)-n1-l);
			    seq = Hashtbl.copy frag1.seq;
                            seqindex= IntSet.union frag1.seqindex IntSet.empty;
                            seqlabel= Hashtbl.copy frag1.seqlabel;
                            neighbours= Hashtbl.copy frag1.neighbours;
                            limits = Hashtbl.copy frag1.limits
                          } in 
              let frag5 = { header = ""; 
                            aseq = String.sub tempstr2 (n2+l) ((String.length tempstr2)-n2-l);
                            abgw = Array.sub tempbgw2 (n2+l) ((String.length tempstr2)-n2-l);
			    seq = Hashtbl.copy frag2.seq;
                            seqindex=IntSet.union frag2.seqindex IntSet.empty;
                            seqlabel= Hashtbl.copy frag2.seqlabel;
                            neighbours= Hashtbl.copy frag2.neighbours;
                            limits = Hashtbl.copy frag2.limits
                          } 
              in (
                  IntSet.iter 
                    (fun n -> (Hashtbl.replace frag3.neighbours (n,'l') 
				 (Frag frag1);
                               Hashtbl.replace frag3.neighbours (n,'r') 
				 (Frag frag4);
                               Hashtbl.replace frag1.neighbours (n,'r') 
				 (Frag frag3);
                               Hashtbl.replace frag4.neighbours (n,'l') 
				 (Frag frag3);

			       Hashtbl.replace frag3.seq n
				 (String.sub (Hashtbl.find frag1.seq n) n1 l);
			       Hashtbl.replace frag4.seq n
				 (String.sub (Hashtbl.find frag1.seq n) 
				    (n1+l) ((String.length tempstr1)-n1-l));
			       Hashtbl.replace frag1.seq n
				 (String.sub (Hashtbl.find frag1.seq n) 0 n1);

                               Hashtbl.replace frag3.seqlabel n 
				 ((Hashtbl.find frag1.seqlabel n) ^ "1");
                               Hashtbl.replace frag4.seqlabel n  
				 ((Hashtbl.find frag4.seqlabel n) ^ "2");
                               Hashtbl.replace frag1.seqlabel n  
				 ((Hashtbl.find frag1.seqlabel n) ^ "0");
                              )) frag1.seqindex;
                  IntSet.iter 
                    (fun n -> (Hashtbl.replace frag3.neighbours (n,'l') 
				 (Frag frag2);
                               Hashtbl.replace frag3.neighbours (n,'r') 
				 (Frag frag5);
                               Hashtbl.replace frag2.neighbours (n,'r') 
				 (Frag frag3);
                               Hashtbl.replace frag5.neighbours (n,'l') 
				 (Frag frag3);

			       Hashtbl.replace frag3.seq n
				 (String.sub (Hashtbl.find frag2.seq n) n2 l);
			       (try
				  Hashtbl.replace frag5.seq n
				    (String.sub (Hashtbl.find frag2.seq n) 
				       (n2+l) ((String.length tempstr2)-n2-l))
				with
				    _ -> 
				      print_endline (Hashtbl.find frag2.seq n)
			       ); 
			       Hashtbl.replace frag2.seq n
				 (String.sub (Hashtbl.find frag2.seq n) 0 n2);

                               Hashtbl.replace frag3.seqlabel n 
				 ((Hashtbl.find frag2.seqlabel n) ^ "1");
                               Hashtbl.replace frag5.seqlabel n 
				 ((Hashtbl.find frag5.seqlabel n) ^ "2");
                               Hashtbl.replace frag2.seqlabel n 
				 ((Hashtbl.find frag2.seqlabel n) ^ "0")
                              )) frag2.seqindex;

                  seqset := SeqSet.add frag3 !seqset;
                  seqset := SeqSet.add frag4 !seqset;
                  seqset := SeqSet.add frag5 !seqset;
                  update_limits frag3;
		)
            )
	)
    in 

    let fuse_one_frag f = 
      let expanded = expand_block (firstfrag f) [] in
        if (List.for_all 
              (fun (p,frag1,n1,s1,frag2,n2,s2,l) ->
                 not (inconsistent_frags frag1 frag2))
              expanded) 
        then 
          List.iter fuse_one_block expanded
    in 
      List.iter fuse_one_frag blocklist
  in 
  let fraglist = SeqSet.elements !seqset in
  let blocklist =List.sort compare (make_block_list fraglist []) in
    match blocklist with 
        [] -> []
      | _ -> (fuse_block_list blocklist;
	      let (_,_,_,_,_,_,_,maxl) = List.hd blocklist in
		blockseq seql seqnumset maxl maxw)
;;

(***                                                              ***)
(***  Create aligned sequence out of the bunch of fragments       ***)
(***  produced by blockseq                                        ***)
(***                                                              ***)


let rec assemble_frags_to_seqs seqhead seqlist foutlist  =

  let is_complete frag = (*   if frag is multi-sequence, all copies
                          *** are at head of seqlist *)
    if frag == seqhead then false else
    if IntSet.cardinal frag.seqindex = 1 then true else
      let nfragmatches =
        List.fold_left 
          (fun n fr -> 
	     if IntSet.compare frag.seqindex fr.seqindex = 0
             then n+1 else n) 0 seqlist
      in 
        if nfragmatches = IntSet.cardinal frag.seqindex
        then true else false
  in 

  let no_other_blocks frag = (*   No other complete frag has a smaller
                              *** seqindex in seqlist *)
    if IntSet.cardinal frag.seqindex = 1 then true else
      List.fold_left 
        (fun b fr -> 
           if (IntSet.cardinal fr.seqindex > 0) &&
             (IntSet.compare fr.seqindex frag.seqindex = -1) &&
             is_complete fr
           then false else b) true seqlist
  in 

  let maxlen fragwidth seqindex = (*    For a complete multi-sequence
                                   ***  frag, it must be aligned when
                                   ***  tacked on, and can't overlap an
                                   ***  existing sequence; so find
                                   ***  "padding" length *)
    let maxlen_blockseqs =
      IntSet.fold (fun idx n ->
                     let fout = (List.nth foutlist idx) in
                       if Buffer.length fout.fseq > n
                       then Buffer.length fout.fseq else n) seqindex 0
    in 
    let push_up fs m =
      let m1 = ref m in
        (while 
           (fs.[!m1] =Char.lowercase fs.[!m1]) do
           m1 := !m1 + 1
         done; 
         while (!m1 < String.length fs)
           &&(fs.[!m1] <> Char.lowercase fs.[!m1]) do
           m1 := !m1 + 1
         done; 
         !m1)
    in 
    let rec overlap fs m1 n =
      if n = fragwidth then false
      else if (m1+n) >= String.length fs then false
      else if fs.[m1+n] <> Char.lowercase fs.[m1+n] then true
      else overlap fs m1 (n+1)
    in 
    let mx = ref maxlen_blockseqs in
      (  for n = 0 to (List.length foutlist - 1) do
         let fs = Buffer.contents (List.nth foutlist n).fseq in
           (while overlap fs !mx 0 do
              mx := push_up fs !mx
            done ) 
       done;
       !mx )
  in 

  let find_neigh neigh1 =
    match neigh1 with
        None -> seqhead 
      | Frag a -> a
  in 

  let rec process_frag_list seql foutl newseqlist n oldmaxlen =
    match seql with
        [] -> List.rev newseqlist
      | frag::seqt ->
	  let fragseq = Hashtbl.find frag.seq n in
          let fout = List.hd foutl in
            if (is_complete frag) && (no_other_blocks frag) then
              ( if IntSet.cardinal frag.seqindex = 1 then
                  (Buffer.add_string fout.fseq fragseq;
                   let nextfrag = find_neigh 
		     (Hashtbl.find frag.neighbours (n,'r')) in
                     process_frag_list seqt (List.tl foutl) 
                       (nextfrag::newseqlist) (n+1) oldmaxlen
                  )
                else (
                  if oldmaxlen = 0 && 
                    (Buffer.length fout.fseq > 0 || 
                     n = List.hd (IntSet.elements frag.seqindex)) then (
                    (*** Find padding value, unless we're just starting ***)
                    (*** If just starting, make sure this is the first  ***)
                    (*** in the block                                   ***)
                    let newmaxlen = (maxlen (String.length fragseq) 
                                       frag.seqindex) in
                      while Buffer.length fout.fseq < newmaxlen do
                        Buffer.add_char fout.fseq '-'
                      done
                  ) else
                    while Buffer.length fout.fseq < oldmaxlen do
                      Buffer.add_char fout.fseq '-'
                    done;
                  Buffer.add_string fout.fseq (String.uppercase fragseq);
                  let nextfrag 
		      = find_neigh (Hashtbl.find frag.neighbours (n,'r'))
                  in 
                    process_frag_list seqt (List.tl foutl) 
                      (nextfrag::newseqlist) (n+1)
                      ((Buffer.length fout.fseq) - (String.length fragseq))
                )
              )
            else 
              process_frag_list seqt (List.tl foutl)
                (frag::newseqlist) (n+1) oldmaxlen
  in 
  let newseqlist = process_frag_list seqlist foutlist [] 0 0
  in 
  let end_list =
    List.fold_left 
      (fun b fr -> if fr==seqhead then b else false) true newseqlist
  in
    if end_list
    then foutlist
    else 
      assemble_frags_to_seqs seqhead newseqlist foutlist
;;

(***                                                              ***)
(***                                                              ***)
(***                                                              ***)

let print_aligned foutlist = 
  let print_one_line foutlist n poslist =
    let print_one_line_seq fout pos =
      let newpos = ref pos in
        for m = 0 to 49 do 
          ( ( if m mod 50 = 0 then
                let head = 
                    if String.length fout.fheader < 11 
                    then String.sub fout.fheader 1 
                      (String.length fout.fheader -1)
                    else String.sub fout.fheader 1 10 
                in 
                  ( print_string head;
                    (if (String.length head < 10) then
                       print_string 
                         (String.make (10-(String.length head)) ' '));
                    print_string "  "
                  )
              else if ((m) mod 10) = 0 then 
                print_string " "
            );
            (let ch = 
               if n+m < Buffer.length fout.fseq
               then (Buffer.contents fout.fseq).[m+n] 
               else ' ' in
               (print_char ch;
                if ch = '-' or ch = ' ' then
                  ()
                else 
                  newpos := !newpos+1
               )
            );
            if ((m+1) mod 50) = 0 then (
              print_string "  ";
              print_int !newpos;
              print_string "\n";
            )) 
        done;
        !newpos
    in 
      List.map2 print_one_line_seq foutlist poslist
  in              
    
  let rec print_aligned foutlist n poslist =
    (print_endline "";
     if (List.fold_left (fun b f -> 
                           if n < (Buffer.length f.fseq) then true else b) 
           false foutlist) then 
       print_aligned foutlist (n+50) (print_one_line foutlist n poslist)
     else () 
    )
  in 
  let poslist = List.map (fun f -> 0) foutlist
  in print_aligned foutlist 0 poslist;;

(***                                                              ***)
(***                                                              ***)
(***                                                              ***)

let print_fasta foutlist =
  List.iter 
    (fun f ->
       print_endline f.fheader;
       print_endline (Buffer.contents f.fseq);
       print_string "\n"
    ) foutlist ;;

(***                                                              ***)
(***                                                              ***)
(***                                                              ***)
(***                                                              ***)

(***  main program below *)

let filenameset = ref [] ;;
let outtype = ref "";;

let argspeclist = 
  [("-A", Arg.Unit 
      (fun () -> outtype :=  !outtype^"A"), 
      "aligned output (compare -F option) (default: only this)");
   ("-b", Arg.Set_string auxfilename, "name of auxiliary file from which "
      ^ "to read background sequences (overridden by -B)");
   ("-B", Arg.Set_string bgfilename, "name of file containing background probabilities");
   ("-F", Arg.Unit 
      (fun () -> outtype := !outtype^"F"), 
      "fasta output (can use both -A and -F in either order)");
   ("-n", Arg.Set_int correlbg, "background correlation (default 2="
      ^ "dinucleotide;\n     1=single-site basecounts, 0=0.25 per base)");
   ("-x", Arg.Set_float prand, "set limit for how probable the match "
      ^ "is by chance\n     (default 0.002, smaller=more stringent)")]
in 
let message = 
  String.concat ""
    ["Sigma, version 1.0: simple greedy multiple alignment\n";
     "Copyright (C) 2006 Rahul Siddharthan <rsidd@imsc.res.in>\n";
     "Licensed under the GNU General Public License, version 2\n\n";
     "Usage: sigma [options] inputfile.fasta [inputfile2.fasta ...]\n\n";
     "Each fasta file may contain a single sequence or multiple sequences;\n";
     "all sequences will be aligned together.\n\n";
     "Options:"]
in                 
  Arg.parse argspeclist (fun s -> filenameset := !filenameset @ [s]) message;;


if !outtype = "" then outtype := "A";;


if !filenameset = [] then
  (Printf.fprintf stderr "Must specify name of file(s) in fasta format (or use -help)\n";
   exit 1);;


if !correlbg > 2 then 
  (print_endline "-n > 2 not implemented; using 2";
   correlbg := 2)
else if !correlbg < 0 then
  (print_endline "-n should be >= 0; assuming 0";
   correlbg := 0)
;;

let gotbgprobs = ref (!correlbg = 0);;

if !bgfilename > "" then
  try 
    ( getbgprobs_from_bgfile !bgfilename;
      normalise_bgprobs ();
      gotbgprobs := true 
    )
  with _ -> 
    Printf.fprintf stderr "Couldn't read bgprobs from file %s\n" !bgfilename
;;

if (not !gotbgprobs) then (
  let auxseqlist = 
    match !auxfilename with
	"" -> []
      | _ -> 
          try readfasta !auxfilename with
              _ -> (Printf.fprintf stderr 
		      "Couldn't open file %s : using input sequences\n" 
		      !auxfilename;
                    [])
  in 
    if auxseqlist != [] then
      ( getbgprobs auxseqlist;
        normalise_bgprobs () ;
	gotbgprobs := true
      )
)
;;

let seqlist = filename2seqlist !filenameset !gotbgprobs ;;

match List.length seqlist with
    1 -> (Printf.fprintf stderr "Error: no sequences read\n"; exit 1)
  | 2 -> (Printf.fprintf stderr 
            "Only one sequence read, no alignment needed\n"; exit 0)
  | _ -> ()
;;

let nel=(List.length seqlist) - 2 in
  for n = 0 to nel do
    let h = List.nth seqlist (n+1) in
      ( h.seqindex <- IntSet.add n IntSet.empty ;
        Hashtbl.replace h.neighbours (n,'l') (Frag (List.hd seqlist));
        Hashtbl.replace h.neighbours (n,'r') (Frag (List.hd seqlist));
        Hashtbl.replace h.seqlabel n "0.";
        Hashtbl.replace (List.hd seqlist).seqlabel n "-1";
        Hashtbl.replace h.seq n (String.copy h.aseq);
	(* h.abgw <- getwts h.aseq; *)
	(* h.abgw <- Array.create (String.length h.aseq) 0.0; *)
        Hashtbl.replace (List.hd seqlist).seq n "";
      )
  done;;

seqset := makefastaset seqlist;

let seqnumset =
  let rec range n r = 
    if n<0 then r else range (n-1) (IntSet.add n r)
  in range (List.length seqlist - 2) IntSet.empty
in blockseq seqlist seqnumset 10000 0.;;

let fout = List.map (fun fr -> {fheader=fr.header; 
                                fseq = Buffer.create 1000} ) 
             (List.tl seqlist) ;;

assemble_frags_to_seqs (List.hd seqlist) (List.tl seqlist) fout ;;

String.iter (fun c -> if c='A' then print_aligned fout else
               print_fasta fout) !outtype;;