(*** SiGMA : Simple Greedy Multiple Alignment *** Version 1.0 (March 11, 2006) *** Copyright (C) Rahul Siddharthan, 2006 *** *** Requires ocaml 3.x (http://caml.inria.fr) *** When compiling, link the str module as follows: *** bytecode compiler -> ocamlc str.cma sigma.ml -o sigma.bytecode *** native-code compiler -> ocamlopt str.cmxa sigma.ml -o sigma *** *** (native is around 7x-10x faster, but no debugger support) *** *** To produce a static binary on linux (so that it's independent of *** your machine's particular version of libc), use *** ocamlopt -cc "gcc -static" str.cmxa sigma.ml -o sigma ***) (*** Strategy: find most significant common substring between two strings *** (mutations may exist but are taken care of in the scoring) *** and "align" them. Then next largest common substring among *** existing fragments that may be "consistently" aligned. Each time *** a common substring is found it is "fused" into one fragment (with *** multiple associated sequence numbers). Repeat until nothing left. *** *** Actually, for reasons of efficiency a list of "best matches" is made *** and aligned with consistency checks, rather than doing it one *** match at a time, but that's a detail. ***) module IntSet = Set.Make (struct type t=int let compare=compare end ) (*** the main data type, the sequence fragment ***) type neigh_var = None | Frag of seq_frag and seq_frag = { header : string ; seq : (int, string) Hashtbl.t ; mutable aseq: string; mutable abgw: float array; mutable seqindex : IntSet.t; seqlabel: (int, string) Hashtbl.t; neighbours: (int * char, neigh_var) Hashtbl.t; limits : (int * char, string) Hashtbl.t } (*** Notes: *** *** 1. seq is a hashtbl keyed by sequence index n, giving corresponding *** sequence, since after alignment there may be mismatches. *** *** 2. aseq is a string giving the consensus of all aligned strings. *** For now, a mismatch will be marked as N and subsequent bases *** aligned to it will get a score zero. *** *** 3. seqindex is the original sequence to which the current sequence *** fragment belongs; a set containing initially a single number, but, *** as pairs are made, possibly multiple numbers *** *** 4. limits is a hashtbl keyed with pairs of sequence numbers and chars *** (n,c='l'/'r') mapping to the legal start (l) or end (r) position on *** sequence n for this fragment. *** *** 5. seqlabel is an array whose nth element is the label of that *** fragment on sequence n, a string representation of a float between 0 *** and 1 (actually between 0 and 0.3). Each fragment on each sequence *** will have a unique label. *** *** 6. neighbours is a hashtbl that takes as keys a pair of the *** sequence number and a char 'l'/'r' and outputs a ref to the *** neighbouring frag on that seq in that direction *** *** 7. abgw is the background weight, the log of the (conditional) *** background probability for that base divided by 0.25. *** *** The sequence starts in two datastructures, a list (whose first *** element is a dummy) and a set that contains all the "real" elements. *** At each alignment step two sequences (elements of the set) are *** broken up into 5 sequences (a shared portion and the non-shared *** boundaries, which may be zero-length). The original list always *** contains the starting element, with pointer to the next fragment. *** It's like a doubly-linked list (actually, a set of lists with *** crosslinks, if you like) with end fragments pointing to a *** "dummy" fragment. ***) type fasta_seq = { fheader : string; mutable fseq: Buffer.t} (*** Global variables ***) let prand = ref 0.002 let correlbg = ref 2 let bgfilename = ref "" let auxfilename = ref "" let singlesite = ref (Array.make 4 0.0) let dinucleotide = ref (Array.make_matrix 4 4 0.0) (*** ***) (*** ***) (*** ***) (*** best_match: finds the most significant common substring with *** mismatches. The significance is basically the background *** probability of the string multiplied by the probability of *** seeing a match of that quality in two long strings of the given *** lengths. The probability for fragment size l with string lengths *** l1 and l2 is 1-(1-p)^[(l1-l+1)(l2-l+1)] = approx. p*(l1-l+1)*(l2-l+1) *** where p = bg prob * (l choose m), where m = number of mismatches. *** Lowest answer = most significant. ***) let best_match s1 s2 w1 w2 = let sl1 = String.length s1 in let sl2 = String.length s2 in let fmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 1.0 in let lmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in let mmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in let best = ref 1.0 in let bestm = ref 0 in let bestn = ref 0 in let lbest = ref 0 in for m = 1 to sl1 do for n = 1 to sl2 do let a1 = s1.[m-1] in let b1 = s2.[n-1] in let f_old = fmat.(m-1).(n-1) in let m_old = mmat.(m-1).(n-1) in let l_new = lmat.(m-1).(n-1) + 1 in let m_new, f_new = if (a1=b1) && (a1 != 'N') then m_old, f_old *. sqrt(w1.(m-1)*.w2.(n-1)) *. (float l_new) /. (float (l_new-m_old)) else m_old + 1, f_old *. (float l_new) /. (float (m_old + 1)) in let (f_new, l_new, m_new) = (min (f_new,l_new,m_new) (1.0,0,0)) in ( fmat.(m).(n) <- f_new; lmat.(m).(n) <- l_new; mmat.(m).(n) <- m_new; if (f_new < !best) then ( best := f_new; bestm := m; bestn := n) ) done done; while (fmat.(!bestm - !lbest).(!bestn - !lbest) < 1.0) && (!lbest < !bestm) && (!lbest < !bestn) do lbest := !lbest + 1 done; best := !best *. (float (sl1 - !lbest+1)) *. (float (sl2 - !lbest+1)); if (!best < !prand) then !bestm - !lbest, !bestn - !lbest, !lbest, !best else 0,0,0,1.0 ;; (*** ***) (*** ***) (*** ***) let compareSeq seq1 seq2 = (*** if seqindex is same in both cases then check fragment number *** from "seqlabel", otherwise IntSet.compare *) (*** this is needed to create the module SeqSet, but isn't used, *** except to the extent that this should not return 0 unless *** two frags really are identical, which should not happen *) match (IntSet.compare seq1.seqindex seq2.seqindex ) with 0 -> ( let rec complabel = function | [] -> 0 | a::t -> match (compare (Hashtbl.find seq1.seqlabel a) (Hashtbl.find seq2.seqlabel a)) with 0 -> complabel t | n -> n in complabel (IntSet.elements seq1.seqindex)) | n -> n ;; module SeqSet = Set.Make (struct type t=seq_frag let compare=compareSeq end );; let seqset = ref SeqSet.empty ;; (*** ***) (*** ***) (*** ***) let readlines_noblank ffilename = (* works like python's readlines *) let fchan = open_in ffilename in let lenstr = in_channel_length fchan in let res = String.create lenstr in really_input fchan res 0 lenstr ; Str.split (Str.regexp "\\(\r\n\\|\n\\)+" ) res ;; let newfseq h = { header = h; seq = Hashtbl.create 5; aseq = "" ; abgw = Array.create 0 0.0; seqindex = IntSet.empty; seqlabel = Hashtbl.create 5; neighbours = Hashtbl.create 10; limits = Hashtbl.create 5 };; let readfasta filename = (* reads a fasta sequence, returns it as a list *** with a dummy first element used as a marker *) let flines = readlines_noblank filename in let rec flines2flist flist flines fcurr ln = if List.length flines = 0 then fcurr::flist else match (List.hd flines).[0] with '>' -> let fcurr2 = newfseq (List.hd flines) in flines2flist (fcurr::flist) (List.tl flines) fcurr2 (ln+1) | ';'|'#' -> flines2flist flist (List.tl flines) fcurr ln | _ -> let fcurr2 = { header = fcurr.header; seq=fcurr.seq; aseq = fcurr.aseq ^ (String.lowercase (List.hd flines)) ; abgw = Array.create 0 0.0; seqlabel= fcurr.seqlabel ; neighbours = fcurr.neighbours; seqindex = fcurr.seqindex; limits = fcurr.limits } in flines2flist flist (List.tl flines) fcurr2 ln in List.rev (flines2flist [] flines (newfseq "" ) (-1));; let makefastaset seqlist = let rec list2set a s = match a with [] -> s | h::t -> list2set t (SeqSet.add h s) in list2set (List.tl seqlist) (SeqSet.empty);; let basenum = function | 'a' | 'A' -> 0 | 'c' | 'C' -> 1 | 'g' | 'G' -> 2 | 't' | 'T' -> 3 | _ -> -1 ;; let getbgprobs_from_bgfile bgfile = (* read singlesite and dinucleotide from bgfile *) let bglines = readlines_noblank bgfile in let line2val line1 = let line1s=Str.split (Str.regexp "[ \t]+") line1 in if line1s > [] && line1.[0]!= '#' && (String.length (List.hd line1s) < 3) then let bs = List.hd line1s in if (String.length bs = 1) then !singlesite.(basenum bs.[0]) <- (float_of_string (List.nth line1s 1)) else !dinucleotide.(basenum bs.[0]).(basenum bs.[1]) <- (float_of_string (List.nth line1s 1)) else () in List.iter line2val bglines;; let getbgprobs auxseqlist = (*** assume !singlesite, !dinucleotide are initialised to zero ***) List.iter (fun f -> for n = 0 to (String.length f.aseq) - 1 do let c = basenum f.aseq.[n] in if c >= 0 then (!singlesite.(c) <- !singlesite.(c) +. 1.0; !singlesite.(3-c) <- !singlesite.(3-c) +. 1.0; if n>0 then let c1 = basenum f.aseq.[n-1] in if c1 >=0 then (!dinucleotide.(c1).(c) <- !dinucleotide.(c1).(c) +. 1.0; ) ) done ) auxseqlist ;; let normalise_bgprobs () = (let tots = ref 0.0 in (for n = 0 to 3 do tots := !tots +. !singlesite.(n) done; for n = 0 to 3 do !singlesite.(n) <- !singlesite.(n) /. !tots done ) ); for n = 0 to 3 do let totd = ref 0.0 in (for m = 0 to 3 do totd := !totd +. !dinucleotide.(n).(m) done; for m = 0 to 3 do !dinucleotide.(n).(m) <- !dinucleotide.(n).(m) /. !totd done ) done ;; let getwts s = let w = Array.make (String.length s) 0.0 in ( for n = 0 to (String.length s) - 1 do if !correlbg = 0 then w.(n) <- 0.25 else let cn = basenum s.[n] in if cn<0 then w.(n) <- 1. else if n=0 then w.(n) <- !singlesite.(cn) else let cn1 = basenum s.[n-1] in if cn1 < 0 || !correlbg = 1 then w.(n) <- !singlesite.(cn) else w.(n) <- !dinucleotide.(cn1).(cn) done; w ) ;; let filename2seqlist filenameset gotbgprobs = let fastalist1 = try readfasta (List.hd filenameset) with _ -> ( Printf.fprintf stderr "Error: unable to read file %s\n" (List.hd filenameset); exit 1) in let fastalist2l = List.map (fun filename -> try readfasta filename with _ -> ( Printf.fprintf stderr "Error: unable to read file %s\n" filename; exit 1) ) (List.tl filenameset) in let makeseqlist = function | [] -> [] | h::t -> List.fold_left (fun n m -> n @ (List.tl m)) (List.tl h) t in let fastalist = fastalist1 @ (makeseqlist fastalist2l) in ( if gotbgprobs then () else ( getbgprobs fastalist; normalise_bgprobs () ); List.iter (fun f -> f.abgw <- getwts f.aseq) fastalist; fastalist ) ;; let rec blockseq seql seqnumset maxl maxw = (*** the main routine, recursively forms blocks until no more are *** possible *) (*** first, make a list of best pairwise matches, removing matches *** that conflict with better matches; then pairwise-align those, *** one by one; then call self again. If null list, quit. *) let seqhead = List.hd seql in let seqlist = List.tl seql in let find_neigh = function | None -> seqhead | Frag a -> a in let update_limits seqfrag = (* updates limits for consistent alignment *) let rec sweep_limits frag l_stop r_start m = let lim = seqfrag.limits in if frag==seqhead then () else let nextfrag = find_neigh (Hashtbl.find frag.neighbours (m,'r')) in let thislabel = Hashtbl.find frag.seqlabel m in let fix_limits n = ( ( if thislabel <= l_stop then let newlim = try Hashtbl.find lim (n,'r') with _ -> "0.3" in let oldlim = try Hashtbl.find frag.limits (n,'r') with _ -> "0.3" in if oldlim > newlim then Hashtbl.replace frag.limits (n,'r') newlim); ( if thislabel >= r_start then let newlim = try Hashtbl.find lim (n,'l') with _ -> "0" in let oldlim = try Hashtbl.find frag.limits (n,'l') with _ -> "0" in if oldlim < newlim then Hashtbl.replace frag.limits (n,'l') newlim)) in (IntSet.iter fix_limits seqnumset; sweep_limits nextfrag l_stop r_start m) in ( IntSet.iter (* Set "diagonal" limits to self *) (fun n -> (Hashtbl.replace seqfrag.limits (n,'l') (Hashtbl.find seqfrag.seqlabel n) ; Hashtbl.replace seqfrag.limits (n,'r') (Hashtbl.find seqfrag.seqlabel n))) seqfrag.seqindex; IntSet.iter (* replace each limit with most stringent limit of *** a neighbouring fragment *) (fun n -> ( IntSet.iter (fun m -> (let fragl = find_neigh (Hashtbl.find seqfrag.neighbours (m,'l')) in let limn = try Hashtbl.find fragl.limits (n,'l') with _ -> "0" in let seqfraglim = try Hashtbl.find seqfrag.limits (n,'l') with _ -> "0" in if limn > seqfraglim then Hashtbl.replace seqfrag.limits (n,'l') limn ); (let fragr = find_neigh (Hashtbl.find seqfrag.neighbours (m,'r')) in let limn = try Hashtbl.find fragr.limits (n,'r') with _ -> "0.3" in let seqfraglim = try Hashtbl.find seqfrag.limits (n,'r') with _ -> "0.3" in if limn < seqfraglim then Hashtbl.replace seqfrag.limits (n,'r') limn ) ) seqfrag.seqindex ) ) seqnumset; let do_sweep n = let initfrag = (List.nth seqlist n) in let llimit = try Hashtbl.find seqfrag.limits (n,'l') with _ -> "0" in let rlimit = try Hashtbl.find seqfrag.limits (n,'r') with _ -> "0.3" in sweep_limits initfrag llimit rlimit n in IntSet.iter do_sweep seqnumset ) in let inconsistent_frags frag1 frag2 = let rec inconsistent_indices indexlist1 indexlist2 = (*** check that the frags don't lie outside allowed ranges for *** one another *) let rec inconsistent_indices_onepair index1 indexlist2 = match indexlist2 with [] -> false | h::t -> let lim2l = try Hashtbl.find frag2.limits (index1,'l') with _ -> "0" in let lim2r = try Hashtbl.find frag2.limits (index1,'r') with _ -> "0.3" in let lim1l = try Hashtbl.find frag1.limits (h,'l') with _ -> "0" in let lim1r = try Hashtbl.find frag1.limits (h,'r') with _ -> "0.3" in if (compare (Hashtbl.find frag1.seqlabel index1) lim2l <= 0) || (compare (Hashtbl.find frag1.seqlabel index1) lim2r >= 0) || (compare (Hashtbl.find frag2.seqlabel h ) lim1l <= 0) || (compare (Hashtbl.find frag2.seqlabel h) lim1r >= 0) then true else inconsistent_indices_onepair index1 t in match indexlist1 with [] -> false | h::t -> if inconsistent_indices_onepair h indexlist2 then true else inconsistent_indices t indexlist2 in inconsistent_indices (IntSet.elements frag1.seqindex) (IntSet.elements frag2.seqindex) in let rec make_block_list fraglist outlist = (*** make a list of best matches, to be traversed by fuse_block_list *) (*** note: at least one of the frags being paired must have *** "paired"=true *) let rec one_block frag rest_of_list outlist = match rest_of_list with [] -> outlist | a::t -> if (frag.aseq != "") && (a.aseq != "") then let n1,n2,l,p = best_match frag.aseq a.aseq frag.abgw a.abgw in if p<1.0 then let s1 = IntSet.elements frag.seqindex in let s2 = IntSet.elements a.seqindex in one_block frag t ((p,frag,n1,s1,a,n2,s2,l)::outlist) else one_block frag t outlist else one_block frag t outlist in match fraglist with [] -> outlist | a::t -> let t1 = List.filter ( fun f -> not (inconsistent_frags a f)) t in make_block_list t (one_block a t1 outlist) in let fuse_block_list blocklist = let rec firstfrag (p,frag1,n1,s1,frag2,n2,s2,l) = if n1 >= (String.length frag1.aseq) then let frag1b= find_neigh (Hashtbl.find frag1.neighbours (List.hd s1,'r')) in firstfrag (p,frag1b,n1-(String.length frag1.aseq), s1, frag2,n2,s2,l) else if n2 >= (String.length frag2.aseq) then let frag2b= find_neigh (Hashtbl.find frag2.neighbours (List.hd s2,'r')) in firstfrag (p,frag1,n1,s1, frag2b,n2-(String.length frag2.aseq), s2,l) else (p,frag1,n1,s1,frag2,n2,s2,l) in let nextfrag (p,frag1,n1,s1,frag2,n2,s2,l) = let f1gap = ((String.length frag1.aseq) - n1) in let f2gap = ((String.length frag2.aseq) - n2) in if (f1gap < f2gap) then let frag1b = find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in let lb = l - f1gap in (p,frag1,n1,s1,frag2,n2,s2,f1gap), (p,frag1b,0,s1,frag2,n2+f1gap,s2,lb) else if (f1gap > f2gap) then let frag2b = find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in let lb = l - f2gap in (p,frag1,n1,s1,frag2,n2,s2,f2gap), (p,frag1,n1+f2gap,s1,frag2b,0,s2,lb) else let frag1b = find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in let frag2b = find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in let lb = l - f1gap in (p,frag1,n1,s1,frag2,n2,s2,f1gap), (p,frag1b,0,s1,frag2b,0,s2,lb) in let rec expand_block pair1 outlist = let (p,frag1,n1,s1,frag2,n2,s2,l) = pair1 in (*** if the block, as annotated, is no longer in one fragment because *** of other fusings, then convert this block into a list of *** single-fragment blocks *) if inconsistent_frags frag1 frag2 then [] else if String.length (frag1.aseq) > n1+l-1 && String.length (frag2.aseq) > n2+l-1 then (pair1::outlist) else if frag1=seqhead || frag2=seqhead then (print_endline "problem"; outlist) else let pair1,pair2 = nextfrag pair1 in expand_block pair2 (pair1::outlist) in let fuse_one_block (p,frag1,n1,s1,frag2,n2,s2,l) = let consensus s1 s2 w1 w2 = for i = 0 to (String.length s1) -1 do if s1.[i] != s2.[i] then ( s1.[i] <- 'N'; w1.(i) <- 0.) done; s1, w1 in (*** fuse the two frags; it is assumed that expand_block has *** already been called so n1+l and n2+l won't exceed the length *** of the frag *) (*** divide frag1 and frag2 into frag1, frag2, frag3, frag4, frag5 *** where frag3 is the "shared piece" so the original frags are *** split into frag1->frag3->frag4 and frag2->frag3->frag5 *) ( let tempstr1=String.copy frag1.aseq in let tempstr2=String.copy frag2.aseq in let tempbgw1=Array.copy frag1.abgw in let tempbgw2=Array.copy frag2.abgw in ( frag1.aseq <- String.sub tempstr1 0 n1 ; frag2.aseq <- String.sub tempstr2 0 n2 ; frag1.abgw <- Array.sub tempbgw1 0 n1; frag2.abgw <- Array.sub tempbgw2 0 n2; let frag3 = let as3, aw3 = consensus (String.sub tempstr1 n1 l) (String.sub tempstr2 n2 l) (Array.sub tempbgw1 n1 l) (Array.sub tempbgw2 n2 l) in { header = ""; aseq = as3; abgw = aw3; seq = Hashtbl.copy frag1.seq; seqindex=IntSet.union frag1.seqindex frag2.seqindex; seqlabel= Hashtbl.copy frag1.seqlabel; neighbours=Hashtbl.copy frag1.neighbours; limits = Hashtbl.copy frag1.limits } in let frag4 = { header = ""; aseq = String.sub tempstr1 (n1+l) ((String.length tempstr1)-n1-l); abgw = Array.sub tempbgw1 (n1+l) ((String.length tempstr1)-n1-l); seq = Hashtbl.copy frag1.seq; seqindex= IntSet.union frag1.seqindex IntSet.empty; seqlabel= Hashtbl.copy frag1.seqlabel; neighbours= Hashtbl.copy frag1.neighbours; limits = Hashtbl.copy frag1.limits } in let frag5 = { header = ""; aseq = String.sub tempstr2 (n2+l) ((String.length tempstr2)-n2-l); abgw = Array.sub tempbgw2 (n2+l) ((String.length tempstr2)-n2-l); seq = Hashtbl.copy frag2.seq; seqindex=IntSet.union frag2.seqindex IntSet.empty; seqlabel= Hashtbl.copy frag2.seqlabel; neighbours= Hashtbl.copy frag2.neighbours; limits = Hashtbl.copy frag2.limits } in ( IntSet.iter (fun n -> (Hashtbl.replace frag3.neighbours (n,'l') (Frag frag1); Hashtbl.replace frag3.neighbours (n,'r') (Frag frag4); Hashtbl.replace frag1.neighbours (n,'r') (Frag frag3); Hashtbl.replace frag4.neighbours (n,'l') (Frag frag3); Hashtbl.replace frag3.seq n (String.sub (Hashtbl.find frag1.seq n) n1 l); Hashtbl.replace frag4.seq n (String.sub (Hashtbl.find frag1.seq n) (n1+l) ((String.length tempstr1)-n1-l)); Hashtbl.replace frag1.seq n (String.sub (Hashtbl.find frag1.seq n) 0 n1); Hashtbl.replace frag3.seqlabel n ((Hashtbl.find frag1.seqlabel n) ^ "1"); Hashtbl.replace frag4.seqlabel n ((Hashtbl.find frag4.seqlabel n) ^ "2"); Hashtbl.replace frag1.seqlabel n ((Hashtbl.find frag1.seqlabel n) ^ "0"); )) frag1.seqindex; IntSet.iter (fun n -> (Hashtbl.replace frag3.neighbours (n,'l') (Frag frag2); Hashtbl.replace frag3.neighbours (n,'r') (Frag frag5); Hashtbl.replace frag2.neighbours (n,'r') (Frag frag3); Hashtbl.replace frag5.neighbours (n,'l') (Frag frag3); Hashtbl.replace frag3.seq n (String.sub (Hashtbl.find frag2.seq n) n2 l); (try Hashtbl.replace frag5.seq n (String.sub (Hashtbl.find frag2.seq n) (n2+l) ((String.length tempstr2)-n2-l)) with _ -> print_endline (Hashtbl.find frag2.seq n) ); Hashtbl.replace frag2.seq n (String.sub (Hashtbl.find frag2.seq n) 0 n2); Hashtbl.replace frag3.seqlabel n ((Hashtbl.find frag2.seqlabel n) ^ "1"); Hashtbl.replace frag5.seqlabel n ((Hashtbl.find frag5.seqlabel n) ^ "2"); Hashtbl.replace frag2.seqlabel n ((Hashtbl.find frag2.seqlabel n) ^ "0") )) frag2.seqindex; seqset := SeqSet.add frag3 !seqset; seqset := SeqSet.add frag4 !seqset; seqset := SeqSet.add frag5 !seqset; update_limits frag3; ) ) ) in let fuse_one_frag f = let expanded = expand_block (firstfrag f) [] in if (List.for_all (fun (p,frag1,n1,s1,frag2,n2,s2,l) -> not (inconsistent_frags frag1 frag2)) expanded) then List.iter fuse_one_block expanded in List.iter fuse_one_frag blocklist in let fraglist = SeqSet.elements !seqset in let blocklist =List.sort compare (make_block_list fraglist []) in match blocklist with [] -> [] | _ -> (fuse_block_list blocklist; let (_,_,_,_,_,_,_,maxl) = List.hd blocklist in blockseq seql seqnumset maxl maxw) ;; (*** ***) (*** Create aligned sequence out of the bunch of fragments ***) (*** produced by blockseq ***) (*** ***) let rec assemble_frags_to_seqs seqhead seqlist foutlist = let is_complete frag = (* if frag is multi-sequence, all copies *** are at head of seqlist *) if frag == seqhead then false else if IntSet.cardinal frag.seqindex = 1 then true else let nfragmatches = List.fold_left (fun n fr -> if IntSet.compare frag.seqindex fr.seqindex = 0 then n+1 else n) 0 seqlist in if nfragmatches = IntSet.cardinal frag.seqindex then true else false in let no_other_blocks frag = (* No other complete frag has a smaller *** seqindex in seqlist *) if IntSet.cardinal frag.seqindex = 1 then true else List.fold_left (fun b fr -> if (IntSet.cardinal fr.seqindex > 0) && (IntSet.compare fr.seqindex frag.seqindex = -1) && is_complete fr then false else b) true seqlist in let maxlen fragwidth seqindex = (* For a complete multi-sequence *** frag, it must be aligned when *** tacked on, and can't overlap an *** existing sequence; so find *** "padding" length *) let maxlen_blockseqs = IntSet.fold (fun idx n -> let fout = (List.nth foutlist idx) in if Buffer.length fout.fseq > n then Buffer.length fout.fseq else n) seqindex 0 in let push_up fs m = let m1 = ref m in (while (fs.[!m1] =Char.lowercase fs.[!m1]) do m1 := !m1 + 1 done; while (!m1 < String.length fs) &&(fs.[!m1] <> Char.lowercase fs.[!m1]) do m1 := !m1 + 1 done; !m1) in let rec overlap fs m1 n = if n = fragwidth then false else if (m1+n) >= String.length fs then false else if fs.[m1+n] <> Char.lowercase fs.[m1+n] then true else overlap fs m1 (n+1) in let mx = ref maxlen_blockseqs in ( for n = 0 to (List.length foutlist - 1) do let fs = Buffer.contents (List.nth foutlist n).fseq in (while overlap fs !mx 0 do mx := push_up fs !mx done ) done; !mx ) in let find_neigh neigh1 = match neigh1 with None -> seqhead | Frag a -> a in let rec process_frag_list seql foutl newseqlist n oldmaxlen = match seql with [] -> List.rev newseqlist | frag::seqt -> let fragseq = Hashtbl.find frag.seq n in let fout = List.hd foutl in if (is_complete frag) && (no_other_blocks frag) then ( if IntSet.cardinal frag.seqindex = 1 then (Buffer.add_string fout.fseq fragseq; let nextfrag = find_neigh (Hashtbl.find frag.neighbours (n,'r')) in process_frag_list seqt (List.tl foutl) (nextfrag::newseqlist) (n+1) oldmaxlen ) else ( if oldmaxlen = 0 && (Buffer.length fout.fseq > 0 || n = List.hd (IntSet.elements frag.seqindex)) then ( (*** Find padding value, unless we're just starting ***) (*** If just starting, make sure this is the first ***) (*** in the block ***) let newmaxlen = (maxlen (String.length fragseq) frag.seqindex) in while Buffer.length fout.fseq < newmaxlen do Buffer.add_char fout.fseq '-' done ) else while Buffer.length fout.fseq < oldmaxlen do Buffer.add_char fout.fseq '-' done; Buffer.add_string fout.fseq (String.uppercase fragseq); let nextfrag = find_neigh (Hashtbl.find frag.neighbours (n,'r')) in process_frag_list seqt (List.tl foutl) (nextfrag::newseqlist) (n+1) ((Buffer.length fout.fseq) - (String.length fragseq)) ) ) else process_frag_list seqt (List.tl foutl) (frag::newseqlist) (n+1) oldmaxlen in let newseqlist = process_frag_list seqlist foutlist [] 0 0 in let end_list = List.fold_left (fun b fr -> if fr==seqhead then b else false) true newseqlist in if end_list then foutlist else assemble_frags_to_seqs seqhead newseqlist foutlist ;; (*** ***) (*** ***) (*** ***) let print_aligned foutlist = let print_one_line foutlist n poslist = let print_one_line_seq fout pos = let newpos = ref pos in for m = 0 to 49 do ( ( if m mod 50 = 0 then let head = if String.length fout.fheader < 11 then String.sub fout.fheader 1 (String.length fout.fheader -1) else String.sub fout.fheader 1 10 in ( print_string head; (if (String.length head < 10) then print_string (String.make (10-(String.length head)) ' ')); print_string " " ) else if ((m) mod 10) = 0 then print_string " " ); (let ch = if n+m < Buffer.length fout.fseq then (Buffer.contents fout.fseq).[m+n] else ' ' in (print_char ch; if ch = '-' or ch = ' ' then () else newpos := !newpos+1 ) ); if ((m+1) mod 50) = 0 then ( print_string " "; print_int !newpos; print_string "\n"; )) done; !newpos in List.map2 print_one_line_seq foutlist poslist in let rec print_aligned foutlist n poslist = (print_endline ""; if (List.fold_left (fun b f -> if n < (Buffer.length f.fseq) then true else b) false foutlist) then print_aligned foutlist (n+50) (print_one_line foutlist n poslist) else () ) in let poslist = List.map (fun f -> 0) foutlist in print_aligned foutlist 0 poslist;; (*** ***) (*** ***) (*** ***) let print_fasta foutlist = List.iter (fun f -> print_endline f.fheader; print_endline (Buffer.contents f.fseq); print_string "\n" ) foutlist ;; (*** ***) (*** ***) (*** ***) (*** ***) (*** main program below *) let filenameset = ref [] ;; let outtype = ref "";; let argspeclist = [("-A", Arg.Unit (fun () -> outtype := !outtype^"A"), "aligned output (compare -F option) (default: only this)"); ("-b", Arg.Set_string auxfilename, "name of auxiliary file from which " ^ "to read background sequences (overridden by -B)"); ("-B", Arg.Set_string bgfilename, "name of file containing background probabilities"); ("-F", Arg.Unit (fun () -> outtype := !outtype^"F"), "fasta output (can use both -A and -F in either order)"); ("-n", Arg.Set_int correlbg, "background correlation (default 2=" ^ "dinucleotide;\n 1=single-site basecounts, 0=0.25 per base)"); ("-x", Arg.Set_float prand, "set limit for how probable the match " ^ "is by chance\n (default 0.002, smaller=more stringent)")] in let message = String.concat "" ["Sigma, version 1.0: simple greedy multiple alignment\n"; "Copyright (C) 2006 Rahul Siddharthan \n"; "Licensed under the GNU General Public License, version 2\n\n"; "Usage: sigma [options] inputfile.fasta [inputfile2.fasta ...]\n\n"; "Each fasta file may contain a single sequence or multiple sequences;\n"; "all sequences will be aligned together.\n\n"; "Options:"] in Arg.parse argspeclist (fun s -> filenameset := !filenameset @ [s]) message;; if !outtype = "" then outtype := "A";; if !filenameset = [] then (Printf.fprintf stderr "Must specify name of file(s) in fasta format (or use -help)\n"; exit 1);; if !correlbg > 2 then (print_endline "-n > 2 not implemented; using 2"; correlbg := 2) else if !correlbg < 0 then (print_endline "-n should be >= 0; assuming 0"; correlbg := 0) ;; let gotbgprobs = ref (!correlbg = 0);; if !bgfilename > "" then try ( getbgprobs_from_bgfile !bgfilename; normalise_bgprobs (); gotbgprobs := true ) with _ -> Printf.fprintf stderr "Couldn't read bgprobs from file %s\n" !bgfilename ;; if (not !gotbgprobs) then ( let auxseqlist = match !auxfilename with "" -> [] | _ -> try readfasta !auxfilename with _ -> (Printf.fprintf stderr "Couldn't open file %s : using input sequences\n" !auxfilename; []) in if auxseqlist != [] then ( getbgprobs auxseqlist; normalise_bgprobs () ; gotbgprobs := true ) ) ;; let seqlist = filename2seqlist !filenameset !gotbgprobs ;; match List.length seqlist with 1 -> (Printf.fprintf stderr "Error: no sequences read\n"; exit 1) | 2 -> (Printf.fprintf stderr "Only one sequence read, no alignment needed\n"; exit 0) | _ -> () ;; let nel=(List.length seqlist) - 2 in for n = 0 to nel do let h = List.nth seqlist (n+1) in ( h.seqindex <- IntSet.add n IntSet.empty ; Hashtbl.replace h.neighbours (n,'l') (Frag (List.hd seqlist)); Hashtbl.replace h.neighbours (n,'r') (Frag (List.hd seqlist)); Hashtbl.replace h.seqlabel n "0."; Hashtbl.replace (List.hd seqlist).seqlabel n "-1"; Hashtbl.replace h.seq n (String.copy h.aseq); (* h.abgw <- getwts h.aseq; *) (* h.abgw <- Array.create (String.length h.aseq) 0.0; *) Hashtbl.replace (List.hd seqlist).seq n ""; ) done;; seqset := makefastaset seqlist; let seqnumset = let rec range n r = if n<0 then r else range (n-1) (IntSet.add n r) in range (List.length seqlist - 2) IntSet.empty in blockseq seqlist seqnumset 10000 0.;; let fout = List.map (fun fr -> {fheader=fr.header; fseq = Buffer.create 1000} ) (List.tl seqlist) ;; assemble_frags_to_seqs (List.hd seqlist) (List.tl seqlist) fout ;; String.iter (fun c -> if c='A' then print_aligned fout else print_fasta fout) !outtype;;