File: splitseq_da

package info (click to toggle)
staden 2.0.0%2Bb11-7
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 21,584 kB
  • sloc: ansic: 240,605; tcl: 65,360; cpp: 12,854; makefile: 11,203; sh: 3,023; fortran: 2,033; perl: 63; awk: 46
file content (106 lines) | stat: -rwxr-xr-x 2,473 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/sh
#\
exec stash $0 ${@+"$@"}

#
# Splits a plain text or fasta file into a series of Experiment Files
# containing AP lines. These can then be put into directed assembly.
# The purpose is to allow assembly of very large sequences in an easy manner.
# NB. This is pretty hacky - with minimal error checking.
#


set overlap 1000
set length 25000

#############################################################################

proc get_seq {fd seq_name_p} {
    upvar $seq_name_p seq_name
    global next_seq

    if {[info exists next_seq]} {
	regexp "^>(.*)" $next_seq tmp seq_name
	set seq ""
    } else {
        gets $fd line
        if {[string match ">*" $line]} {
            # Fasta format - rename sequence
            regexp "^>(.*)" $line tmp seq_name
    	    set seq ""
        } else {
    	    regsub -all {[^acgtnACGTN-]} $line {} seq
        }
    }

    while {[gets $fd line] != -1} {
	if {[string match ">*" $line]} {
	    # New fasta entry
	    set next_seq $line
	    return $seq
	} else {
	    regsub -all {[^acgtnACGTN-]} $line {} line
	    append seq $line
	}
    }

    return $seq
}


# Parse args
if {[llength $argv] != 1} {
    puts "Usage: splitseq_da filename"
    exit
}

# Read and reformat file. Supports plain text and fasta formats
set fd [open $argv r]
set seq_name $argv
while {[set seq [get_seq $fd seq_name]] != ""} {
    set num 0
    set last ""
    set l [string length $seq]
    set inc [expr $length-$overlap]
    set fdfofn [open $seq_name.fofn w]
    for {set i 0} {$i+$overlap < $l} {incr i $inc} {
	set subseq [string range $seq $i [expr $i+$length-1]]
	if {$last != ""} {
	    set AP "$last + $inc -1"
	} else {
	    set AP "*new* +"
	}
       	puts "Creating file $seq_name.[format %04d $num]"
	puts $fdfofn $seq_name.[format %04d $num]
	set fdout [open $seq_name.[format %04d $num] w]
	puts $fdout "ID   $seq_name.[format %04d $num]"
	puts $fdout "SQ"
	set len [string length $subseq]
	set count 0
	for {set j 0} {$j < $len} {incr j 10} {
	    if {$count == 0} {
		puts -nonewline $fdout "     "
	    } else {
		puts -nonewline $fdout " "
	    }
	    puts -nonewline $fdout [string range $subseq $j [expr $j+9]]
	    if {[incr count] == 6} {
	        set count 0
		puts $fdout ""
	    }
	}
	if {$count != 0} {
	    puts $fdout ""
	}
	puts $fdout "//"
       	puts $fdout "AP   $AP"
	set last $seq_name.[format %04d $num]
	incr num
    }
    close $fdfofn
    puts "Creating file $seq_name.fofn"

}
close $fd

exit