File: generate_read.py

package info (click to toggle)
mindthegap 2.3.0-5
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 4,076 kB
  • sloc: cpp: 4,482; python: 917; sh: 419; makefile: 5
file content (96 lines) | stat: -rwxr-xr-x 2,778 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
Program to generate reads from a fasta file (for tests). P. Marijon based one
get_subsequence.py write by C. Lemaitre.
'''

import getopt, sys
import random
        

def usage():
    '''Usage'''
    print("-----------------------------------------------------------------------------")
    print(sys.argv[0]," : sub-sequence")
    print("-----------------------------------------------------------------------------")
    print("usage: ",sys.argv[0]," -f fasta_file -n numbre -l length")
    print("  -f: input fasta file")
    print("  -n: numbre of read you want(def : 1)")
    print("  -l: length of sub-sequence (def : 1)")
    print("  -h: help")
    print("-----------------------------------------------------------------------------")
    sys.exit(2)


def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hf:n:l:", ["help", "fasta=", "num=", "len="])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    # Default parameters
    read_len=1
    fasta_file=0
    num_loop=1
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-f", "--fasta"):
            fasta_file = arg
        elif opt in ("-l", "--len"):
            read_len = int(arg)
        elif opt in ("-n", "--num"):
            num_loop = int(arg)
        else:
            assert False, "unhandled option"

    if fasta_file == 0 :
        print("Missing arguments")
        usage()
        return 2
    
    else:
        header = "base_header"
        sequence = ""
        filin = open(fasta_file,"r")

        for line in filin:
            if line[0] ==">":
                # header
                header = (line.lstrip(">")).rstrip("\n").rstrip(" ")
            else:
                sequence+=(line.rstrip("\n")).upper()

        filin.close()

        sequence_len = len(sequence)
        
        if sequence_len == 0 :
            print("warning we didn't find fasta sequence in file.")
            return 1

        if sequence_len < read_len :
            print("warning read length is upper than sequence length we can't generate read.")
            return 1
        
        for i in range(num_loop) :
            pos = sequence_len
            while pos + read_len > sequence_len :
                pos = random.randint(0, sequence_len)

            print(">"+header+"_read"+str(i)+"_pos_"+str(pos)+":"+str(pos+read_len))
            print(sequence[pos:pos+read_len])

    return 0
            
if __name__ == "__main__":
    exit(main())
    
# exemple :
# ./get_subsequence -f random_1Mb.fa -b 44444