File: dna2vec.jl

package info (click to toggle)
vg 1.30.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 267,848 kB
  • sloc: cpp: 446,974; ansic: 116,148; python: 22,805; cs: 17,888; javascript: 11,031; sh: 5,866; makefile: 4,039; java: 1,415; perl: 1,303; xml: 442; lisp: 242
file content (84 lines) | stat: -rwxr-xr-x 2,363 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env julia

using ArgParse
using Word2Vec

# Command-line interface. The script builds a model when --input is given;
# otherwise it loads the model named by --model and, if --query is given, prints
# a vector for the query.
s = ArgParseSettings()
@add_arg_table s begin
    # Path of the model file: written when -i is given, read otherwise.
    # NOTE(review): `required = true` together with `default = ""` looks
    # redundant -- confirm how ArgParse treats the combination.
    "--model", "-m"
    arg_type = String
    help = "use this model file (write given -i, query given -q)"
    required = true
    default = ""
    # Text file whose lines are split into k-mers; the empty default means
    # "do not build a model".
    "--input", "-i"
    arg_type = String
    help = "input text corpus to model"
    default = ""
    # k-mer extraction options, used for both the input corpus and the query.
    "--kmer-size", "-k"
    arg_type = Int
    help = "kmer size to break input DNA sequences into"
    default = 5
    "--kmer-stride", "-s"
    arg_type = Int
    help = "bases between each successive kmer"
    default = 1
    # Training options, forwarded to word2vec as `window`, `cbow` and `size`.
    "--window-size", "-w"
    arg_type = Int
    help = "window size to use during word2vec"
    default = 30
    "--cbow", "-c"
    help = "use continuous bag of words in word2vec"
    action = :store_true
    "--dims", "-d"
    arg_type = Int
    help = "number of dimensions in embedding"
    default = 100
    # Sequence to look up in an existing model; the empty default means
    # "no query".
    "--query", "-q"
    arg_type = String
    help = "print the vector for this string"
    default = ""
end

# Parse the command line once, then unpack every option into a script-level
# variable, grouped by what the option controls.
parsed_args = parse_args(ARGS, s)

# File paths and the optional query string ("" means "not given").
model_file = parsed_args["model"]
input = parsed_args["input"]
query = parsed_args["query"]

# k-mer extraction parameters.
kmer_size = parsed_args["kmer-size"]
kmer_stride = parsed_args["kmer-stride"]

# word2vec training parameters.
model_size = parsed_args["dims"]
window_size = parsed_args["window-size"]
cbow = parsed_args["cbow"]

"""
    kmers_of(k, j, s)

Return the substrings of `s` that are `k` characters long and start at positions
`1`, `1 + j`, `1 + 2j`, ..., keeping only the k-mers that fit entirely inside `s`.

# Arguments
- `k`: k-mer length; must be positive.
- `j`: stride between the start positions of consecutive k-mers; must be positive.
- `s`: sequence to split. Slicing uses string indices, so `s` is assumed to be
  single-byte (ASCII) text such as DNA -- TODO confirm for other inputs.

# Returns
A vector of k-mers in order of position; empty when `s` is shorter than `k`.

# Throws
- `ArgumentError` if `k` or `j` is not positive.
"""
function kmers_of(k::Int, j::Int, s::String)
    k > 0 || throw(ArgumentError("kmer size must be positive, got $k"))
    j > 0 || throw(ArgumentError("kmer stride must be positive, got $j"))
    # A start:step:stop range does not depend on the positional-argument order
    # of `range`, and it is simply empty when `s` is shorter than `k`.
    last_start = length(s) - k + 1
    return [s[i:(i + k - 1)] for i in 1:j:last_start]
end

"""
    kmers_of_file(k, j, f)

Collect the k-mers of every line of the file at path `f` into one vector.

Each line is split independently with `kmers_of(k, j, line)`, so no k-mer spans
a line break. The per-line results are concatenated in file order.

# Arguments
- `k`: k-mer length, passed through to `kmers_of`.
- `j`: stride between k-mer start positions, passed through to `kmers_of`.
- `f`: path of the text file to read.

# Returns
A `Vector{String}` holding all k-mers of the file.
"""
function kmers_of_file(k::Int, j::Int, f::String)
    kmers = String[]
    # The do-block form closes the file even if splitting a line throws.
    open(f) do io
        for line in eachline(io)
            append!(kmers, kmers_of(k, j, line))
        end
    end
    return kmers
end

"""
    write_kmers_of(k, j, input, output)

Write the k-mers of the file `input` to the file `output` as a single
space-separated line, replacing any existing content of `output`.

# Arguments
- `k`: k-mer length, passed through to `kmers_of_file`.
- `j`: stride between k-mer start positions, passed through to `kmers_of_file`.
- `input`: path of the text file to split into k-mers.
- `output`: path of the file to (over)write.

# Returns
The value returned by the underlying `write` call.
"""
function write_kmers_of(k::Int, j::Int, input::String, output::String)
    kmers = kmers_of_file(k, j, input)
    # Close (and therefore flush) the file before returning, so that whatever
    # reads `output` next sees the complete text.
    return open(output, "w") do io
        write(io, join(kmers, " "))
    end
end

# Two modes, selected by the options given:
#   * --input present: build a word2vec model of the input's k-mers and save it
#     to the --model path;
#   * otherwise: load the --model file and, if --query is given, print the mean
#     embedding of the query's k-mers.
if input != ""
    if model_file == ""
        println("an output file is required when building a model")
        exit(1)
    end
    # word2vec is handed a file path, so the k-mers go through a scratch file
    # written next to the input.
    kmers_file = "$input.kmers"
    try
        write_kmers_of(kmer_size, kmer_stride, input, kmers_file)
        word2vec(kmers_file, model_file, verbose = true, size = model_size,
                 window = window_size, cbow = (cbow ? 1 : 0))
    finally
        # Remove the scratch file even when writing or training fails.
        isfile(kmers_file) && rm(kmers_file)
    end
elseif model_file != ""
    model = wordvectors(model_file)
    if query != ""
        query_kmers = kmers_of(kmer_size, kmer_stride, query)
        if isempty(query_kmers)
            println("query is shorter than the kmer size ($kmer_size)")
            exit(1)
        end
        known_kmers = filter(kmer -> haskey(model.vocab_hash, kmer), query_kmers)
        if isempty(known_kmers)
            println("none of the query kmers are in the model vocabulary")
            exit(1)
        end
        # K-mers missing from the vocabulary count as zero vectors: they add
        # nothing to the sum but still count toward the denominator.
        vector_sum = reduce(+, [get_vector(model, kmer) for kmer in known_kmers])
        mean_vec = vector_sum / length(query_kmers)
        println("$query ", mean_vec)
    end
end