File: import_ont_model.py

package info (click to toggle)
nanopolish 0.14.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 11,760 kB
  • sloc: cpp: 22,200; ansic: 1,478; python: 814; makefile: 210; sh: 43; perl: 17
file content (88 lines) | stat: -rw-r--r-- 2,597 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#! /usr/bin/env python3

# This script takes a .model file provided by ONT and adds metadata that allows it
# to be compiled into nanopolish
import argparse
import sys
import os
from operator import itemgetter

def write_header(fh, key, value):
    """Write one metadata header line to *fh*.

    The line has the form ``#<key><TAB><value>`` followed by a newline.
    Both *key* and *value* must already be strings.
    """
    header_line = "".join(("#", key, "\t", value, "\n"))
    fh.write(header_line)

# Command-line interface: the input model path is mandatory, the output
# directory and the alphabet name are optional.
parser = argparse.ArgumentParser(
    description='Convert ONT model file into nanopolish format'
)
for short_flag, long_flag, mandatory in (
    ('-i', '--input', True),
    ('-o', '--output-dir', False),
    ('-a', '--alphabet', False),
):
    parser.add_argument(short_flag, long_flag, type=str, required=mandatory)
args = parser.parse_args()

# Handle on the ONT model file; it is read further down in the script
f = open(args.input)

# The metadata is encoded in the name of the directory that directly
# contains the input file (the "type dir"), as underscore-separated fields
input_dir, filename = os.path.split(args.input)
type_dir = os.path.split(input_dir)[1]
metadata_fields = type_dir.split("_")

# Exactly five fields are expected; anything else cannot be interpreted
if len(metadata_fields) != 5:
    sys.stderr.write("Error, could not parse type dir\n")
    sys.exit(1)

# Field 0 is the pore, field 2 the speed and field 3 the k-mer size
# (with its "mer" text removed)
pore, speed = metadata_fields[0], metadata_fields[2]
K = metadata_fields[3].replace("mer", "")

# RNA models are recognised by "RNA" appearing anywhere in the type dir name
is_rna = "RNA" in type_dir

# Kit name used by nanopolish: <pore>_<speed>
new_kit_name = "_".join((pore, speed))

# Alphabet name used in the output file name. argparse leaves args.alphabet
# as None when -a/--alphabet is not given, so fall back to "nucleotide" for
# both None and the empty string (otherwise the file name would contain
# the text "None").
alphabet = args.alphabet if args.alphabet else "nucleotide"

# Derive the strand label from the input file name. A "template" file maps to
# "template"; a "complement" file must additionally name its population
# ("pop1" or "pop2"). Unrecognised names are rejected explicitly rather than
# with assert, since asserts are stripped when Python runs with -O.
if "template" in filename:
    strand = "template"
elif "complement" in filename and "pop1" in filename:
    strand = "complement.pop1"
elif "complement" in filename and "pop2" in filename:
    strand = "complement.pop2"
else:
    sys.stderr.write("Error, could not determine strand from file name: " + filename + "\n")
    sys.exit(1)


# Output path: [<output dir>/]<kit>.<alphabet>.<K>mer.<strand>.model
# os.path.join is used instead of appending "/" by hand so that an empty
# --output-dir does not resolve to the filesystem root and a trailing slash
# on the directory is not doubled. Without --output-dir the file is written
# relative to the current working directory.
out_name = "%s.%s.%smer.%s.model" % (new_kit_name, alphabet, K, strand)
if args.output_dir:
    out_name = os.path.join(args.output_dir, out_name)

# Load the ONT model completely before creating the output, so that a
# malformed input does not leave a partially written model file behind.
# The with-statement closes the input handle opened earlier in the script.
with f:
    # The first line of the input is copied to the output unchanged
    header = f.readline()

    # Read k-mer states into list (one list of whitespace-separated fields
    # per input line)
    states = []
    for line in f:
        fields = line.split()

        # Tolerate blank lines (e.g. trailing newlines at the end of the file);
        # they carry no state and would otherwise break the sort below
        if not fields:
            continue

        # ONT files shouldn't have header tags. Checked explicitly instead of
        # with assert, which is stripped when Python runs with -O.
        if line.startswith("#"):
            sys.stderr.write("Error, unexpected header tag in ONT model: " + line.rstrip() + "\n")
            sys.exit(1)

        # The ONT RNA model is in the sequencing direction, which is 3'->5'
        # Nanopolish's internal convention is to do everything 5'->3' so we reverse
        # the direction of each state here
        if is_rna:
            fields[0] = fields[0][::-1]

        states.append(fields)

# Write the nanopolish metadata tags, then the original first line, then the
# states ordered by their first field (the k-mer)
with open(out_name, "w") as out_file:
    write_header(out_file, "ont_model_name", type_dir)
    write_header(out_file, "kit", new_kit_name)
    write_header(out_file, "strand", strand)
    write_header(out_file, "k", K)
    # The alphabet tag is only emitted when one was given on the command line
    if args.alphabet:
        write_header(out_file, "alphabet", args.alphabet)
    write_header(out_file, "original_file", type_dir + "/" + filename)

    out_file.write(header)
    out_file.writelines(
        "\t".join(record) + "\n"
        for record in sorted(states, key=itemgetter(0))
    )

# Report the path of the generated model on stdout
sys.stdout.write(out_name + "\n")