1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
from __future__ import print_function
#import sys
import os
import os.path as op
import traceback
#from os import listdir
#from os.path import isfile, join
import re
import logging
from seqcluster.libs.classes import sequence_unique
from seqcluster.libs.classes import quality
from seqcluster.libs.fastq import is_fastq, open_fastq
logger = logging.getLogger('prepare')
def prepare(args):
"""
Read all seq.fa files and create a matrix and unique fasta files.
The information is
:param args: options parsed from command line
:param con: logging messages going to console
:param log: logging messages going to console and file
:returns: files - matrix and fasta files that should be used with
and aligner (as bowtie) and run `seqcluster cluster`
"""
try:
f = open(args.config, 'r')
seq_out = open(op.join(args.out, "seqs.fastq"), 'w')
ma_out = open(op.join(args.out, "seqs.ma"), 'w')
except IOError as e:
traceback.print_exc()
raise IOError("Can not create output files: %s, %s or read %s" % (op.join(args.out, "seqs.ma"), op.join(args.out, "seqs.fastq"), args.config))
logger.info("Reading sequences")
seq_l, sample_l = _read_fastq_files(f, args)
logger.info("Creating matrix with unique sequences")
logger.info("Filtering: min counts %s, min size %s, max size %s, min shared %s" % (args.minc, args.minl, args.maxl, args.min_shared))
_create_matrix_uniq_seq(sample_l, seq_l, ma_out, seq_out, args.min_shared)
logger.info("Finish preprocessing. Get a sorted BAM file of seqs.fa and run seqcluster cluster.")
def _read_fasta_files(f, args):
""" read fasta files of each sample and generate a seq_obj
with the information of each unique sequence in each sample
:param f: file containing the path for each fasta file and
the name of the sample. Two column format with `tab` as field
separator
:returns: * :code:`seq_l`: is a list of seq_obj objects, containing
the information of each sequence
* :code:`sample_l`: is a list with the name of the samples
(column two of the config file)
"""
seq_l = {}
sample_l = []
idx = 1
for line1 in f:
line1 = line1.strip()
cols = line1.split("\t")
with open(cols[0], 'r') as fasta:
sample_l.append(cols[1])
for line in fasta:
if line.startswith(">"):
idx += 1
counts = int(re.search("x([0-9]+)", line.strip()).group(1))
else:
seq = line.strip()
seq = seq[0:int(args.maxl)] if len(seq) > int(args.maxl) else seq
if counts > int(args.minc) and len(seq) > int(args.minl):
if seq not in seq_l:
seq_l[seq] = sequence_unique(idx, seq)
seq_l[seq].add_exp(cols[1], counts)
return seq_l, sample_l
def _read_fastq_files(f, args):
""" read fasta files of each sample and generate a seq_obj
with the information of each unique sequence in each sample
:param f: file containing the path for each fasta file and
the name of the sample. Two column format with `tab` as field
separator
:returns: * :code:`seq_l`: is a list of seq_obj objects, containing
the information of each sequence
* :code:`sample_l`: is a list with the name of the samples
(column two of the config file)
"""
seq_l = {}
sample_l = []
idx = 1
p = re.compile("^[ATCGNU]+$")
with open(op.join(args.out, "stats_prepare.tsv"), 'w') as out_handle:
for line1 in f:
line1 = line1.strip()
cols = line1.split("\t")
# if not is_fastq(cols[0]):
# raise ValueError("file is not fastq: %s" % cols[0])
with open_fastq(cols[0]) as handle:
sample_l.append(cols[1])
total = added = 0
line = handle.readline()
while line:
if line.startswith("@") or line.startswith(">"):
seq = handle.readline().strip()
if not p.match(seq):
continue
idx += 1
total += 1
keep = {}
counts = int(re.search("x([0-9]+)", line.strip()).group(1))
if is_fastq(cols[0]):
handle.readline().strip()
qual = handle.readline().strip()
else:
qual = "I" * len(seq)
qual = qual[0:int(args.maxl)] if len(qual) > int(args.maxl) else qual
seq = seq[0:int(args.maxl)] if len(seq) > int(args.maxl) else seq
if counts > int(args.minc) and len(seq) > int(args.minl):
added += 1
if seq in keep:
keep[seq].update(qual)
else:
keep[seq] = quality(qual)
if seq not in seq_l:
seq_l[seq] = sequence_unique(idx, seq)
seq_l[seq].add_exp(cols[1], counts)
seq_l[seq].quality = keep[seq].get()
line=handle.readline()
print("total\t%s\t%s" % (idx, cols[1]), file=out_handle, end="")
print("added\t%s\t%s" % (len(seq_l), cols[1]), file=out_handle, end="")
logger.info("%s: Total read %s ; Total added %s" % (cols[1], idx, len(seq_l)))
return seq_l, sample_l
def _create_matrix_uniq_seq(sample_l, seq_l, maout, out, min_shared):
""" create matrix counts for each different sequence in all the fasta files
:param sample_l: :code:`list_s` is the output of :code:`_read_fasta_files`
:param seq_l: :code:`seq_s` is the output of :code:`_read_fasta_files`
:param maout: is a file handler to write the matrix count information
:param out: is a file handle to write the fasta file with unique sequences
:returns: Null
"""
skip = 0
if int(min_shared) > len(sample_l):
min_shared = len(sample_l)
maout.write("id\tseq")
for g in sample_l:
maout.write("\t%s" % g)
for s in seq_l.keys():
seen = sum([1 for g in seq_l[s].group if seq_l[s].group[g] > 0])
if seen < int(min_shared):
skip += 1
continue
maout.write("\nseq_%s\t%s" % (seq_l[s].idx, seq_l[s].seq))
for g in sample_l:
if g in seq_l[s].group:
maout.write("\t%s" % seq_l[s].group[g])
else:
maout.write("\t0")
qual = "".join(seq_l[s].quality)
out.write("@seq_%s\n%s\n+\n%s\n" % (seq_l[s].idx, seq_l[s].seq, qual))
out.close()
maout.close()
logger.info("Total skipped due to --min-shared parameter (%s) : %s" % (min_shared, skip))
|