1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
|
#!/usr/bin/python3
################################################################
# copyright (c) 2014,2015 by William R. Pearson and The Rector &
# Visitors of the University of Virginia
################################################################
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under this License is distributed on an "AS
# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language
# governing permissions and limitations under the License.
################################################################
################################################################
# clustal2fasta.pl
################################################################
# clustal2fasta.pl takes a standard clustal format alignment file
# and produces the corresponding FASTA file.
#
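# NOTE: the options described below belong to the perl version; this
# python re-write only defines --query and does not implement them.
# if --end_mask or --int_mask are set, then end or internal '-'s are converted to the query (first) sequence
# if --trim is set, then alignments beyond the beginning/end of the query sequence are trimmed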
#
################################################################
import argparse
import fileinput
import re
################
#
# python re-write of clustal2fasta.pl
#
# in the future, modify for various query seeding strategies
################
# consensus/conservation rows contain nothing but these marker characters
_CONSENSUS_RE = re.compile(r'^[\s:\*\+\.]+$')


def build_arg_parser():
    """Return the command-line parser for the clustal-to-FASTA converter."""
    parser = argparse.ArgumentParser(description='Convert clustal MSA to FASTA library')
    # argparse wants every alias as its own option string; the perl-style
    # 'a|b' spelling would register one option literally named
    # '--query|--query_file'.
    parser.add_argument('--query', '--query_file', dest='query_file', action='store',
                        help='query sequence file')
    parser.add_argument('files', metavar='FILE', nargs='*',
                        help='files to read, if empty, stdin is used')
    return parser


def read_clustal(lines):
    """Collect the aligned sequences from the body lines of a clustal alignment.

    `lines` is an iterable of text lines with the banner line already removed.
    Blank lines and consensus rows (only whitespace and ':', '*', '+', '.')
    are ignored.  Every other line must hold a sequence id followed by an
    alignment segment; anything after the segment (e.g. the optional
    cumulative residue count) is ignored.

    Returns a list of (seq_id, aligned_sequence) tuples in order of first
    appearance, with the segments of each id concatenated.

    Raises ValueError for a non-blank, non-consensus line that has no
    alignment segment.
    """
    order = []
    segments = {}
    for raw in lines:
        line = raw.strip()
        if not line or _CONSENSUS_RE.search(line):
            continue
        fields = line.split()
        if len(fields) < 2:
            raise ValueError("malformed clustal alignment line: %r" % line)
        seq_id, align = fields[0], fields[1]
        if seq_id not in segments:
            segments[seq_id] = []
            order.append(seq_id)
        segments[seq_id].append(align)
    return [(seq_id, ''.join(segments[seq_id])) for seq_id in order]


def format_fasta(seq_id, sequence, width=60):
    """Return one FASTA record as text, without a trailing newline.

    The record is a '>seq_id' header line followed by `sequence` wrapped at
    `width` columns.  Slicing is used rather than a '.{0,60}' substitution,
    because that pattern also matches the empty string at the end of the
    sequence and leaves blank lines after every record.
    """
    rows = [sequence[start:start + width] for start in range(0, len(sequence), width)]
    return '\n'.join(['>%s' % seq_id] + rows)


def main(argv=None):
    """Convert the clustal alignment(s) named in `argv` (default: sys.argv) to FASTA on stdout."""
    args = build_arg_parser().parse_args(argv)
    with fileinput.input(args.files) as stream:
        # the first line of every input file is the "CLUSTAL ..." banner
        body = (line for line in stream if not stream.isfirstline())
        records = read_clustal(body)
    for seq_id, sequence in records:
        print(format_fasta(seq_id, sequence))


if __name__ == '__main__':
    main()
|