1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
|
"""
A Martel grammar to read information from a clustal formatted file (*.aln).
This uses Andrew Dalke's Martel to do the parsing dirty work for me.
So all we need to do here is set up a big ol' regular expression to
let Martel know what the file looks like.
"""
# standard library
import sys
import Martel
# define everything we will parse as a ton of regular expressions with
# specific callbacks
# CLUSTAL version number: one digit, a literal dot, then one or two digits
# (for example "1.8" or "1.81").
# The pattern is a raw string so the regex escapes reach Martel untouched
# instead of relying on Python passing unknown string escapes through, and
# the dot is escaped so it matches only "." rather than any character.
version = Martel.Group("version",
                       Martel.Re(r"\d\.\d\d?"))
# Header: the literal text "CLUSTAL ", then whatever the ".+" pattern
# matches after it, then up to three end-of-line markers (MaxRepeat with
# bounds 0 and 3).
header = Martel.Group(
    "header",
    (
        Martel.Str("CLUSTAL ")
        + Martel.Re(".+")
        + Martel.MaxRepeat(Martel.AnyEol(), 0, 3)
    ),
)
# Sequence identifier: one or more letters, digits, or the punctuation
# characters listed in the character class below.
# Written as a raw triple-quoted string so the regex escapes are passed to
# Martel verbatim (no reliance on Python tolerating unknown string escapes)
# and the double quote can appear unescaped; the pattern text itself is
# exactly the same as before.
seq_id = Martel.Group("seq_id",
                      Martel.Re(r"""[-a-zA-Z:;^_'",\+\#\|\[\]\(\)\/\.\d\?]+"""))
# run of one or more blanks separating the id from the sequence
seq_space = Martel.Group("seq_space", Martel.Re("[ ]+"))
# the sequence text on a line: one or more letters, "-" or "."
seq_info = Martel.Group("seq_info", Martel.Re("[-a-zA-Z.]+"))
# you can output an optional number to tell you where you are in the sequence
# we need to swallow this up if it is here
# Matches one or more blanks followed by one or more digits.  The digit
# pattern is a raw string so "\d" is handed to Martel as a regex escape
# rather than being treated as an (invalid) Python string escape.
seq_num = Martel.Group("seq_num",
                       Martel.Re("[ ]+") +
                       Martel.Re(r"[\d]+"))
# One full alignment line: id, blanks, sequence text, an optional trailing
# position number, and the line terminator.
# The terminator is Martel.AnyEol() rather than a hard-coded "\n" so that
# this rule accepts the same line endings as the header, match_stars and
# new_block rules, which already use AnyEol; a hard-coded "\n" fails on the
# first sequence line of a file that uses a different line ending.
seq_line = Martel.Group("seq_line",
                        seq_id + seq_space + seq_info +
                        Martel.Opt(seq_num) +
                        Martel.AnyEol())
# Match line under a block of sequences: one or more of space, ":", "."
# and "*", optionally followed by an end-of-line marker.
# The pattern is a raw string so the "\." and "\*" regex escapes are passed
# to Martel verbatim instead of being invalid Python string escapes.
match_stars = Martel.Group("match_stars",
                           Martel.Re(r"[ :\.\*]+") +
                           Martel.Opt(Martel.AnyEol()))
# blocks are separated by a bare end-of-line marker
new_block = Martel.Group("new_block", Martel.AnyEol())
# One alignment block: one or more sequence lines, an optional match line,
# then any number (possibly zero) of block separators.
block_info = Martel.Group(
    "block_info",
    (
        Martel.Rep1(seq_line)
        + Martel.Opt(match_stars)
        + Martel.Rep(new_block)
    ),
)
# Top-level expression to import for parsing clustal files: a single
# header followed by one or more alignment blocks.
# NOTE: the name shadows the builtin format(); it is kept because it is the
# name this module exposes.
format = Martel.Group(
    "clustalx",
    header + Martel.Rep1(block_info),
)
|