1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
|
# Read a FASTA description
import operator
from Martel import *
from Bio import Std
### Parse dbxrefs given the NCBI|descr|line as explained
### in ftp://ncbi.nlm.nih.gov/blast/db/README and augmented
### by experience
def make_2id(s, dbname, primary_name, secondary_name):
assert secondary_name is not None
if primary_name is None:
return Str(s + "||") + \
Std.dbxref_dbid(UntilSep(sep = "| "), {"dbname": dbname,
"type": secondary_name})
return Str(s + "|") + \
Std.dbxref_dbid(UntilSep(sep = "|"), {"dbname": dbname,
"type": primary_name}) + \
Str("|") + \
Std.dbxref_dbid(UntilSep(sep = "| "), {"dbname": dbname,
"type": secondary_name})
def make_1id(s, dbname, name):
return Str(s + "|") + \
Std.dbxref_dbid(UntilSep(sep = "| "), {"dbname": dbname,
"type": name})
ids = []
# gene identifier gi|id # This isn't in the README
ids.append(make_1id("gi", "x-gi", "primary"))
# GenBank gb|accession|locus
# gb|U37104|APU37104
ids.append(make_2id("gb", "gb", "primary", "secondary"))
# EMBL Data Library emb|accession|locus
# emb|F19596|HSPD04201
ids.append(make_2id("emb", "embl", "primary", "secondary"))
# DDBJ, DNA Database of Japan dbj|accession|locus
ids.append(make_2id("dbj", "ddbj", "primary", "secondary"))
# NBRF PIR pir||entry
ids.append(make_2id("pir", "pir", None, "primary"))
# Protein Research Foundation prf||name
ids.append(make_2id("prf", "x-prf", None, "primary"))
# SWISS-PROT sp|accession|entry name
ids.append(make_2id("sp", "sp", "primary", "secondary"))
# Brookhaven Protein Data Bank pdb|entry|chain
ids.append(make_2id("pdb", "x-pdb", "primary", "secondary")) # XXX not correct
# Patents pat|country|number
ids.append(make_2id("pat", "x-pat", "primary", "secondary")) # XXX not correct
# GenInfo Backbone Id bbs|number
ids.append(make_1id("bbs", "x-bbs", "primary"))
# General database identifier gnl|database|identifier
gnl_id = Str("gnl|") + \
Std.dbxref_dbname(UntilSep(sep = "| ")) + \
Str("|") + \
Std.dbxref_dbid(UntilSep(sep = "| "))
ids.append(gnl_id)
# NCBI Reference Sequence ref|accession|locus
ids.append(make_2id("ref", "x-ref", "primary", "secondary"))
# Local Sequence identifier lcl|identifier
ids.append(make_1id("lcl", "local", "primary"))
# "|" them all together
ncbi_word = Std.dbxref(reduce(operator.or_, ids))
#ncbi_term = Assert(Re("[^ \R]+\|")) + \
ncbi_term = ncbi_word + Rep(Str("|") + ncbi_word)
# Anything else
generic_term = Std.dbxref(
Std.dbxref_dbid(UntilSep(sep = " "), {"dbname": "local"})
)
id_term = ncbi_term | generic_term
###########################################################
comment_lines = Rep(Str("#") + ToEol())
title = Str(">") + Std.description_line(id_term + UntilEol()) + AnyEol()
seqline = AssertNot(Str(">")) + Std.sequence(UntilEol()) + AnyEol()
# can get a sequence line without an Eol at the end of a file
seqline_nonewline = AssertNot(Str(">")) + Std.sequence(Word())
sequence = Std.sequence_block(Rep(seqline | seqline_nonewline))
record = Std.record(comment_lines + title + sequence + Rep(AnyEol()))
# define a format which reads records, but allows #-style comments in
# the FASTA file
format = HeaderFooter("dataset", {"format": "fasta"},
comment_lines, RecordReader.Until, (">",),
record, RecordReader.StartsWith, (">",),
comment_lines, RecordReader.Everything, ()
)
|