File: split_qiime.py

package info (click to toggle)
pplacer 1.1~alpha19-8
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 17,324 kB
  • sloc: ml: 20,927; ansic: 9,002; python: 1,641; makefile: 171; sh: 77; xml: 50
file content (47 lines) | stat: -rwxr-xr-x 1,485 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python
import argparse
import logging
import csv
import sys
import re

from Bio import SeqIO

log = logging.getLogger(__name__)

description_regexp = re.compile(r'([^\s_]+)_\d+\s+(\S+)')

def main():
    logging.basicConfig(
        level=logging.INFO, format="%(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        description="Extract the original sequence names from a QIIME FASTA file.")
    parser.add_argument(
        'qiime', type=argparse.FileType('r'), nargs='?', default=sys.stdin,
        help="input QIIME file (default: stdin)")
    parser.add_argument(
        'fasta', type=argparse.FileType('w'), nargs='?', default=sys.stdout,
        help="output FASTA file (default: stdout)")
    parser.add_argument(
        'specimen_map', type=argparse.FileType('w'), nargs='?',
        help="if specified, output specimen map (default: don't write)")

    args = parser.parse_args()

    specimen_writer = None
    if args.specimen_map is not None:
        specimen_writer = csv.writer(args.specimen_map)

    for seq in SeqIO.parse(args.qiime, 'fasta'):
        match = description_regexp.match(seq.description)
        if not match:
            log.warning("sequence %r had a strange description", seq.id)
            continue
        specimen, seq.id = match.groups()
        seq.description = ''
        SeqIO.write([seq], args.fasta, 'fasta')
        if specimen_writer:
            specimen_writer.writerow((seq.id, specimen))

main()