File: get_genome_seq.py

package info (click to toggle)

fasta3 36.3.8i.14-Nov-2020-3

links: PTS, VCS
area: main
in suites: sid, trixie
size: 7,016 kB
sloc: ansic: 77,269; perl: 10,677; python: 2,461; sh: 428; csh: 86; sql: 55; makefile: 40

file content (54 lines) | stat: -rwxr-xr-x 1,676 bytes

parent folder | download | duplicates (3)

#!/usr/bin/python3

################
## get_hg38_bed.py parses an HG38 coordinate into a pseudo-bed entry,
## and runs bedtools getfasta to return the fasta sequence
##

import sys
import re
from subprocess import Popen, PIPE, STDOUT
import shlex
import argparse

## a genome_loc should look like: chr#:start-stop
## if stop < start, coordinates are reversed

genome_dict={'hg38':'genome_dna/hg38/reference.fa',
             'mm10':'genome_dna/mm10/reference.fa',
             'rn6':'genome_dna/rn6/rn6.fa',
             'bosTau9':'genome_dna/bosTau9/bosTau9.fa'}

parser=argparse.ArgumentParser(description='get_genome_seq.py : get fasta sequence from genome coordinates ')
parser.add_argument('--genome', help='genome: hg38 | mm10 | rn6 | bosTau9',dest='genome',action='store',default='hg38')
parser.add_argument('coords', help='genome coordinates chr1:12345-54321', nargs='*')

args=parser.parse_args()

bed_cmd = 'bedtools getfasta -fi $RDLIB2/%s -bed stdin' % (genome_dict[args.genome])

bed_lines = ''
for genome_loc in args.coords:

    chrom, g_range = genome_loc.split(':')
    g_start, g_end = g_range.split('-')

    if (g_start > g_end):
        g_start, g_end = g_end, g_start

    g_start, g_end = int(g_start), int(g_end)
    g_start -= 1

    bed_lines += '%s\t%d\t%d\n' % (chrom, g_start, g_end)

bed_p = Popen(bed_cmd, stdout=PIPE, stdin=PIPE, stderr=STDOUT, shell=True, encoding='utf-8')
out, err = bed_p.communicate(input=bed_lines)

for line in out.split('\n'):
    if (line and line[0]=='>'):
        (chrom, start, stop) = re.search(r'>([^:]+):(\d+)\-(\d+)',line).groups()
        print(line + " @C:%s" % (start))
    elif (line):
        print(line)