1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
|
#!/usr/bin/python3
################
## get_hg38_bed.py parses an HG38 coordinate into a pseudo-bed entry,
## and runs bedtools getfasta to return the fasta sequence
##
import sys
import re
from subprocess import Popen, PIPE, STDOUT
import shlex
import argparse
## a genome_loc should look like: chr#:start-stop
## if stop < start, coordinates are reversed
genome_dict={'hg38':'genome_dna/hg38/reference.fa',
'mm10':'genome_dna/mm10/reference.fa',
'rn6':'genome_dna/rn6/rn6.fa',
'bosTau9':'genome_dna/bosTau9/bosTau9.fa'}
parser=argparse.ArgumentParser(description='get_genome_seq.py : get fasta sequence from genome coordinates ')
parser.add_argument('--genome', help='genome: hg38 | mm10 | rn6 | bosTau9',dest='genome',action='store',default='hg38')
parser.add_argument('coords', help='genome coordinates chr1:12345-54321', nargs='*')
args=parser.parse_args()
bed_cmd = 'bedtools getfasta -fi $RDLIB2/%s -bed stdin' % (genome_dict[args.genome])
bed_lines = ''
for genome_loc in args.coords:
chrom, g_range = genome_loc.split(':')
g_start, g_end = g_range.split('-')
if (g_start > g_end):
g_start, g_end = g_end, g_start
g_start, g_end = int(g_start), int(g_end)
g_start -= 1
bed_lines += '%s\t%d\t%d\n' % (chrom, g_start, g_end)
bed_p = Popen(bed_cmd, stdout=PIPE, stdin=PIPE, stderr=STDOUT, shell=True, encoding='utf-8')
out, err = bed_p.communicate(input=bed_lines)
for line in out.split('\n'):
if (line and line[0]=='>'):
(chrom, start, stop) = re.search(r'>([^:]+):(\d+)\-(\d+)',line).groups()
print(line + " @C:%s" % (start))
elif (line):
print(line)
|