1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315
|
#!/usr/bin/python3
# Convert bam files to wiggle files for usage with AUGUSTUS (exonpart hints)
# Alternative to Augustus/auxprogs/bam2wig/bam2wig, which often causes
# compilation problems. However, this script also has a number of dependencies:
# UCSC tools twoBitInfo, fatToTwoBit, Unix sort, samtools.
# Unix sort and samtools are usually installed on AUGUSTUS user PCs,
# UCSC tools are automatically obtained if not present on machine.
# In comparison to bam2wig, this script requires the genome file
# as additional input argument. In return, it ensures that aligments
# do not exceed sequence boundaries (which sometimes happens and is not caught
# by bam2wig).
# This script is a re-assembly of functions from MakeHub
# at https://github.com/Gaius-Augustus/MakeHub
import string
import random
import os
import os.path
import argparse
import re
import urllib.request
import subprocess
import platform
from inspect import currentframe, getframeinfo
import shutil
''' ******************* BEGIN HEAD ******************************************'''
''' Contains arg parser & software requirement checks '''
__author__ = "Katharina J. Hoff"
__copyright__ = "Copyright 2019. All rights reserved."
__credits__ = "MakeHub project"
__license__ = "Artistic Licsense"
__version__ = "1.0.0"
__email__ = "katharina.hoff@uni-greifswald.de"
__status__ = "development"
ucsc_tools = {'twoBitInfo': '', 'faToTwoBit': ''}
parser = argparse.ArgumentParser(
description='Convert bam file to wiggle format for usage with AUGUSTUS ' +
'as exonpart hints.')
parser.add_argument('-b', '--bamFile', required=True, type=str,
help="Input file in Bam format.")
parser.add_argument('-g', '--genomeFile', required=True, type=str,
help="Input genome file in FASTA format.")
parser.add_argument('-o', '--outFile', required=True, type=str,
help="Output file in wiggle format.")
parser.add_argument('-s', '--SAMTOOLS_PATH', required=False, type=str,
help="Path to samtools executable, e.g. \'/usr/bin\'.")
args = parser.parse_args()
''' This script contains bash calls for samtools. It has not been tested
on OS X or Windows. Therefore check platform prior further actions. '''
plat_sys = platform.system()
if plat_sys != "Linux":
frameinfo = getframeinfo(currentframe())
print('Warning in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': '
+ "This script has been developed and tested on Linux, " +
"only. Your operating system is " + plat_sys +
". The script " +
"might not work properly on your system!")
''' This script depends on samtools pileup. Check whether samtools is
available. '''
samtools = ""
if args.SAMTOOLS_PATH:
samtools = args.SAMTOOLS_PATH + "/samtools"
if not(os.access(samtools, os.X_OK)):
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': ' + samtools + " is not executable!")
exit(1)
else:
if shutil.which("samtools") is not None:
samtools = shutil.which("samtools")
else:
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': '
+ "Unable to locate samtools binary!")
print("samtools is available as package in many Linux distributions.")
print("For example, on Ubuntu, try installing with:")
print("\"sudo apt install samtools\"")
print("If samtools is unavailable as a package, you can obtain it " +
"from github at:")
print("https://github.com/samtools/samtools")
exit(1)
''' Find bash sort '''
sort_tool = shutil.which('sort')
if sort_tool is None:
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': ' + "Unable to locate bash tool 'sort'")
print('sort is part of most Linux distributions. On Ubuntu, it is ' +
'part of the package coreutils. Try re-installing your bash if sort' +
' is missing on your system.')
quit(1)
''' Find or obtain UCSC tools (here, tool for chromsome sizes file).
script has never been tested on Darwin, tool download is implemented
for Darwin though. '''
arch = platform.machine()
for key, val in ucsc_tools.items():
if shutil.which(key) is not None:
ucsc_tools[key] = shutil.which(key)
elif os.path.isfile(os.getcwd() + "/" + key):
ucsc_tools[key] = os.getcwd() + "/" + key
if not(os.access(ucsc_tools[key], os.X_OK)):
os.chmod(ucsc_tools[key], 0o777)
else:
if not(arch == 'x86_64'):
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': '
+ "This script depends on binaries that are available for " +
"x86_64 architecture for linux and MacOX. " +
"We have determined that your system architecture is " +
arch + "." +
" Please try downloading " + key +
" for your architecture from: " +
"http://hgdownload.soe.ucsc.edu/admin/exe")
exit(1)
elif not(plat_sys == 'Linux') and not(plat_sys == 'Darwin'):
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': '
+ "This script depends on binaries that are available for " +
"x86_64 architecture for linux and MacOX. " +
"We have determined that your system is " +
plat_sys + "." +
" Please try downloading " + key +
" for your operating system from: " +
"http://hgdownload.soe.ucsc.edu/admin/exe")
exit(1)
else:
if plat_sys == 'Linux':
tool_url = "http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/" + key
elif plat_sys == 'Darwin':
tool_url = "http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/" + key
print("Was unable to locate " + key +
" on your system, will try to download it from " + tool_url + "...")
with urllib.request.urlopen(tool_url) as response, open(key, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
ucsc_tools[key] = os.getcwd() + "/" + key
os.chmod(ucsc_tools[key], 0o777)
''' ******************* END HEAD ********************************************'''
''' ******************* BEGIN FUNCTIONS *************************************'''
''' Function that runs a subprocess with arguments '''
def run_simple_process(args_lst):
try:
result = subprocess.run(
args_lst, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if(result.returncode == 0):
return(result)
else:
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': ' + "Return code of subprocess was " +
str(result.returncode) + str(result.args))
quit(1)
except subprocess.CalledProcessError as grepexc:
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': ' + "Failed executing: ",
" ".join(grepexec.args))
print("Error code: ", grepexc.returncode, grepexc.output)
quit(1)
''' Function that runs a subprocess with input from STDIN '''
def run_process_stdinput(args_lst, byte_obj):
try:
result = subprocess.run(args_lst, input=byte_obj,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if(result.returncode == 0):
return(result)
else:
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': '
+ "run_process_stdinput: return code of subprocess was "
+ str(result.returncode))
quit(1)
except subprocess.CalledProcessError as grepexc:
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': ' +
"Failed executing: ", " ".join(grepexec.args))
print("Error code: ", grepexc.returncode, grepexc.output)
quit(1)
''' Function that writes subprocess byte object to flat file '''
def write_byteobj(byte_obj, outfile):
try:
with open(outfile, 'w') as byteobj_handle:
byteobj_handle.write(byte_obj.decode('utf-8'))
except IOError:
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': ' + "Failed to open file " + outfile +
" for writing!")
quit(1)
''' Function that converts bam file to wig file with RSeQC '''
def bamToWig(bam_file, wig_file, size_file, mpileup_file):
subprcs_args = [samtools, "mpileup", "-o", mpileup_file, bam_file]
run_simple_process(subprcs_args)
# observed that in rare cases a wig file might contain coverage for one
# more base than present in the sequence; seems to be an alignment
# error, not a bam2wig error, because the same problem arises if I
# convert bam to wig in python in a different way
# therefore check wiggle file for sanity and modify if required
# (i.e. cleave coverage at sequence end)
chrom_sizes = {}
try:
with open(size_file, "r") as size_handle:
for line in size_handle:
split_list = re.split(r'\t', line.rstrip('\n'))
chrom_sizes[split_list[0]] = int(split_list[1])
except IOError:
frameinfo = getframeinfo(currentframe())
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': ' + "Could not open file " +
size_file + " for reading!")
try:
with open(mpileup_file, "r") as pileup_handle:
try:
with open(wig_file, "w") as wig_handle:
wig_handle.write(
"track name=" + bam_file + " type=wiggle_0\n")
lastSeq = ""
lastStart = 0
for line in pileup_handle:
seq, start, t1, depth, t2, t3 = line.split()
if (seq != lastSeq) and (start != lastStart):
wig_handle.write(
"variableStep chrom=" + seq + "\n")
if(int(start) <= chrom_sizes[seq]):
wig_handle.write(start + " " + depth + "\n")
lastSeq = seq
lastStart = start
except IOError:
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': ' +
"Could not open file " + wig_file + " for writing!")
except IOError:
print('Error in file ' + frameinfo.filename + ' at line ' +
str(frameinfo.lineno) + ': ' +
"Could not open file " + pilup_file + " for reading!")
''' Function that generates a random string of fixed length, default length is
10.'''
def randomString(stringLength=10):
letters = string.ascii_lowercase
return ''.join(random.choice(letters) for i in range(stringLength))
''' ******************* END FUNCTIONS ***************************************'''
''' Generate random file prefix for temporary files '''
prefix = randomString()
twoBitFile = "twoBitFile_" + prefix
ChromSizes_file = "chromSizes_" + prefix
pileup_file = "pileup_" + prefix
''' Generate chrosome sizes file '''
subprcs_args = [ucsc_tools['faToTwoBit'], args.genomeFile, twoBitFile]
run_simple_process(subprcs_args)
subprcs_args = [ucsc_tools['twoBitInfo'], twoBitFile, 'stdout']
result = run_simple_process(subprcs_args)
subprcs_args = [sort_tool, '-k2rn']
result = run_process_stdinput(subprcs_args, result.stdout)
write_byteobj(result.stdout, ChromSizes_file)
''' Generate Wiggle file '''
bamToWig(args.bamFile, args.outFile, ChromSizes_file, pileup_file)
''' Delete temporary files '''
os.remove(twoBitFile)
os.remove(ChromSizes_file)
os.remove(pileup_file)
|