File: do.sh

package info (click to toggle)
augustus 3.5.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 777,036 kB
  • sloc: cpp: 80,063; perl: 21,491; python: 4,368; ansic: 1,244; makefile: 1,126; sh: 171; javascript: 32
file content (75 lines) | stat: -rwxr-xr-x 3,165 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# these commands will run the whole tutorial automatically
# input in this directory:
# chr2L.sm.fa  rnaseq1.fq  rnaseq2.fq

mkdir gindex
STAR --runThreadN 4 --runMode genomeGenerate --genomeDir gindex --genomeFastaFiles chr2L.sm.fa # ~1m
STAR --runThreadN 4 --genomeDir gindex --readFilesIn rnaseq1.fq rnaseq2.fq --alignIntronMax 100000 \
  --outSAMtype BAM  SortedByCoordinate --outWigType wiggle --outWigStrand Unstranded # ~2m

for s in nasonia zebrafish tomato; do
  augustus --species=$s chr2L.sm.fa --softmasking=on --predictionEnd=1000000 > aug.$s.1-1M.gff &
done

sleep 2m

echo "chr2L   23513712" > chrom.sizes # size of chromosomes
mv Signal.Unique.str1.out.wig rnaseq.wig    # rename for convenience
mv Aligned.sortedByCoord.out.bam rnaseq.bam 
wigToBigWig rnaseq.wig chrom.sizes rnaseq.bw
bamtools index -in rnaseq.bam # index required by UCSC browser
scp rnaseq.bw rnaseq.bam rnaseq.bam.bai mario@hgwdev:~/public_html/tutorial2015/results/ # copy to web space


echo "browser hide all" > customtrack
echo "track name=\"STAR RNA-Seq alignments\" type=bam visibility=4 bigDataUrl=http://hgwdev.cse.ucsc.edu/~mario/tutorial2015/results/rnaseq.bam" >> customtrack
echo "track name=\"STAR coverage\" type=bigWig visibility=2 bigDataUrl=http://hgwdev.cse.ucsc.edu/~mario/tutorial2015/results/rnaseq.bw" >> customtrack

for s in nasonia zebrafish tomato; do
   echo "track name=\"AUGUSTUS $s abinitio\"  db=dm6 visibility=3" >> customtrack
   grep -P "AUGUSTUS\t(CDS|exon)\t" aug.$s.1-1M.gff >> customtrack # use only the relevant coordinate lines
done

# create hints from rnaseq
bam2hints --intronsonly --in=rnaseq.bam --out=hints.intron.gff
cat rnaseq.wig | wig2hints.pl --width=10 --margin=10 --minthresh=2 --minscore=4 --prune=0.1 --src=W --type=ep \
  --radius=4.5 --pri=4 --strand="." > hints.ep.gff
cat hints.intron.gff hints.ep.gff > hints.gff

# Predict genes using hints:
for i in {0..3}; do
    augustus --species=nasonia chr2L.sm.fa --softmasking=on --predictionStart=$((i*2000000)) --predictionEnd=$(((i+1)*2000000+50000)) \
    --hintsfile=hints.gff --extrinsicCfgFile=extrinsic.M.RM.E.W.cfg  > aug.nasonia.hints.$i.gff &
done

sleep 7m

cat aug.nasonia.hints.{0..3}.gff | join_aug_pred.pl > aug.nasonia.hints.gff

#  Add the browser track of genes with evidence:

echo "track name=\"AUGUSTUS nasonia hints\"  db=dm6 visibility=3" >> customtrack
grep -P "AUGUSTUS\t(CDS|exon)\t" aug.nasonia.hints.gff  >> customtrack
gzip -c customtrack > customtrack.whints.gz
scp customtrack.whints.gz mario@hgwdev:~/public_html/tutorial2015/results/ # copy to web space

# subset of 100 supported tx
cat aug.nasonia.hints.gff | perl -ne 'if (/\ttranscript\t.*\t(\S+)/){$tx=$1;} if (/transcript supported.*100/) {print "$tx\n";}' | tee supported.lst | wc -l

gff2gbSmallDNA.pl --good=supported.lst aug.nasonia.hints.gff chr2L.sm.fa 5000 genes.gb

##################### training.html

randomSplit.pl genes.gb 100

grep -c LOCUS genes.gb*
new_species.pl --species=bug

etraining --species=bug genes.gb.train

ls -ort $AUGUSTUS_CONFIG_PATH/species/bug/

augustus --species=bug genes.gb.test | tee firsttest.out # takes ~1m

grep -A 22 Evaluation firsttest.out