File: get_data.sh

package info (click to toggle)
bcftools 1.22-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 22,792 kB
  • sloc: ansic: 72,978; perl: 7,583; sh: 694; python: 595; makefile: 301
file content (51 lines) | stat: -rwxr-xr-x 3,351 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# ----------------------------------------------------------------------
# Human reference
# I have this locally:
# /nfs/srpipe_references/references/Human/GRCh38_full_analysis_set_plus_decoy_hla/all/fasta/Homo_sapiens.GRCh38_full_analysis_set_plus_decoy_hla.fa

# This has 3366 sequences.
# The BAMs appear to be aligned against 2580 sequences.
# The references in the GIAB reference directory appear to have a few
# hundred sequences at most.
# I'm unsure which reference they used, but if we stick to the
# already aligned files and to the primary chromosomes then
# frankly any GRCh38 should be fine for evaluation purposes.
# (NB: unsure if this holds for chr21 due to changes there, but those
# may be in patch form only)

# ----------------------------------------------------------------------
# HG005 for final evaluation only
# chr20 only

# ---- Truth set
# Download the HG005 NIST v4.2.1 GRCh38 benchmark files (BED, VCF and the
# VCF's .tbi index) into the current directory.
echo "Fetching truth set"
truth_url=https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/ChineseTrio/HG005_NA24631_son/NISTv4.2.1/GRCh38
for truth_file in \
    HG005_GRCh38_1_22_v4.2.1_benchmark.bed \
    HG005_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
    HG005_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi; do
    # -N (timestamping) makes re-runs idempotent: an up-to-date local copy
    # is left alone and a stale or size-mismatched one is replaced in place,
    # instead of plain wget saving a second copy as <name>.1.
    # A failure is reported by name but does not stop the remaining fetches.
    wget -N "$truth_url/$truth_file" ||
        echo "Failed to fetch $truth_file" >&2
done

# ---- Data files
# Pull a 1Mb window of chr20 out of each remote HG005 BAM.  samtools reads
# the URL directly and writes only the alignments overlapping $reg, plus an
# index for each output file (--write-index).
reg=chr20:20000000-21000000
giab_hg005=https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/ChineseTrio/HG005_NA24631_son

# fetch_region LABEL OUTPUT URL
#   LABEL   technology name used in the progress message
#   OUTPUT  local BAM file to write
#   URL     remote BAM to read from
# Returns non-zero (after reporting on stderr) if samtools fails; the
# caller carries on with the next data set.
fetch_region() {
    echo "Getting $1 $reg"
    samtools view -o "$2" --write-index -@8 "$3" "$reg" || {
        echo "Failed to fetch $1 data for $reg into $2" >&2
        return 1
    }
}

fetch_region Illumina illumina_300x.bam \
    "$giab_hg005/HG005_NA24631_son_HiSeq_300x/NHGRI_Illumina300X_Chinesetrio_novoalign_bams/HG005.GRCh38_full_plus_hs38d1_analysis_set_minus_alts.300x.bam"

fetch_region BGI bgi_100x.bam \
    "$giab_hg005/NIST_BGIseq_2x150bp_100x/GRCh38/HG005_GRCh38_BGIseq-2x150-100x_NIST_20211126.bam"

fetch_region PacBio pacbio_50x.bam \
    "$giab_hg005/PacBio_CCS_15kb_20kb_chemistry2/GRCh38/GIAB_5mC_CpG/HG005.GRCh38.deepvariant.haplotagged.bam"

# ----------------------------------------------------------------------
# HG002 for code modification, tuning, tweaking and round-trips.
# chr1 only

# ---- Truth set
# Download the HG002 NIST v4.2.1 GRCh38 benchmark files (VCF, its .tbi
# index and the "noinconsistent" BED) into the current directory.
echo "Fetching truth set"
truth_url=https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/AshkenazimTrio/HG002_NA24385_son/NISTv4.2.1/GRCh38
for truth_file in \
    HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz \
    HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi \
    HG002_GRCh38_1_22_v4.2.1_benchmark.noinconsistent.bed; do
    # -N (timestamping) makes re-runs idempotent: an up-to-date local copy
    # is left alone and a stale or size-mismatched one is replaced in place,
    # instead of plain wget saving a second copy as <name>.1.
    # A failure is reported by name but does not stop the remaining fetches.
    wget -N "$truth_url/$truth_file" ||
        echo "Failed to fetch $truth_file" >&2
done

# ---- Data files
# HG002 data lives in the same kind of location as the HG005 data above,
# but there is more of it.  Nothing is downloaded here: browse the
# directory and decide what to test with.
reg=chr1
printf '%s\n' "Browse https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/"
# e.g. https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/Element_AVITI_20231018/HG002_GRCh37_Element-StdInsert_2X150_81x_20231018.bam
# NOTE(review): the example file name says GRCh37, while the HG002 truth
# set fetched by this script is GRCh38 - confirm before using it.