forked from DamianSkrzypczak/Remus
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmake_data_tree.sh
More file actions
executable file
·158 lines (130 loc) · 6.38 KB
/
make_data_tree.sh
File metadata and controls
executable file
·158 lines (130 loc) · 6.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env bash
# Tree Configuration
# All paths are relative to the directory the script is started from.

# Root of the generated tree and the directory holding the input files.
DATA_ROOT="data"
DATA_SRC_DIR="data_sources"

# Genes
GENES="$DATA_ROOT/genes"
GENES_RAW="$GENES/raw"

# miRNA
MIRNA="$DATA_ROOT/mirna"
MIRNA_RAW="$MIRNA/raw"

# FANTOM5 promoters (transcription start sites), one directory per assembly
F5_TSS_HG19="$DATA_ROOT/promoters/fantom5/hg19"
F5_TSS_HG38="$DATA_ROOT/promoters/fantom5/GRCh38"

# FANTOM5 enhancers: raw download plus one directory per assembly
F5_ENH="$DATA_ROOT/enhancers/fantom5"
F5_ENH_HG19="$F5_ENH/hg19"
F5_ENH_HG38="$F5_ENH/GRCh38"
F5_ENH_RAW="$F5_ENH/raw"

# ENCODE enhancers
ENC_ENH="$DATA_ROOT/enhancers/encode"
ENC_ENH_RAW="$ENC_ENH/raw"

# ENCODE accessible chromatin
ENC_CHROMATIN="$DATA_ROOT/chromatin/encode"
ENC_CHROMATIN_RAW="$ENC_CHROMATIN/raw"

# SCREEN
SCREEN="$DATA_ROOT/screen"
SCREEN_RAW="$SCREEN/raw"

# liftOver executable and the hg19 -> hg38 chain file passed to it
LIFTOVER_EXEC="external_resources/liftOver"
LIFTOVER_HG19_HG38_CHAIN="external_resources/hg19ToHg38.over.chain.gz"
# Make tree
# Creates every directory of the data tree (parents included, existing
# directories tolerated) and lists what was created.
# Options come before the operands and are closed with "--": POSIX utility
# syntax expects that order, and a trailing -v is only honoured by
# implementations that permute their arguments (e.g. GNU); elsewhere it is
# taken as one more directory name.
printf 'Making directories tree under %s\n' "${DATA_ROOT}"
mkdir -p -v -- "${GENES_RAW}" "${GENES}"
mkdir -p -v -- "${MIRNA_RAW}" "${MIRNA}"
mkdir -p -v -- "${F5_ENH_RAW}" "${F5_ENH_HG19}" "${F5_ENH_HG38}"
mkdir -p -v -- "${F5_TSS_HG19}" "${F5_TSS_HG38}"
mkdir -p -v -- "${ENC_ENH_RAW}" "${ENC_ENH}"
mkdir -p -v -- "${ENC_CHROMATIN_RAW}" "${ENC_CHROMATIN}"
mkdir -p -v -- "${SCREEN}" "${SCREEN_RAW}"
# Create genes.db
# The NCBIrefSeq.<assembly>.txt source files are staged in the raw genes
# directory as <assembly>.txt, then the importer is pointed at that directory.
printf "Creating genes db\n"
for assembly in hg19 hg38; do
  cp "${DATA_SRC_DIR}/NCBIrefSeq.${assembly}.txt" "${GENES_RAW}/${assembly}.txt"
done
python3 remus/data_import/create_genes_db.py -i "${GENES_RAW}" -o "${GENES}/genes.db"
# Create miRNA targets.db
printf "Creating miRNA targets db\n"
# Local mirtarbase.* files are copied unchanged; the glob has to stay outside
# the quotes so the shell expands it.
cp "${DATA_SRC_DIR}"/mirtarbase.* "${MIRNA_RAW}/"
# The miRWalk archive is downloaded into the sources directory, then extracted
# to stdout (-so) and gzip-compressed straight into the raw miRNA directory.
mirwalk_archive="${DATA_SRC_DIR}/hsa_miRWalk_3UTR.7z"
wget -O "${mirwalk_archive}" http://mirwalk.umm.uni-heidelberg.de/download/hsa_miRWalk_3UTR.7z
7zr x -so "${mirwalk_archive}" \
  | gzip -c > "${MIRNA_RAW}/mirwalk_3UTR.tsv.gz"
# Build the targets database from everything staged in the raw directory.
python3 remus/data_import/create_mirna_target_db.py -i "${MIRNA_RAW}" -o "${MIRNA}/targets.db"
# Download FANTOM5 CAGE expression matrix and ontology (to find transcription start sites)
printf "Acquiring transcription start sites FANTOM5 data\n"
f5_cage_matrix="${DATA_SRC_DIR}/hg19.cage_peak_phase1and2combined_tpm.osc.txt.gz"
f5_ontology="${DATA_SRC_DIR}/ff-phase2-170801.obo.txt"
wget -O "${f5_cage_matrix}" 'http://fantom.gsc.riken.jp/5/datafiles/latest/extra/CAGE_peaks/hg19.cage_peak_phase1and2combined_tpm.osc.txt.gz'
wget -O "${f5_ontology}" http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Ontology/ff-phase2-170801.obo.txt
# aggregate samples by organs, tissues and cell-types and store location of TSSs in BED files
python3 remus/data_import/aggregate_CAGE_peaks.py "${f5_ontology}" "${f5_cage_matrix}" "${F5_TSS_HG19}"
# trim columns, sort, merge, compress and index BED files
for b in "${F5_TSS_HG19}"/*.bed; do
  # An unmatched glob stays as the literal pattern; do not process it.
  [[ -e "${b}" ]] || continue
  # The BED is replaced (and then compressed) only if the sort/merge pipeline
  # reported success, so a failed bedtools run cannot overwrite it with a
  # partial file.
  if cut -f1-3 "${b}" | bedtools sort -i - | bedtools merge -i - > "${b}.sbed" \
    && mv "${b}.sbed" "${b}"; then
    bgzip "${b}" && tabix -p bed "${b}.gz"
  else
    rm -f -- "${b}.sbed"
    printf 'Sorting/merging %s failed; file left unchanged\n' "${b}" >&2
  fi
done
# liftover to hg38
for b in "${F5_TSS_HG19}"/*.bed.gz; do
  [[ -e "${b}" ]] || continue
  # Same file name as the hg19 input, minus the .gz suffix, in the GRCh38 directory.
  hg38_bed="${F5_TSS_HG38}/$(basename "${b%.gz}")"
  # Each step runs only if the previous one succeeded, and success is
  # announced only when the compressed, indexed file was really produced.
  if "${LIFTOVER_EXEC}" "${b}" "${LIFTOVER_HG19_HG38_CHAIN}" "${hg38_bed}" "${hg38_bed}.unmapped" \
    && bedtools sort -i "${hg38_bed}" | bedtools merge -i - > "${hg38_bed}.tmp" \
    && mv "${hg38_bed}.tmp" "${hg38_bed}" \
    && bgzip "${hg38_bed}" \
    && tabix -p bed "${hg38_bed}.gz"; then
    echo "Lifted over ${b} to ${hg38_bed}.gz"
  else
    printf 'Liftover of %s to %s.gz failed\n' "${b}" "${hg38_bed}" >&2
  fi
done
# Download FANTOM5 enhancers
printf "Acquiring enhancers fantom5 data\n"
f5_enh_archive="${F5_ENH_RAW}/facet_expressed_enhancers.tgz"
# -O overwrites an earlier copy. With -P a re-run stores the new download
# under a numbered name (*.tgz.1) while tar below keeps reading the old file.
wget -O "${f5_enh_archive}" http://enhancer.binf.ku.dk/presets/facet_expressed_enhancers.tgz
# The member patterns are quoted so that tar, not the shell, expands them
# (an unquoted pattern would be replaced by matching names in the current
# directory, if any exist).
printf "... extracting celltype data\n"
tar -xzf "${f5_enh_archive}" -C "${F5_ENH_HG19}" --wildcards 'CL:*'
printf "... extracting organ data\n"
tar -xzf "${f5_enh_archive}" -C "${F5_ENH_HG19}" --wildcards 'UBERON*'
# remove the _expressed_enhancers suffix and replace the colon with an underscore
rename.ul "_expressed_enhancers" "" "${F5_ENH_HG19}"/*.bed
rename.ul ":" "_" "${F5_ENH_HG19}"/*.bed
# trim columns, compress and index BED files
for b in "${F5_ENH_HG19}"/*.bed; do
  # An unmatched glob stays as the literal pattern; do not process it.
  [[ -e "${b}" ]] || continue
  cut -f1-3 "${b}" > "${b}.new" && mv "${b}.new" "${b}"
  bgzip "${b}" && tabix -p bed "${b}.gz"
done
# liftover to hg38
for b in "${F5_ENH_HG19}"/*.bed.gz; do
  [[ -e "${b}" ]] || continue
  # Same file name as the hg19 input, minus the .gz suffix, in the GRCh38 directory.
  hg38_bed="${F5_ENH_HG38}/$(basename "${b%.gz}")"
  # Each step runs only if the previous one succeeded, and success is
  # announced only when the compressed, indexed file was really produced.
  if "${LIFTOVER_EXEC}" "${b}" "${LIFTOVER_HG19_HG38_CHAIN}" "${hg38_bed}" "${hg38_bed}.unmapped" \
    && bedtools sort -i "${hg38_bed}" | bedtools merge -i - > "${hg38_bed}.tmp" \
    && mv "${hg38_bed}.tmp" "${hg38_bed}" \
    && bgzip "${hg38_bed}" \
    && tabix -p bed "${hg38_bed}.gz"; then
    echo "Lifted over ${b} to ${hg38_bed}.gz"
  else
    printf 'Liftover of %s to %s.gz failed\n' "${b}" "${hg38_bed}" >&2
  fi
done
#
# Download ENCODE enhancers data (TF ChIP-seq)
#
ENC_ENH_METADATA="${DATA_SRC_DIR}/ENCODE_enhancer_metadata_190912.tsv"
printf "Acquiring ENCODE enhancers data\n"
# download raw BED files: column 43 of every metadata row whose column 48
# matches "released" and whose column 3 matches "optimal" or
# "pseudoreplicated" is handed to wget as its URL list
awk -F '\t' '$48~"released" && ($3~"optimal" || $3~"pseudoreplicated") {print $43}' "${ENC_ENH_METADATA}" \
  | wget -i - -P "${ENC_ENH_RAW}"
# generate collapsing script & run it; the script is executed only when the
# generator exited successfully, so a partially written script is never run
enc_enh_script="${ENC_ENH}/collapse_and_liftover.sh"
if python3 remus/data_import/collapse_encode_enhancer_beds.py "${ENC_ENH_METADATA}" "${ENC_ENH_RAW}" "${ENC_ENH}" > "${enc_enh_script}"; then
  chmod u+x "${enc_enh_script}" && "${enc_enh_script}"
else
  printf 'Generating %s failed; not running it\n' "${enc_enh_script}" >&2
fi
# delete raw BEDs to save space (only the raw directory, not the whole
# ENCODE enhancers tree)
# rm -r "${ENC_ENH_RAW}"
#
# Download ENCODE accessible chromatin data (~1.6GB)
#
ENC_CHROMATIN_METADATA="${DATA_SRC_DIR}/ENCODE_chromatin_metadata_190911.tsv"
printf "Acquiring ENCODE accessible chromatin data\n"
# download raw BED files: column 43 of every metadata row whose column 48
# matches "released" is handed to wget as its URL list
awk -F '\t' '$48~"released" {print $43}' "${ENC_CHROMATIN_METADATA}" \
  | wget -i - -P "${ENC_CHROMATIN_RAW}"
# generate collapsing script & run it; the script is executed only when the
# generator exited successfully, so a partially written script is never run
enc_chromatin_script="${ENC_CHROMATIN}/collapse_and_liftover.sh"
if python3 remus/data_import/collapse_encode_chromatin_beds.py "${ENC_CHROMATIN_METADATA}" "${ENC_CHROMATIN_RAW}" "${ENC_CHROMATIN}" > "${enc_chromatin_script}"; then
  chmod u+x "${enc_chromatin_script}" && "${enc_chromatin_script}"
else
  printf 'Generating %s failed; not running it\n' "${enc_chromatin_script}" >&2
fi
# delete raw BEDs to save space
# rm -r "${ENC_CHROMATIN_RAW}"
#
# ENCODE Screen ccREs datasets
#
SCREEN_METADATA_TABLE="${DATA_SRC_DIR}/screen_ccREs_annotation_report_2019_9_6_11h_21m.tsv"
# Download raw BED files (~8G)
echo -n "Downloading SCREEN data..."
# Column 36 of every row whose column 5 matches "5-group" is split on "/";
# pieces 3 and 6 are used as file accessions and turned into download URLs.
# NOTE(review): presumably column 36 looks like "/files/<id1>/,/files/<id2>/"
# - confirm against the metadata table.
# The awk output is deliberately word-split, one loop pass per
# whitespace-separated token.
for ids in $(awk -F"\t" '$5~"5-group" {print $36}' "${SCREEN_METADATA_TABLE}"); do
  id1=$(cut -d"/" -f3 <<< "${ids}")
  id2=$(cut -d"/" -f6 <<< "${ids}")
  for id in "${id1}" "${id2}"; do
    # A token holding fewer accessions yields an empty piece; skip it rather
    # than emit the malformed URL ".../files//@@download/.bed.gz".
    [[ -n "${id}" ]] || continue
    echo "https://www.encodeproject.org/files/${id}/@@download/${id}.bed.gz"
  done
done > "${SCREEN_RAW}/files"
wget -i "${SCREEN_RAW}/files" -P "${SCREEN_RAW}"
echo "Done."
# extract enhancers, promoters, insulators, and open chromatin from these data;
# the generated script is executed only when the generator exited
# successfully, so a partially written script is never run
screen_script="${SCREEN}/collapse_and_liftover.sh"
if python3 remus/data_import/split_screen_beds.py "${SCREEN_METADATA_TABLE}" "${SCREEN_RAW}" "${SCREEN}" > "${screen_script}"; then
  chmod u+x "${screen_script}" && "${screen_script}"
else
  printf 'Generating %s failed; not running it\n' "${screen_script}" >&2
fi
#
# delete raw BEDs to save space
# rm -r "${SCREEN_RAW}"