crc_tutorial_scripts/0_summarize_data.py at master · charlesylin/crc_tutorial_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/python


#crc tutorial scripts
#0_summarize_data.py
#identifies datasets and dependencies necessary for tutorial


#==========================================================================
#=============================DEPENDENCIES=================================
#==========================================================================


import sys, os, string
# Absolute directory holding this script (symlinks resolved), always written
# with a trailing '/' so it can be concatenated directly with file names.
whereAmI = os.path.split(os.path.realpath(__file__))[0] + '/'
print(whereAmI)

# whereAmI ends with '/', so splitting on '/' leaves an empty last field.
# Dropping the last two fields (script directory name + that empty field)
# gives the parent directory; the pipeline code is looked up in its
# 'pipeline/' subfolder.
pipeline_dir = '/'.join(whereAmI.split('/')[:-2]) + '/pipeline/'
print(pipeline_dir)


# Both folders must be on the module search path before the project-local
# imports that follow (pipeline_dfci, utils) are attempted.
sys.path.extend([whereAmI, pipeline_dir])

import pipeline_dfci
import utils
import string
import numpy
import os
import re
from collections import defaultdict
import subprocess
#==========================================================================
#============================PARAMETERS====================================
#==========================================================================


# Name printed by main() when the analysis starts.
projectName = 'crc_tutorial'
# Genome build used for the annotation path and the configuration table.
genome = 'hg19'
# Refseq annotation for the build, inside the pipeline folder.
# NOTE: pipeline_dir already ends with '/', so this path contains '//'.
annotFile = '%s/annotation/%s_refseq.ucsc' % (pipeline_dir, genome)

#project folders
# The project root is the parent of the directory holding this script
# (same split/join trick as for pipeline_dir). utils.formatFolder is a
# project helper; presumably it normalizes the path and, with True, creates
# the folder if missing -- verify against utils.
projectFolder = utils.formatFolder('/'.join(whereAmI.split('/')[:-2]) + '/', True)


#mask Files


#==========================================================================
#============================LIST OF DATAFILES=============================
#==========================================================================

#this project will utilize multiple datatables
#data tables are organized largely by type/system
#some data tables overlap for ease of analysis

#ChIP-seq
# data table listing the ChIP-seq datasets, under <projectFolder>data_tables/
chip_data_file = '%sdata_tables/MM1S_CHIP_TABLE.txt' % projectFolder

#ATAC-seq
# data table listing the ATAC-seq datasets, under <projectFolder>data_tables/
atac_data_file = '%sdata_tables/MM1S_ATAC_TABLE.txt' % projectFolder

#==========================================================================
#===========================MAIN METHOD====================================
#==========================================================================


def _require_executable(name):
    """Return the full path of the executable *name* found on PATH.

    Raises:
        ValueError: if no executable called *name* can be located.
    """
    # shutil.which is the supported lookup on Python 3 (distutils was removed
    # in Python 3.12); fall back to distutils on interpreters that lack it.
    try:
        from shutil import which as find_executable
    except ImportError:
        from distutils.spawn import find_executable

    location = find_executable(name)
    if location is None:
        raise ValueError('%s not found in path' % (name))
    return location


def main():
    """Check the tutorial's data tables, write the genome config, verify tools.

    Steps, in order:
      I.   run pipeline_dfci.summary on the ChIP-seq and ATAC-seq data tables
      II.  write crc_config.txt (genome directory and mask file for the
           build) next to this script
      III. confirm that bamliquidator, bamliquidator_batch.py and fimo are
           available on PATH

    Side effects: changes the working directory to projectFolder and writes
    the configuration file into whereAmI.

    Raises:
        ValueError: if any of the required executables is not found on PATH.
    """

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print('#======================================================================')
    print('#==================I. LOADING DATA ANNOTATION TABLES===================')
    print('#======================================================================')
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for chip data file
    pipeline_dfci.summary(chip_data_file)

    #for atac data file
    pipeline_dfci.summary(atac_data_file)

    print('\n\n')
    print('#======================================================================')
    print('#=====================II. CONFIGURING GENOME BUILD=====================')
    print('#======================================================================')
    print('\n\n')

    genome_directory = '%sgenomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/' % (projectFolder)
    mask_file = '%sgenomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed' % (projectFolder)

    # One-column table: a header row followed by BUILD:FIELD:PATH records.
    config_table = [['BUILD:FIELD:PATH'],
                    ['%s:%s:%s' % (genome, 'genome_directory', genome_directory)],
                    ['%s:%s:%s' % (genome, 'mask_file', mask_file)],
                    ]
    # The config file lives next to this script, not in the project folder.
    config_path = '%scrc_config.txt' % (whereAmI)

    utils.unParseTable(config_table, config_path, '\t')
    print('writing genome configuration to %s' % (config_path))

    print('\n\n')
    print('#======================================================================')
    print('#==================III. DETECTING DEPENDENCIES=========================')
    print('#======================================================================')
    print('\n\n')

    # Each tool is looked up and checked on its own result; the first missing
    # one aborts with a ValueError naming that tool.
    for tool in ('bamliquidator', 'bamliquidator_batch.py', 'fimo'):
        _require_executable(tool)
        print('found %s' % (tool))


#==========================================================================
#==================================THE END=================================
#==========================================================================


# Run the tutorial set-up only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()