-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathmyParallel.py
More file actions
188 lines (175 loc) · 6.81 KB
/
myParallel.py
File metadata and controls
188 lines (175 loc) · 6.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 5 10:12:34 2015
@author: yuan
"""
#standard modules
#import multiprocessing as mp # multi process
#import multiprocessing.dummy as mpd #multi-threading
import os
import re
#personal modules
import myDict
import myIO
#class
#class:samples
class samples:
    """Map raw FASTQ files to sample names, result directories and groups.

    The instance works on the parameter dictionary ``par`` handed to the
    constructor.  Keys read: ``dir_raw_data``, ``file_sample_info`` and
    ``dir_result_array`` (defaulting to ``dir_result``).  Keys written by
    ``export_sample_info``: ``raw_to_sample``, ``sample_to_raw``,
    ``sample_names``, ``sample_dirs`` and ``group1``, ``group2``, ...
    """

    def __init__(self, par):
        """Keep a reference to ``par`` and create the empty lookup tables.

        ``par`` is not copied, so every update made here is visible to the
        caller.  When ``dir_result`` is present but ``dir_result_array`` is
        not, the former is reused for the latter.
        """
        self.par = par
        if 'dir_result' in self.par and 'dir_result_array' not in self.par:
            self.par['dir_result_array'] = self.par['dir_result']
        self.raw_sample = {}    # raw file -> sample name
        self.sample_raw = {}    # sample name -> list of raw files
        self.sample_names = []  # sorted sample names, set by export_sample_info
        self.sample_dirs = {}   # sample name -> result directory of the sample

    def _add_pair(self, raw_file, sample_name):
        """Record one raw file / sample name pair in both lookup tables."""
        self.raw_sample[raw_file] = sample_name
        self.sample_raw.setdefault(sample_name, []).append(raw_file)

    def seek_fq(self, dir_raw_data):
        """Return a list of the files under ``dir_raw_data`` that end in
        ``.fastq`` or ``.fq``."""
        print('Retrieve all *.fastq files under', dir_raw_data)
        # NOTE(review): recrusive_files() is a project helper from myIO;
        # presumably it lists every file path below the directory -- confirm.
        all_files = myIO.dir_os(dir_raw_data).recrusive_files()
        # The dot is required so a name such as "notes_fq" is not picked up
        # as raw data (the former regex 'fastq$|fq$' accepted it).
        return [af for af in all_files if af.endswith(('.fastq', '.fq'))]

    #sample names based on raw files
    def raw_to_samples(self):
        """Fill ``raw_sample`` / ``sample_raw`` using names derived from the
        raw files found under ``par['dir_raw_data']``."""
        for raw_file in self.seek_fq(self.par['dir_raw_data']):
            # NOTE(review): name_prefix() comes from myIO; presumably the
            # file name without its extension -- confirm.
            sample_name = myIO.file_os(raw_file).name_prefix()
            self._add_pair(raw_file, sample_name)

    #sample names based on sample_info.csv
    def file_to_samples(self):
        """Fill ``raw_sample`` / ``sample_raw`` from ``par['file_sample_info']``.

        Each line of that comma-separated file supplies a raw-file-name prefix
        in column 1 and a sample name in column 2.  Every raw file whose file
        name starts with the prefix is linked to the sample.  Lines with fewer
        than two fields (e.g. blank lines) are ignored.
        """
        raw_files = self.seek_fq(self.par['dir_raw_data'])
        print('Number of raw files:', len(raw_files))
        print('Read sample file: ', self.par['file_sample_info'])
        # File names are resolved once here rather than once per line.
        named_files = [(raw_file, myIO.file_os(raw_file).file_name())
                       for raw_file in raw_files]
        with open(self.par['file_sample_info'], 'rt') as in_obj:
            for line in in_obj:
                items = line.rstrip("\n").split(',')
                if len(items) < 2:
                    # nothing to map; previously this raised IndexError
                    continue
                raw_file_name, sample_name = items[0], items[1]
                for raw_file, file_name in named_files:
                    if file_name.startswith(raw_file_name):
                        self._add_pair(raw_file, sample_name)

    def sample_info(self):
        """Write ``par['file_sample_info']`` from the current ``raw_sample``.

        One record per raw file: file name, sample name with every '_R1'
        removed, and a group that is 'NC' when the upper-cased file name
        contains 'BEADS' and 'PhIP' otherwise.
        """
        sample_pairs = {}
        for raw_file, sample_name in self.raw_sample.items():
            raw_file_name = myIO.file_os(raw_file).file_name()
            # NOTE(review): the indentation of the source was lost; leaving
            # out the whole record for 'unassigned' files is the presumed
            # intent -- confirm.
            if 'unassigned' in raw_file_name:
                continue
            group = 'NC' if 'BEADS' in raw_file_name.upper() else 'PhIP'
            pair = '{},{}'.format(raw_file_name,
                                  sample_name.replace('_R1', ''))
            sample_pairs[pair] = group
        #export dict to file
        print('Generate sample file: ', self.par['file_sample_info'])
        #order per record: fastq file name, sample_name, phip_group
        myDict.basic(sample_pairs).dict_to_file(self.par['file_sample_info'], ',')

    #storage given sample_dir
    def sample_storage(self):
        """Choose a result directory for every sample in ``sample_raw``.

        ``par['dir_result_array']`` is a comma-separated list of base
        directories; each entry is concatenated directly with the sample name,
        so it has to end with a path separator.  A sample whose directory
        already exists under one of the bases keeps it.  The remaining samples
        are spread over the bases starting with the last one and wrapping
        around.  Results are stored in ``sample_dirs``.
        """
        result_dirs = self.par['dir_result_array'].split(',')
        # Same hand-out order as popping from a repeated copy of result_dirs,
        # but it cannot run dry when sample_names has not been filled yet.
        handout_order = result_dirs[::-1]
        handed_out = 0
        for sample_name in self.sample_raw:
            for result_dir in result_dirs:
                sample_dir = result_dir + sample_name + '/'
                if os.path.isdir(sample_dir):
                    break
            else:
                # no existing directory found: assign the next base in turn
                base_dir = handout_order[handed_out % len(handout_order)]
                handed_out += 1
                sample_dir = base_dir + sample_name + '/'
            self.sample_dirs[sample_name] = sample_dir

    def group_names(self, col=3):
        """Read the grouping stored in column ``col`` of the sample file.

        ``col`` is 1-based and values below 3 are treated as 3.  Column 2 of
        ``par['file_sample_info']`` supplies the sample name.  Blank lines are
        skipped; reading stops at the first line with fewer than ``col``
        fields.

        Returns a tuple ``(group_samples, sample_groups)``: group name ->
        comma-joined sample names, and sample name -> group name.
        """
        col = max(col, 3)
        group_samples = {}
        sample_groups = {}
        with open(self.par['file_sample_info'], 'rt') as in_obj:
            for line in in_obj:
                line = line.rstrip("\n")
                if not line.strip():
                    continue
                items = line.split(',')
                if len(items) < col:
                    print('No group name in Column ', col)
                    break
                sample_name, group_name = items[1], items[col - 1]
                sample_groups[sample_name] = group_name
                if group_name in group_samples:
                    group_samples[group_name] += ',' + sample_name
                else:
                    group_samples[group_name] = sample_name
        return group_samples, sample_groups

    #TOP function
    def export_sample_info(self):
        """Build all sample tables, store them in ``par`` and return ``par``.

        When ``par['file_sample_info']`` exists it is read; otherwise sample
        names are derived from the raw files and the sample file is generated.
        Group columns are then collected from column 3 onwards as
        ``par['group1']``, ``par['group2']``, ... until a column yields no
        groups.
        """
        print('get sample_raw and raw_sample')
        if os.path.isfile(self.par['file_sample_info']):
            self.file_to_samples()
        else:
            self.raw_to_samples()
            #generate sample file
            self.sample_info()
        print("\nSample and raw files:")
        myDict.basic(self.sample_raw).print_dict()
        self.par['raw_to_sample'] = self.raw_sample  # one raw file vs one sample name
        self.par['sample_to_raw'] = self.sample_raw  # one sample name vs a list of raw files
        #get sample names
        self.sample_names = sorted(self.sample_raw.keys())
        self.par['sample_names'] = self.sample_names
        #get sample_dirs
        self.sample_storage()
        self.par['sample_dirs'] = self.sample_dirs
        print('get group names if exists')
        group_no = 1
        while True:
            # group N lives in column N+2 of the sample file
            group_samples, _ = self.group_names(group_no + 2)
            if not group_samples:
                break
            key = 'group' + str(group_no)
            self.par[key] = group_samples
            print('Groups of {}: {}'.format(key, self.par[key].keys()))
            group_no += 1
        return self.par
########
#end