Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,20 @@ chr2 345678 456789 DUP
```
If you have any questions that aren't answered here, please [raise a new issue](https://github.com/andrewSharo/StrVCTVRE/issues)

## Install via conda

An alternative way to install StrVCTVRE is through conda. StrVCTVRE is available on bioconda, so it can be installed by running:

```
conda install -c bioconda strvctvre
```

When using the tool via conda, you will need to install the following additional files in order to run StrVCTVRE:

1. The phyloP conservation scores as described in the relevant [chapter](#3-download-phylop-conservation-scores-for-human-genome-38) above.
2. The random forest scores joblib file located [here](data/rfTrainedAllChromsPy3.joblib) in the StrVCTVRE repository. You can use `--scores /path/to/rfTrainedAllChromsPy3.joblib` to indicate the path to this file when running StrVCTVRE.
3. The exon transcript file located [here](data/exons_Appris_featurized_transcript_Chr1-Y_loeuf.sorted.bed) in the StrVCTVRE repository. You can use `--exon_transcripts /path/to/exons_Appris_featurized_transcript_Chr1-Y_loeuf.sorted.bed` to indicate the path to this file when running StrVCTVRE.

## Citation
If you use StrVCTVRE in your work, please cite:

Expand Down
11 changes: 9 additions & 2 deletions StrVCTVRE.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
metavar = 'path/to/hg38.phyloP100way.bw',dest='phylopPath')
# Genome build of the input; only the two listed assemblies are accepted.
parser.add_argument(
    '-a', '--assembly',
    dest='assembly',
    choices=['GRCh37', 'GRCh38'],
    default='GRCh38',
    help='Genome assembly of input, either GRCh38 or GRCh37',
)
# Optional at parse time; the GRCh37 requirement is enforced after parsing.
parser.add_argument(
    '-l', '--liftover',
    dest='pathLiftover',
    metavar='/path/to/liftover',
    required=False,
    help='Liftover executable path, required if assembly is GRCh37',
)
# Location of the exon annotation BED file (stored as args.exon_transcripts).
parser.add_argument(
    '-t', '--exon_transcripts',
    type=str,
    default='data/exons_Appris_featurized_transcript_Chr1-Y_loeuf.sorted.bed',
    help='Exon transcript file path, defaults to \'data/exons_Appris_featurized_transcript_Chr1-Y_loeuf.sorted.bed\' when not provided',
)
# Location of the serialized random forest (stored as args.scores).
parser.add_argument(
    '-s', '--scores',
    type=str,
    default='data/rfTrainedAllChromsPy3.joblib',
    help='Path to the joblib file containing random forest scores, defaults to \'data/rfTrainedAllChromsPy3.joblib\' when not provided',
)
# for testing
# args = parser.parse_args(['-i','/test/path/sept','-o','/test/output/sept'])

Expand All @@ -42,6 +44,11 @@
# --liftover is declared optional, so the GRCh37-only requirement is
# enforced here once the chosen assembly is known.
if args.assembly == 'GRCh37' and args.pathLiftover is None:
    parser.error("--assembly GRCh37 requires --liftover")

# Fail fast, with a usage message, when a required data file is missing.
# The previous condition (`path != '' or not isfile(path)`) was true for
# every non-empty path, so it rejected valid files too, including the
# defaults. os.path.isfile('') is False, so an empty path is still rejected.
if not os.path.isfile(args.exon_transcripts):
    parser.error('exon transcript file not found at ' + args.exon_transcripts + '. Provide an existing file using the -t or --exon_transcripts argument.')

if not os.path.isfile(args.scores):
    parser.error('random forest scores file not found at ' + args.scores + '. Provide an existing file using the -s or --scores argument.')

# Create temporary directory to store files created, deleted after finished running

Expand Down Expand Up @@ -196,7 +203,7 @@

# Progress message for the exon-overlap stage.
print('\nidentifying exonic deletions and duplications...\n')

# NOTE(review): the two assignments below look like the before/after pair of
# a diff hunk. As written both run in order and the second one wins, so the
# path given via --exon_transcripts is the one used for the intersection.
exons = pybedtools.BedTool('data/exons_Appris_featurized_transcript_Chr1-Y_loeuf.sorted.bed')
exons = pybedtools.BedTool(args.exon_transcripts)
# Dump SV coordinates plus their IDs as a headerless, tab-separated file
# under td (presumably the temporary working directory — confirm).
df[['chrom','start','end','OldID']].to_csv(os.path.join(td,'svs.bed'),sep='\t', index=False,header=False)
a = pybedtools.BedTool(os.path.join(td,'svs.bed'))
# Intersect SVs with exons; wa/wb are passed so each output row carries both
# the SV record and the overlapping exon record. The result is saved under td.
b = a.intersect(exons, wa=True, wb=True).saveas(os.path.join(td,'svsExonOverlap.bed'))
Expand Down Expand Up @@ -251,7 +258,7 @@
# X = an[an['chrom'] == chrm][['DEL','numExonsFinal','phyloP', 'lowestExonRank', 'allSkippable','lowestExonsInGene', 'anyConstExon','pLIMax','loeufMin', 'cdsFracStartMin', 'cdsFracEndMax', 'cdsFracMax', 'pLI_max25_ID', 'loeuf_min25_ID','topExp','topUsage','maxStrength']].copy()
# an.loc[an['chrom'] == chrm,'path'] = rf.predict_proba(X)[:,1]

# NOTE(review): the two loads below look like the before/after pair of a diff
# hunk. As written both run in order and the second one wins, so the model
# from --scores is the one used for prediction.
rf = load('data/rfTrainedAllChromsPy3.joblib')
rf = load(args.scores)
# Copy out the feature columns handed to the model.
X = an[['DEL','numExonsFinal','phyloP', 'lowestExonRank', 'allSkippable','lowestExonsInGene', 'anyConstExon','pLIMax','loeufMin', 'cdsFracStartMin', 'cdsFracEndMax', 'cdsFracMax', 'pLI_max25_ID', 'loeuf_min25_ID','topExp','topUsage','maxStrength']].copy()
# Keep the second column of predict_proba as the 'path' score (presumably the
# probability of the pathogenic class — confirm against the model's classes).
an['path'] = rf.predict_proba(X)[:,1]
# Re-index the annotated frame by the original variant ID.
an.set_index('OldID', inplace=True)
Expand Down