#!/usr/bin/env bash
# Shell Script for Biocomputing Bash Project Fall 2021
#
# Usage: bash BioCompProject.sh
#
# The script asks, on standard input, for five directories (in this order):
#   1. McrA reference genes      (files named mcrAgene*.fasta)
#   2. HSP70 reference genes     (files named hsp70gene*.fasta)
#   3. proteomes                 (files named proteome_01* .. proteome_50*)
#   4. the muscle executable
#   5. the hmmbuild and hmmsearch executables
#
# It then
#   - concatenates the reference sequences of each gene into one FASTA file,
#   - aligns each set with muscle and builds a profile with hmmbuild,
#   - searches every proteome with both profiles (hmmsearch) and records the
#     hit counts in genehitlist.csv (tab-separated, written to the current
#     working directory),
#   - stores up to five best candidate proteomes in UltimateHitList.txt.
#
# To make it easier on users, we suggest organizing the project's files as:
#   /bioinformaticsProject : /muscle, /ref_sequences, /BioCompProject.sh,
#                            /proteomes, /hmmer/bin/ : hmmbuild hmmsearch

# Stop on the first failing command, unset variable or failing pipeline stage
# instead of carrying on and producing meaningless hit counts.
set -euo pipefail

readonly PROTEOME_COUNT=50
readonly HIT_TABLE=genehitlist.csv
readonly RESULT_LIST=UltimateHitList.txt

# Print an error message to stderr and abort the script.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Read one directory path from stdin and print it on stdout.
# Arguments: $1 - description of the path, used only in error messages
# Aborts when no input is available or the answer is not a directory.
read_dir() {
  local reply=''
  # `read` also fails on a last line without trailing newline; accept that
  # case as long as something was actually read.
  read -r reply || [[ -n "$reply" ]] \
    || die "no path given for $1 (unexpected end of input)"
  [[ -d "$reply" ]] || die "$1 path is not a directory: $reply"
  printf '%s\n' "$reply"
}

# Concatenate the reference sequences of one gene, align them with muscle and
# build a profile from the alignment with hmmbuild.
# Globals:   muscle_dir, hmmer_dir (read)
# Arguments: $1 - directory holding the reference sequences
#            $2 - file name prefix of the reference FASTA files
#            $3 - name of the concatenated FASTA file to create in $1
#            $4 - name of the alignment file to create in $1
#            $5 - name of the profile file to create in $1
build_profile() {
  local ref_dir=$1 gene_prefix=$2 combined=$3 alignment=$4 profile=$5
  local -a references=("$ref_dir/$gene_prefix"*.fasta)

  # Without a match the glob stays literal, so test the first element.
  [[ -e "${references[0]}" ]] \
    || die "no ${gene_prefix}*.fasta files found in $ref_dir"

  cat "${references[@]}" > "$ref_dir/$combined"
  "$muscle_dir/muscle" -in "$ref_dir/$combined" -out "$ref_dir/$alignment"
  "$hmmer_dir/hmmbuild" "$ref_dir/$profile" "$ref_dir/$alignment"
}

# Print the number of hits in an hmmsearch --tblout table, i.e. the number of
# lines that do not start with '#'.
# Arguments: $1 - path of the table file
count_hits() {
  local table=$1 hits status=0
  hits=$(grep -vc '^#' "$table") || status=$?
  # grep exits with 1 when it selects no line; that simply means zero hits.
  (( status <= 1 )) || die "cannot read hit table: $table"
  printf '%s\n' "$hits"
}

# The following code is getting the path of McrA gene, HSP70 gene and
# Proteomes from the user.
printf "Hi, I hope you are doing great.\nCould you please give me the path of McRA_Genes?\n"
printf "Example: ref_sequences directory is in Biocomp-Project, which is current working directory, enter './ref_sequences'\n"
printf "If any path is your current path, please enter only a '.'\n"
mcra_dir=$(read_dir 'McrA genes')

printf "Awesome, and also share the path of HSP70 genes?\n"
hsp70_dir=$(read_dir 'HSP70 genes')

printf "Great! Now kindly enter the path where proteomes are stored\n"
proteome_dir=$(read_dir 'proteomes')

printf "Thank you. Almost there...I promise. Please tell us the path where your Muscle program is stored.\n"
muscle_dir=$(read_dir 'muscle')

printf "Finally, could you list the path for your hmmbuild and hmmsearch?\n"
hmmer_dir=$(read_dir 'hmmbuild/hmmsearch')

printf 'The paths you have entered are: %s %s %s %s %s\n' \
  "$mcra_dir" "$hsp70_dir" "$proteome_dir" "$muscle_dir" "$hmmer_dir"

# Fail early, with a clear message, when a required program is missing.
for tool in "$muscle_dir/muscle" "$hmmer_dir/hmmbuild" "$hmmer_dir/hmmsearch"; do
  [[ -x "$tool" ]] || die "required program not found or not executable: $tool"
done

# Build one profile per gene. The profile files keep their historical
# ".fasta" names although hmmbuild writes HMM profiles, not FASTA.
build_profile "$mcra_dir" mcrAgene mcrAlist.fasta mcrAlignment.fasta mcrAbuild.fasta
build_profile "$hsp70_dir" hsp70gene hsp70list.fasta hsp70Alignment.fasta hsp70build.fasta

# Start a fresh table (truncating any previous one) with its header row.
printf 'Proteome\tMcrA Hits\tHSP70 Hits\n' > "$HIT_TABLE"

# Search every proteome with both profiles and record the hit counts.
# The zero-padded id is produced with printf because {01..50} is only padded
# by bash >= 4.
for (( number = 1; number <= PROTEOME_COUNT; number++ )); do
  printf -v id '%02d' "$number"

  proteomes=("$proteome_dir/proteome_$id"*)
  # hmmsearch accepts exactly one sequence file per run.
  if (( ${#proteomes[@]} != 1 )) || [[ ! -e "${proteomes[0]}" ]]; then
    die "expected exactly one file matching $proteome_dir/proteome_$id*"
  fi

  "$hmmer_dir/hmmsearch" --tblout mcrAsearch.fasta \
    "$mcra_dir/mcrAbuild.fasta" "${proteomes[0]}"
  mcra_hits=$(count_hits mcrAsearch.fasta)

  "$hmmer_dir/hmmsearch" --tblout hsp70search.fasta \
    "$hsp70_dir/hsp70build.fasta" "${proteomes[0]}"
  hsp70_hits=$(count_hits hsp70search.fasta)

  # The double tab before the HSP70 count is kept on purpose so the table
  # stays byte-compatible with previously generated genehitlist.csv files.
  printf 'Proteome_%s\t%s\t\t%s\n' "$id" "$mcra_hits" "$hsp70_hits" >> "$HIT_TABLE"
done

# The following pipeline traverses genehitlist.csv to find up to five
# candidates that are methanogens with pH resistance qualities:
#   1. skip the header row and discard every proteome with 0 McrA hits,
#   2. sort the remaining proteomes by their number of HSP70 hits, highest
#      first (with sort's default blank separation, field 3 up to the end of
#      the line is the HSP70 count; ties fall back to reverse line order),
#   3. keep the names of the first five rows (awk reads its whole input, so
#      no earlier stage is cut off by a closed pipe under pipefail).
awk -F '\t' 'NR > 1 && $2 > 0' "$HIT_TABLE" \
  | sort -k 3 -nr \
  | awk -F '\t' 'NR <= 5 { print $1 }' > "$RESULT_LIST"

printf '%s\n' "This is the ultimate list of Proteomes to use for your experiment...Good luck!"
+cat UltimateHitList.txt + diff --git a/bioinformaticsProject/BioinformaticsProjectPrompt2021.pdf b/bioinformaticsProject/BioinformaticsProjectPrompt2021.pdf deleted file mode 100644 index 13ab712..0000000 Binary files a/bioinformaticsProject/BioinformaticsProjectPrompt2021.pdf and /dev/null differ diff --git a/bioinformaticsProject/UltimateHitList.txt b/bioinformaticsProject/UltimateHitList.txt new file mode 100644 index 0000000..266d057 --- /dev/null +++ b/bioinformaticsProject/UltimateHitList.txt @@ -0,0 +1,5 @@ +Proteome_50 +Proteome_45 +Proteome_42 +Proteome_03 +Proteome_24 diff --git a/bioinformaticsProject/genehitlist.csv b/bioinformaticsProject/genehitlist.csv new file mode 100644 index 0000000..eea9a6a --- /dev/null +++ b/bioinformaticsProject/genehitlist.csv @@ -0,0 +1,51 @@ +Proteome McrA Hits HSP70 Hits +Proteome_01 0 4 +Proteome_02 0 2 +Proteome_03 1 3 +Proteome_04 0 4 +Proteome_05 1 2 +Proteome_06 0 0 +Proteome_07 1 2 +Proteome_08 0 5 +Proteome_09 0 1 +Proteome_10 0 3 +Proteome_11 0 6 +Proteome_12 0 6 +Proteome_13 0 3 +Proteome_14 0 2 +Proteome_15 1 1 +Proteome_16 1 1 +Proteome_17 0 4 +Proteome_18 0 8 +Proteome_19 2 1 +Proteome_20 0 3 +Proteome_21 0 5 +Proteome_22 0 9 +Proteome_23 2 2 +Proteome_24 1 2 +Proteome_25 0 5 +Proteome_26 0 1 +Proteome_27 0 1 +Proteome_28 0 1 +Proteome_29 1 0 +Proteome_30 0 1 +Proteome_31 0 7 +Proteome_32 0 4 +Proteome_33 0 0 +Proteome_34 0 2 +Proteome_35 0 1 +Proteome_36 0 3 +Proteome_37 0 1 +Proteome_38 1 1 +Proteome_39 1 1 +Proteome_40 0 2 +Proteome_41 0 1 +Proteome_42 1 3 +Proteome_43 0 3 +Proteome_44 1 1 +Proteome_45 1 3 +Proteome_46 0 2 +Proteome_47 0 1 +Proteome_48 1 1 +Proteome_49 0 3 +Proteome_50 1 3 diff --git a/bioinformaticsProject/hmmer/bin/alimask b/bioinformaticsProject/hmmer/bin/alimask new file mode 100755 index 0000000..dcb30d6 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/alimask differ diff --git a/bioinformaticsProject/hmmer/bin/hmmalign b/bioinformaticsProject/hmmer/bin/hmmalign new file 
mode 100755 index 0000000..4f87bc5 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmalign differ diff --git a/bioinformaticsProject/hmmer/bin/hmmbuild b/bioinformaticsProject/hmmer/bin/hmmbuild new file mode 100755 index 0000000..dd86a8c Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmbuild differ diff --git a/bioinformaticsProject/hmmer/bin/hmmconvert b/bioinformaticsProject/hmmer/bin/hmmconvert new file mode 100755 index 0000000..425fbf3 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmconvert differ diff --git a/bioinformaticsProject/hmmer/bin/hmmemit b/bioinformaticsProject/hmmer/bin/hmmemit new file mode 100755 index 0000000..beca368 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmemit differ diff --git a/bioinformaticsProject/hmmer/bin/hmmfetch b/bioinformaticsProject/hmmer/bin/hmmfetch new file mode 100755 index 0000000..cef6083 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmfetch differ diff --git a/bioinformaticsProject/hmmer/bin/hmmlogo b/bioinformaticsProject/hmmer/bin/hmmlogo new file mode 100755 index 0000000..6a1d0b8 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmlogo differ diff --git a/bioinformaticsProject/hmmer/bin/hmmpgmd b/bioinformaticsProject/hmmer/bin/hmmpgmd new file mode 100755 index 0000000..5260293 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmpgmd differ diff --git a/bioinformaticsProject/hmmer/bin/hmmpgmd_shard b/bioinformaticsProject/hmmer/bin/hmmpgmd_shard new file mode 100755 index 0000000..7f306ba Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmpgmd_shard differ diff --git a/bioinformaticsProject/hmmer/bin/hmmpress b/bioinformaticsProject/hmmer/bin/hmmpress new file mode 100755 index 0000000..e374ce2 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmpress differ diff --git a/bioinformaticsProject/hmmer/bin/hmmscan b/bioinformaticsProject/hmmer/bin/hmmscan new file mode 100755 index 
0000000..abfdd7f Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmscan differ diff --git a/bioinformaticsProject/hmmer/bin/hmmsearch b/bioinformaticsProject/hmmer/bin/hmmsearch new file mode 100755 index 0000000..4b3218a Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmsearch differ diff --git a/bioinformaticsProject/hmmer/bin/hmmsim b/bioinformaticsProject/hmmer/bin/hmmsim new file mode 100755 index 0000000..8d2c80d Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmsim differ diff --git a/bioinformaticsProject/hmmer/bin/hmmstat b/bioinformaticsProject/hmmer/bin/hmmstat new file mode 100755 index 0000000..9f20319 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/hmmstat differ diff --git a/bioinformaticsProject/hmmer/bin/jackhmmer b/bioinformaticsProject/hmmer/bin/jackhmmer new file mode 100755 index 0000000..6fa9681 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/jackhmmer differ diff --git a/bioinformaticsProject/hmmer/bin/makehmmerdb b/bioinformaticsProject/hmmer/bin/makehmmerdb new file mode 100755 index 0000000..42c3eaf Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/makehmmerdb differ diff --git a/bioinformaticsProject/hmmer/bin/nhmmer b/bioinformaticsProject/hmmer/bin/nhmmer new file mode 100755 index 0000000..68fe397 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/nhmmer differ diff --git a/bioinformaticsProject/hmmer/bin/nhmmscan b/bioinformaticsProject/hmmer/bin/nhmmscan new file mode 100755 index 0000000..bea5226 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/nhmmscan differ diff --git a/bioinformaticsProject/hmmer/bin/phmmer b/bioinformaticsProject/hmmer/bin/phmmer new file mode 100755 index 0000000..5be4bd7 Binary files /dev/null and b/bioinformaticsProject/hmmer/bin/phmmer differ diff --git a/bioinformaticsProject/hmmer/share/man/man1/alimask.1 b/bioinformaticsProject/hmmer/share/man/man1/alimask.1 new file mode 100644 index 0000000..9b20860 --- 
/dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/alimask.1 @@ -0,0 +1,340 @@ +.TH "alimask" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +alimask \- calculate and add column mask to a multiple sequence alignment + +.SH SYNOPSIS +.B alimask +[\fIoptions\fR] +.I msafile +.I postmsafile + + +.SH DESCRIPTION + +.PP +.B alimask +is used to apply a mask line to a multiple sequence +alignment, based on provided alignment or model coordinates. +When +.B hmmbuild +receives a masked alignment as input, it produces a profile +model in which the emission probabilities at masked positions +are set to match the background frequency, rather than being +set based on observed frequencies in the alignment. +Position-specific insertion and deletion rates are not +altered, even in masked regions. +.B alimask +autodetects input format, and produces masked alignments +in Stockholm format. +.I msafile +may contain only one sequence alignment. + +.PP +A common motivation for masking a region in an alignment is +that the region contains a simple tandem repeat that is +observed to cause an unacceptably high rate of false positive +hits. + +.PP +In the simplest case, a mask range is given in coordinates +relative to the input alignment, using +.BI \-\-alirange " ". +However it is more often the case that the region to be +masked has been identified in coordinates relative to +the profile model (e.g. based on recognizing a simple +repeat pattern in false hit alignments or in the HMM logo). +Not all alignment columns are converted to match state +positions in the profile (see the +.B \-\-symfrac +flag for +.B hmmbuild +for discussion), so model positions do not necessarily match +up to alignment column positions. +To remove the burden of converting model positions to +alignment positions, +.B alimask +accepts the mask range input in model coordinates as well, +using +.BI \-\-modelrange " ". 
+When using this flag, +.I alimask +determines which alignment positions would be identified by +.B hmmbuild +as match states, a process that requires that all +.B hmmbuild +flags impacting that decision be supplied to +.BR alimask . +It is for this reason that many of the +.B hmmbuild +flags are also used by +.BR alimask . + + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + +.TP +.BI \-o " " +Direct the summary output to file +.IR , +rather than to stdout. + + +.SH OPTIONS FOR SPECIFYING MASK RANGE + +A single mask range is given as a dash-separated pair, like +.B "\-\-modelrange 10\-20" +and multiple ranges may be submitted as a comma-separated list, +.BR "\-\-modelrange 10\-20,30\-42" . + + +.TP +.BI \-\-modelrange " " +Supply the given range(s) in model coordinates. + +.TP +.BI \-\-alirange " " +Supply the given range(s) in alignment coordinates. + +.TP +.B \-\-apendmask +Add to the existing mask found with the alignment. +The default is to overwrite any existing mask. + +.TP +.BI \-\-model2ali " " +Rather than actually produce the masked alignment, simply +print model range(s) corresponding to input alignment +range(s). + +.TP +.BI \-\-ali2model " " +Rather than actually produce the masked alignment, simply +print alignment range(s) corresponding to input model +range(s). + + +.SH OPTIONS FOR SPECIFYING THE ALPHABET + +.TP +.B \-\-amino +Assert that sequences in +.I msafile +are protein, bypassing alphabet autodetection. + +.TP +.B \-\-dna +Assert that sequences in +.I msafile +are DNA, bypassing alphabet autodetection. + +.TP +.B \-\-rna +Assert that sequences in +.I msafile +are RNA, bypassing alphabet autodetection. + + + +.SH OPTIONS CONTROLLING PROFILE CONSTRUCTION + +These options control how consensus columns are defined in an alignment. + +.TP +.B \-\-fast +Define consensus columns as those that have a fraction >= +.B symfrac +of residues as opposed to gaps. 
(See below for the +.B \-\-symfrac +option.) This is the default. + +.TP +.B \-\-hand +Define consensus columns in next profile using reference annotation to +the multiple alignment. +This allows you to define any consensus columns you like. + +.TP +.BI \-\-symfrac " " +Define the residue fraction threshold necessary to define a +consensus column when using the +.B \-\-fast +option. The default is 0.5. The symbol fraction in each column +is calculated after taking relative sequence weighting into account, +and ignoring gap characters corresponding to ends of sequence +fragments +(as opposed to internal insertions/deletions). +Setting this to 0.0 means that every alignment column will be assigned +as consensus, which may be useful in some cases. Setting it to 1.0 +means that only columns that include 0 gaps (internal +insertions/deletions) will be assigned as consensus. + +.TP +.BI \-\-fragthresh " " +We only want to count terminal gaps as deletions if the aligned +sequence is known to be full-length, not if it is a fragment (for +instance, because only part of it was sequenced). HMMER uses a simple +rule to infer fragments: if the sequence length L is less than +or equal to a fraction +.I +times the alignment length in columns, +then the sequence is handled as a fragment. The default is 0.5. +Setting +.B \-\-fragthresh 0 +will define no (nonempty) sequence as a fragment; you might want to do +this if you know you've got a carefully curated alignment of full-length +sequences. +Setting +.B \-\-fragthresh 1 +will define all sequences as fragments; you might want to do this if +you know your alignment is entirely composed of fragments, such as +translated short reads in metagenomic shotgun data. + + +.SH OPTIONS CONTROLLING RELATIVE WEIGHTS + +HMMER uses an ad hoc sequence weighting algorithm to downweight +closely related sequences and upweight distantly related ones. This +has the effect of making models less biased by uneven phylogenetic +representation. 
For example, two identical sequences would typically +each receive half the weight that one sequence would. These options +control which algorithm gets used. + +.TP +.B \-\-wpb +Use the Henikoff position-based sequence weighting scheme [Henikoff +and Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default. + +.TP +.B \-\-wgsc +Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et +al, J. Mol. Biol. 235:1067, 1994]. + +.TP +.B \-\-wblosum +Use the same clustering scheme that was used to weight data in +calculating BLOSUM substitution matrices [Henikoff and Henikoff, +Proc. Natl. Acad. Sci 89:10915, 1992]. Sequences are single-linkage +clustered at an identity threshold (default 0.62; see +.BR \-\-wid ) +and within each cluster of c sequences, each sequence gets relative +weight 1/c. + +.TP +.B \-\-wnone +No relative weights. All sequences are assigned uniform weight. + +.TP +.BI \-\-wid " " +Sets the identity threshold used by single-linkage clustering when +using +.BR \-\-wblosum . +Invalid with any other weighting scheme. Default is 0.62. + + + + + +.SH OTHER OPTIONS + +.TP +.BI \-\-informat " " +Assert that input +.I msafile +is in alignment format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). + + +.TP +.BI \-\-outformat " " +Write the output +.I postmsafile +in alignment format +.IR . +Common choices for +.I +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +The string +.I +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). +Default is +.BR stockholm . + + +.TP +.BI \-\-seed " " +Seed the random number generator with +.IR , +an integer >= 0. 
+If +.I +is nonzero, any stochastic simulations will be reproducible; the same +command will give the same results. +If +.I +is 0, the random number generator is seeded arbitrarily, and +stochastic simulations will vary from run to run of the same command. +The default seed is 42. + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmalign.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmalign.1 new file mode 100644 index 0000000..a18006f --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmalign.1 @@ -0,0 +1,185 @@ +.TH "hmmalign" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmalign \- align sequences to a profile + +.SH SYNOPSIS +.B hmmalign +[\fIoptions\fR] +.I hmmfile +.I seqfile + +.SH DESCRIPTION + +.PP +Perform a multiple sequence alignment of all the sequences in +.I seqfile +by aligning them individually to the profile HMM in +.I hmmfile. +The new alignment is output to stdout. + +.PP +The +.I hmmfile +should contain only a single profile. If it contains more, only the +first profile in the file will be used. + +.PP +Either +.I hmmfile +or +.I seqfile +(but not both) may be '\-' (dash), which +means reading this input from stdin rather than a file. + +.PP +The sequences in +.I seqfile +are aligned in unihit local alignment mode. 
Therefore they should +already be known to contain only a single domain (or a fragment of one). +The optimal alignment +may assign some residues as nonhomologous (N and C states), in which +case these residues are still included in the resulting alignment, but +shoved to the outer edges. To trim these unaligned nonhomologous residues from +the result, see the +.B \-\-trim +option. + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + +.TP +.BI \-o " " +Direct the output alignment to file +.I , +rather than to stdout. + +.TP +.BI \-\-mapali " " +Merge the existing alignment in file +.I +into the result, where +.I +is exactly the same alignment that was used to build the model in +.I hmmfile. +This is done using a map of alignment columns to consensus +profile positions that is stored in the +.I hmmfile. +The multiple alignment in +.I +will be exactly reproduced in its consensus columns (as defined by the +profile), but the displayed alignment in insert columns may be +altered, because insertions relative to a profile are considered by +convention to be unaligned data. + + +.TP +.B \-\-trim +Trim nonhomologous residues (assigned to N and C states in the optimal +alignments) from the resulting multiple alignment output. + +.TP +.B \-\-amino +Assert that sequences in +.I seqfile +are protein, bypassing alphabet autodetection. + +.TP +.B \-\-dna +Assert that sequences in +.I seqfile +are DNA, bypassing alphabet autodetection. + +.TP +.B \-\-rna +Assert that sequences in +.I seqfile +are RNA, bypassing alphabet autodetection. + + +.TP +.BI \-\-informat " " +Assert that input +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . 
+For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + +.TP +.BI \-\-outformat " " +Write the output alignment +in format +.IR . +Common choices for +.I +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +The string +.I +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). +Default is +.BR stockholm . + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmbuild.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmbuild.1 new file mode 100644 index 0000000..bf2d7d1 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmbuild.1 @@ -0,0 +1,556 @@ +.TH "hmmbuild" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmbuild \- construct profiles from multiple sequence alignments + +.SH SYNOPSIS +.B hmmbuild +[\fIoptions\fR] +.I hmmfile +.I msafile + + +.SH DESCRIPTION + +For each multiple sequence alignment in +.I msafile +build a profile HMM +and save it to a new file +.IR hmmfile . + + +.PP +.I msafile +may be '\-' (dash), which means +reading this input from stdin rather than a file. + + +.PP +.I hmmfile +may not be '\-' (stdout), because sending the HMM file to stdout would +conflict with the other text output of the program. 
+ + + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + +.TP +.BI \-n " " +Name the new profile +.IR . +The default is to use the name of the alignment (if one is present in +the +.IR msafile , +or, failing that, the name of the +.IR hmmfile . +If +.I msafile +contains more than one alignment, +.B \-n +doesn't work, and every alignment must have a name +annotated in the +.I msafile +(as in Stockholm #=GF ID annotation). + + +.TP +.BI \-o " " +Direct the summary output to file +.IR , +rather than to stdout. + +.TP +.BI \-O " " +After each model is constructed, resave annotated, possibly modified +source alignments to a file +.I +in Stockholm format. +The alignments are annotated with a reference annotation line +indicating which columns were assigned as consensus, and sequences are +annotated with what relative sequence weights were assigned. Some +residues of the alignment may have been shifted to accommodate +restrictions of the Plan7 profile architecture, which disallows +transitions between insert and delete states. + + +.SH OPTIONS FOR SPECIFYING THE ALPHABET + +.TP +.B \-\-amino +Assert that sequences in +.I msafile +are protein, bypassing alphabet autodetection. + +.TP +.B \-\-dna +Assert that sequences in +.I msafile +are DNA, bypassing alphabet autodetection. + +.TP +.B \-\-rna +Assert that sequences in +.I msafile +are RNA, bypassing alphabet autodetection. + +.SH OPTIONS CONTROLLING PROFILE CONSTRUCTION + +These options control how consensus columns are defined in an alignment. + +.TP +.B \-\-fast +Define consensus columns as those that have a fraction >= +.B symfrac +of residues as opposed to gaps. (See below for the +.B \-\-symfrac +option.) This is the default. + +.TP +.B \-\-hand +Define consensus columns in next profile using reference annotation to +the multiple alignment. +This allows you to define any consensus columns you like. 
+ +.TP +.BI \-\-symfrac " " +Define the residue fraction threshold necessary to define a +consensus column when using the +.B \-\-fast +option. The default is 0.5. The symbol fraction in each column +is calculated after taking relative sequence weighting into account, +and ignoring gap characters corresponding to ends of sequence +fragments +(as opposed to internal insertions/deletions). +Setting this to 0.0 means that every alignment column will be assigned +as consensus, which may be useful in some cases. Setting it to 1.0 +means that only columns that include 0 gaps (internal +insertions/deletions) will be assigned as consensus. + +.TP +.BI \-\-fragthresh " " +We only want to count terminal gaps as deletions if the aligned +sequence is known to be full-length, not if it is a fragment (for +instance, because only part of it was sequenced). HMMER uses a simple +rule to infer fragments: if the range of a sequence in the alignment +(the number of alignment columns between the first and last positions +of the sequence) is less than or equal to a fraction +.I +times the alignment length in columns, +then the sequence is handled as a fragment. The default is 0.5. +Setting +.B \-\-fragthresh 0 +will define no (nonempty) sequence as a fragment; you might want to do +this if you know you've got a carefully curated alignment of full\-length +sequences. +Setting +.B \-\-fragthresh 1 +will define all sequences as fragments; you might want to do this if +you know your alignment is entirely composed of fragments, such as +translated short reads in metagenomic shotgun data. + + +.SH OPTIONS CONTROLLING RELATIVE WEIGHTS + +HMMER uses an ad hoc sequence weighting algorithm to downweight +closely related sequences and upweight distantly related ones. This +has the effect of making models less biased by uneven phylogenetic +representation. For example, two identical sequences would typically +each receive half the weight that one sequence would. 
These options +control which algorithm gets used. + +.TP +.B \-\-wpb +Use the Henikoff position-based sequence weighting scheme [Henikoff +and Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default. + +.TP +.B \-\-wgsc +Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et +al, J. Mol. Biol. 235:1067, 1994]. + +.TP +.B \-\-wblosum +Use the same clustering scheme that was used to weight data in +calculating BLOSUM substitution matrices [Henikoff and Henikoff, +Proc. Natl. Acad. Sci 89:10915, 1992]. Sequences are single-linkage +clustered at an identity threshold (default 0.62; see +.BR \-\-wid ) +and within each cluster of c sequences, each sequence gets relative +weight 1/c. + +.TP +.B \-\-wnone +No relative weights. All sequences are assigned uniform weight. + +.TP +.BI \-\-wid " " +Sets the identity threshold used by single-linkage clustering when +using +.BR \-\-wblosum . +Invalid with any other weighting scheme. Default is 0.62. + + + + +.SH OPTIONS CONTROLLING EFFECTIVE SEQUENCE NUMBER + +After relative weights are determined, they are normalized to sum to a +total effective sequence number, +.IR eff_nseq . +This number may be the actual number of sequences in the alignment, +but it is almost always smaller than that. +The default entropy weighting method +(\fB\-\-eent\fR) +reduces the effective sequence +number to reduce the information content (relative entropy, or average +expected score on true homologs) per consensus position. The target +relative entropy is controlled by a two-parameter function, where the +two parameters are settable with +.B \-\-ere +and +.BR \-\-esigma . + +.TP +.B \-\-eent +Adjust effective sequence number to achieve a specific relative entropy +per position (see +.BR \-\-ere ). +This is the default. + +.TP +.B \-\-eclust +Set effective sequence number to the number of single-linkage clusters +at a specific identity threshold (see +.BR \-\-eid ). 
+This option is not recommended; it's for experiments evaluating +how much better +.B \-\-eent +is. + +.TP +.B \-\-enone +Turn off effective sequence number determination and just use the +actual number of sequences. One reason you might want to do this is +to try to maximize the relative entropy/position of your model, which +may be useful for short models. + +.TP +.BI \-\-eset " " +Explicitly set the effective sequence number for all models to +.IR . + +.TP +.BI \-\-ere " " +Set the minimum relative entropy/position target to +.IR . +Requires +.BR \-\-eent . +Default depends on the sequence alphabet. For protein +sequences, it is 0.59 bits/position; for nucleotide +sequences, it is 0.45 bits/position. + +.TP +.BI \-\-esigma " " +Sets the minimum relative entropy contributed by an entire +model alignment, over its whole length. This has the effect +of making short models have +higher relative entropy per position than +.B \-\-ere +alone would give. The default is 45.0 bits. + +.TP +.BI \-\-eid " " +Sets the fractional pairwise identity cutoff used by +single linkage clustering with the +.B \-\-eclust +option. The default is 0.62. + + +.SH OPTIONS CONTROLLING PRIORS + +By default, weighted counts are converted to mean posterior +probability parameter estimates using mixture Dirichlet priors. +Default mixture Dirichlet prior parameters for protein models and for +nucleic acid (RNA and DNA) models are built in. The following options +allow you to override the default priors. + +.TP +.B \-\-pnone +Don't use any priors. Probability parameters will simply be the +observed frequencies, after relative sequence weighting. + +.TP +.B \-\-plaplace +Use a Laplace +1 prior in place of the default mixture Dirichlet +prior. 
+ + + + +.SH OPTIONS CONTROLLING SINGLE SEQUENCE SCORING + +By default, if a query is a single sequence from a file in +.IR fasta +format, +.B hmmbuild +constructs a search model from that sequence and a standard +20x20 substitution matrix for residue probabilities, along with two +additional parameters for position-independent gap open and gap extend +probabilities. These options allow the default single-sequence scoring +parameters to be changed, and for single-sequence scoring options to +be applied to a single sequence coming from an aligned format. + +.TP +.BI \-\-singlemx +If a single sequence query comes from a multiple sequence alignment file, +such as in +.IR stockholm +format, the search model is by default constructed as is typically done +for multiple sequence alignments. This option forces +.B hmmbuild +to use the single-sequence method with substitution score matrix. + +.TP +.BI \-\-mx " " +Obtain residue alignment probabilities from the built-in +substitution matrix named +.IR . +Several standard matrices are built-in, and do not need to be +read from files. +The matrix name +.I +can be +PAM30, PAM70, PAM120, PAM240, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, +BLOSUM90, or DNA1. +Only one of the +.B \-\-mx +and +.B \-\-mxfile +options may be used. + +.TP +.BI \-\-mxfile " " +Obtain residue alignment probabilities from the substitution matrix +in file +.IR . +The default score matrix is BLOSUM62 for protein sequences, and +DNA1 for nucleotide sequences (these matrices are internal to +HMMER and do not need to be available as a file). +The format of a substitution matrix +.I +is the standard format accepted by BLAST, FASTA, and other sequence +analysis software. +See ftp.ncbi.nlm.nih.gov/blast/matrices/ for example files. (The only +exception: we require matrices to be square, so for DNA, use files +like NCBI's NUC.4.4, not NUC.4.2.) + +.TP +.BI \-\-popen " " +Set the gap open probability for a single sequence query model to +.IR . +The default is 0.02. 
+.I +must be >= 0 and < 0.5. + +.TP +.BI \-\-pextend " " +Set the gap extend probability for a single sequence query model to +.IR . +The default is 0.4. +.I +must be >= 0 and < 1.0. + + +.SH OPTIONS CONTROLLING E-VALUE CALIBRATION + +The location parameters for the expected score distributions for MSV +filter scores, Viterbi filter scores, and Forward scores require three +short random sequence simulations. + +.TP +.BI \-\-EmL " " +Sets the sequence length in simulation that estimates the location +parameter mu for MSV filter E-values. Default is 200. + +.TP +.BI \-\-EmN " " +Sets the number of sequences in simulation that estimates the location +parameter mu for MSV filter E-values. Default is 200. + +.TP +.BI \-\-EvL " " +Sets the sequence length in simulation that estimates the location +parameter mu for Viterbi filter E-values. Default is 200. + +.TP +.BI \-\-EvN " " +Sets the number of sequences in simulation that estimates the location +parameter mu for Viterbi filter E-values. Default is 200. + +.TP +.BI \-\-EfL " " +Sets the sequence length in simulation that estimates the location +parameter tau for Forward E-values. Default is 100. + +.TP +.BI \-\-EfN " " +Sets the number of sequences in simulation that estimates the location +parameter tau for Forward E-values. Default is 200. + +.TP +.BI \-\-Eft " " +Sets the tail mass fraction to fit in the simulation that estimates +the location parameter tau for Forward evalues. Default is 0.04. + + +.SH OTHER OPTIONS + +.TP +.BI \-\-cpu " " +Set the number of parallel worker threads to +.IR . +On multicore machines, the default is 2. +You can also control this number by setting an environment variable, +.IR HMMER_NCPU . +There is also a master thread, so the actual number of threads that +HMMER spawns is +.IR +1. + +This option is not available if HMMER was compiled with POSIX threads +support turned off. 
+ + + +.TP +.BI \-\-informat " " +Assert that input +.I msafile +is in alignment format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). + + +.TP +.BI \-\-seed " " +Seed the random number generator with +.IR , +an integer >= 0. +If +.I +is nonzero, any stochastic simulations will be reproducible; the same +command will give the same results. +If +.I +is 0, the random number generator is seeded arbitrarily, and +stochastic simulations will vary from run to run of the same command. +The default seed is 42. + + +.TP +.BI \-\-w_beta " " +Window length tail mass. +The upper bound, +.IR W , +on the length at which nhmmer expects to find an instance of the +model is set such that the fraction of all sequences generated +by the model with length +.I ">= W" +is less than +.IR . +The default is 1e-7. + + + +.TP +.BI \-\-w_length " " +Override the model instance length upper bound, +.IR W , +which is otherwise controlled by +.BR \-\-w_beta . +It should be larger than the model length. The value of +.I W +is used deep in the acceleration pipeline, and modest changes +are not expected to impact results (though larger values of +.I W +do lead to longer run time). + + +.TP +.B \-\-mpi +Run as a parallel MPI program. Each alignment is assigned to a MPI +worker node for construction. (Therefore, the maximum parallelization +cannot exceed the number of alignments in the input +.IR msafile .) +This is useful when building large profile libraries. This option is +only available if optional MPI capability was enabled at compile-time. 
+ + +.TP +.B \-\-stall +For debugging MPI parallelization: arrest program execution +immediately after start, and wait for a debugger to attach to the +running process and release the arrest. + + +.TP +.BI \-\-maxinsertlen " " +Restrict insert length parameterization such that the expected +insert length at each position of the model is no more than +.IR . + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmc2.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmc2.1 new file mode 100644 index 0000000..364d2b6 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmc2.1 @@ -0,0 +1,78 @@ +.TH "hmmc2" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmc2 \- example client for the HMMER daemon + + +.SH SYNOPSIS +.B hmmc2 +[\fIoptions\fR] + + +.SH DESCRIPTION + +.PP +.B Hmmc2 +is a text client for the hmmpgmd or hmmpgmd_shard daemons. When run, it opens a connection to a daemon at the specified +IP address and port, and then enters an interactive loop waiting for the user to input commands to be sent to the daemon. +See the User's Guide for the HMMER Daemon for a discussion of hmmpgmd's command format. + + +.SH OPTIONS + +.TP +.B \-i +Specify the IP address of the daemon that hmmc2 should connect to. 
Defaults to 127.0.0.1 if not provided. + + +.TP +.B \-p +Specify the port number that the daemon is listening on. Defaults to 51371 if not provided. + + +.TP +.B \-S +Print the scores of any hits found during searches. + + +.TP +.B \-A +Print the alignment of any hits found during searches. This is a superset of the "-S" flag, so providing both is redundant. + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmconvert.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmconvert.1 new file mode 100644 index 0000000..057b997 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmconvert.1 @@ -0,0 +1,106 @@ +.TH "hmmconvert" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmconvert \- convert profile file to various formats + + +.SH SYNOPSIS +.B hmmconvert +[\fIoptions\fR] +.I hmmfile + + +.SH DESCRIPTION + +.PP +The +.B hmmconvert +utility +converts an input profile file to different HMMER formats. + +.PP +By default, the input profile can be in any HMMER format, including +old/obsolete formats from HMMER2, ASCII or binary; the output profile +is a current HMMER3 ASCII format. + +.PP +.I hmmfile +may be '\-' (dash), which means reading this input from stdin rather +than a file. + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. 
+ +.TP +.B \-a +Output profiles in ASCII text format. This is the default. + +.TP +.B \-b +Output profiles in binary format. + +.TP +.B \-2 +Output in legacy HMMER2 ASCII text format, in ls (glocal) mode. This +allows HMMER3 models to be converted back to a close approximation of +HMMER2, for comparative studies. + +.TP +.BI \-\-outfmt " <s>" +Output in a HMMER3 ASCII text format other than the most current one. +Valid choices for +.I <s> +are +.B "3/a" +through +.BR "3/f" . +The current format is +.BR 3/f , +and this is the default. The format +.B 3/b +was used in the official HMMER3 release, and the others were used in +the various testing versions. + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + +" + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmemit.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmemit.1 new file mode 100644 index 0000000..c5d2e89 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmemit.1 @@ -0,0 +1,250 @@ +.TH "hmmemit" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmemit \- sample sequences from a profile + +.SH SYNOPSIS +.B hmmemit +[\fIoptions\fR] +.I hmmfile + + +.SH DESCRIPTION + +.PP +The +.B hmmemit +program +samples (emits) sequences from the profile HMM(s) in +.IR hmmfile , +and writes them to output. 
+Sampling sequences may be useful for a variety of purposes, including +creating synthetic true positives for benchmarks or tests. + +.PP +The default is to sample one unaligned sequence from the core +probability model, which means that each sequence consists of one +full-length domain. Alternatively, with the +.B \-c +option, you can emit a simple majority-rule consensus sequence; +or with the +.B \-a +option, you can emit an alignment (in which case, you probably +also want to set +.B \-N +to something other than its default of 1 sequence per model). + +.PP +As another option, with the +.B \-p +option you can sample a sequence from a fully configured HMMER search +profile. This means sampling a `homologous sequence' by HMMER's +definition, including nonhomologous flanking sequences, local +alignments, and multiple domains per sequence, depending on the length +model and alignment mode chosen for the profile. + +.PP +The +.I hmmfile +may contain a library of HMMs, in which case +each HMM will be used in turn. + +.PP +.I hmmfile +may be '\-' (dash), which +means reading this input from stdin rather than a file. + + +.SH COMMON OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + + +.TP +.BI \-o " " +Direct the output sequences to file +.IR , +rather than to stdout. + +.TP +.BI \-N " " +Sample +.I +sequences per model, rather than just one. + + + +.SH OPTIONS CONTROLLING WHAT TO EMIT + +The default is to sample +.B N +sequences from the core model. Alternatively, +you may choose one (and only one) of the following alternatives. + + +.TP +.B \-a +Emit an alignment for each HMM in the +.I hmmfile +rather than sampling unaligned sequences one at a time. + +.TP +.B \-c +Emit a plurality-rule consensus sequence, instead of sampling a +sequence from the profile HMM's probability distribution. The +consensus sequence is formed by selecting the maximum probability +residue at each match state. 
+ +.TP +.B \-C +Emit a fancier plurality-rule consensus sequence than the +.B \-c +option. If the maximum probability residue has p < +.B minl +show it as a lower case 'any' residue (n or x); if p >= +.B minl +and < +.B minu +show it as a lower case residue; and if p >= +.B minu +show it as an upper case residue. +The default settings of +.B minu +and +.B minl +are both 0.0, which means +.B \-C +gives the same output as +.B \-c +unless you also set +.B minu +and +.B minl +to what you want. + +.TP +.B \-p +Sample unaligned sequences from the implicit search profile, not from +the core model. The core model consists only of the homologous states +(between the begin and end states of a HMMER Plan7 model). The profile +includes the nonhomologous N, C, and J states, local/glocal and +uni/multihit algorithm configuration, and the target length model. +Therefore sequences sampled from a profile may include nonhomologous +as well as homologous sequences, and may contain more than one +homologous sequence segment. By default, the profile is in multihit +local mode, and the target sequence length is configured for L=400. + + + + +.SH OPTIONS CONTROLLING EMISSION FROM PROFILES + +These options require that you have set the +.B \-p +option. + +.TP +.BI \-L " <n>" +Configure the profile's target sequence length model to generate a +mean length of approximately <n> rather than the default of 400. + +.TP +.B \-\-local +Configure the profile for multihit local alignment. + +.TP +.B \-\-unilocal +Configure the profile for unihit local alignment (Smith/Waterman). + +.TP +.B \-\-glocal +Configure the profile for multihit glocal alignment. + +.TP +.B \-\-uniglocal +Configure the profile for unihit glocal alignment. + + +.SH OPTIONS CONTROLLING FANCY CONSENSUS EMISSION + +These options require that you have set the +.B \-C +option. + +.TP +.BI \-\-minl " <x>" +Sets the +.B minl +threshold for showing weakly conserved residues as lower case. 
+(0 <= x <= 1) + +.TP +.BI \-\-minu " " +Sets the +.B minu +threshold for showing strongly conserved residues as upper case. +(0 <= x <= 1) + + + +.SH OTHER OPTIONS + +.TP +.BI \-\-seed " " +Seed the random number generator with +.IR , +an integer >= 0. +If +.I +is nonzero, any stochastic simulations will be reproducible; the same +command will give the same results. +If +.I +is 0, the random number generator is seeded arbitrarily, and +stochastic simulations will vary from run to run of the same command. +The default is 0: use an arbitrary seed, so different +.B hmmemit +runs will generate different samples. + + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). 
+ + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmer.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmer.1 new file mode 100644 index 0000000..bc46865 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmer.1 @@ -0,0 +1,199 @@ +.TH "HMMER" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME + +HMMER \- profile HMMs for biological sequence analysis + +.SH SYNOPSIS + +.nf +.B hmmalign + Align sequences to a profile + +.B hmmbuild + Construct profiles from multiple sequence alignments + +.B hmmconvert + Convert profile file to various formats + +.B hmmemit + Sample sequences from a profile + +.B hmmfetch + Retrieve profiles from a file + +.B hmmlogo + Produce a conservation logo graphic from a profile + +.B hmmpgmd + Daemon for database search web services + +.B hmmpress + Prepare a profile database for hmmscan + +.B hmmscan + Search sequence(s) against a profile database + +.B hmmsearch + Search profile(s) against a sequence database + +.B hmmsim + Collect profile score distributions on random sequences + +.B hmmstat + Summary statistics for a profile file + +.B jackhmmer + Iteratively search sequence(s) against a sequence database + +.B makehmmerdb + build nhmmer database from a sequence file + +.B nhmmer + Search DNA/RNA queries against a DNA/RNA sequence database + +.B nhmmscan + Search DNA/RNA sequence(s) against a DNA/RNA profile database + +.B phmmer + Search protein sequence(s) against a protein sequence database + +.B alimask + Calculate and add column mask to a multiple sequence alignment +.fi + +.SH DESCRIPTION + +HMMER is a suite of several programs for biological sequence alignment +and database homology search. It uses probabilistic models called +"profile hidden Markov models" (profile HMMs) to represent the likely +evolutionary homologs of a single sequence or a multiple alignment of +a sequence family. 
A main avenue of research is to improve the +evolutionary predictive models in HMMER to be able to recognize and +accurately align increasingly remote homologs, distant in time. + +HMMER is also used as an organizational tool, to group the +exponentially growing number of biological sequences into a vastly +smaller set of well-annotated sequence families. New sequences can be +annotated by comparison against curated sequence family databases of +prebuilt HMMER profiles, in addition to or instead of comparison to the +entire sequence database. Databases such as Pfam, SMART, and +TIGRfams, among others, are based on this principle. + +HMMER is used in three main modes: to search a sequence database for +new homologs of a sequence or a sequence family; to search a profile +database (like Pfam) to find what known family a query sequence +belongs to, or what domains it has; and to automatically construct +large multiple alignments (i.e. with an effectively unlimited number +of sequences) using a profile representative of a sequence family. + + +Suppose you have a multiple sequence alignment of a sequence family of +interest, and you want to search a sequence database for additional +homologs. The +.B hmmbuild +program builds profile(s) from multiple alignment(s). +The +.B hmmsearch +program searches protein profile(s) against a protein sequence database, +and +.B nhmmer +searches nucleotide profile(s) against a nucleotide sequence database. + +Suppose you have a single sequence of interest, and you want to search +a sequence database for additional homologs. The +.B phmmer +program searches a single protein sequence against a protein sequence +database. The +.B jackhmmer +program does the same thing but iteratively -- homologs detected in a +previous round are incorporated into a new profile, and the new +profile is searched again. +.B phmmer +is used like BLASTP, and +.B jackhmmer +is used like a protein PSI-BLAST. 
The +.B nhmmer +program searches a single nucleotide sequence against a nucleotide sequence database. + +Suppose you have sequence(s) that you want to analyze using a +HMMER-based profile HMM database like Pfam (http://pfam.sanger.ac.uk). +The +.B hmmpress +program formats a profile HMM flatfile (such as the file you +would download from Pfam) into a HMMER binary database. +The +.B hmmscan +program searches protein sequence(s) against that database. +The +.B nhmmscan +program can similarly search nucleotide sequence(s) against +a pressed database of nucleotide profiles, such as from +Dfam (http://dfam.janelia.org). + + +Suppose you want to align lots of sequences. You can construct a +manageably small alignment of a representative set of sequences, +build a profile with +.BR hmmbuild , +and use the +.B hmmalign +program to align any number of sequences to that profile. + +HMMER also includes some auxiliary tools for working with large +profile databases. +.B hmmfetch +fetches one or more profiles from a database. +.B hmmstat +prints summary statistics about a profile file. + +For compatibility with other profile software and previous versions of +HMMER, the +.B hmmconvert +program converts profiles to a few other formats. We intend to add +more support for other formats over time. + +The +.B hmmemit +program generates (simulates) "homologous" sequences by sampling from +a profile. It can also generate a "consensus" sequence. + +The +.B hmmsim +program is a simulator used for collecting statistics about score +distributions on random sequences. + +Each program has its own man page. + + +.SH SEE ALSO + +This is a summary man page for the entire HMMER3 package. +See individual man pages +[\fBhmmbuild\fR(1), +for example] for usage, options, and description of each program in the package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). 
+ + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmfetch.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmfetch.1 new file mode 100644 index 0000000..75595bc --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmfetch.1 @@ -0,0 +1,195 @@ +.TH "hmmfetch" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmfetch \- retrieve profiles from a file + +.SH SYNOPSIS + +.nf +\fBhmmfetch\fR [\fIoptions\fR] \fIhmmfile key\fR + (retrieve HMM named \fIkey\fR) + +\fBhmmfetch \-f \fR[\fIoptions\fR] \fIhmmfile keyfile\fR + (retrieve all HMMs listed in \fIkeyfile\fR) + +\fBhmmfetch \-\-index \fR[\fIoptions\fR] \fIhmmfile\fR + (index \fIhmmfile\fR for fetching) +.fi + +.SH DESCRIPTION + +.PP +Quickly retrieves one or more profile HMMs from an +.I hmmfile +(a large Pfam database, for example). + +.PP +For maximum speed, the +.I hmmfile +should be indexed first, using +.BR "hmmfetch \-\-index" . +The index is a binary file named +.IR hmmfile .ssi. +However, this is optional, and retrieval will still +work from unindexed files, albeit much more slowly. + +.PP +The default mode is to retrieve a single profile by name or +accession, called the +.IR key . +For example: + +.nf + \fB% hmmfetch Pfam-A.hmm Caudal_act\fR + \fB% hmmfetch Pfam-A.hmm PF00045\fR +.fi + +.PP +With the +.B \-f +option, a +.I keyfile +containing a list of one or more keys is read instead. +The first whitespace-delimited field on each non-blank non-comment +line of the +.I keyfile +is used as a +.IR key , +and any remaining data on the line is ignored. 
This allows +a variety of whitespace delimited datafiles to be used +as a +.IR keyfile . + +.PP +When using +.B \-f +and a +.IR keyfile , +if +.B hmmfile +has been indexed, the keys are retrieved in the order +they occur in the +.IR keyfile , +but if +.B hmmfile +isn't indexed, keys are retrieved in the order they occur +in the +.BR hmmfile . +This is a side effect of an implementation that allows +multiple keys to be retrieved even if the +.B hmmfile +is a nonrewindable stream, like a standard input pipe. + +.PP +In normal use +(without +.B \-\-index +or +.B \-f +options), +.I hmmfile +may be '\-' (dash), which +means reading input from stdin rather than a file. +With the +.B \-\-index +option, +.I hmmfile +may not be '\-'; it does not make sense +to index a standard input stream. +With the +.B \-f +option, +either +.I hmmfile +or +.I keyfile +(but not both) may be '\-'. +It is often particularly useful to read +.I keyfile +from standard input, because this allows +us to use arbitrary command line invocations to +create a list of HMM names or accessions, then fetch them all +to a new file, just with one command. + +.PP +By default, fetched HMMs are printed to standard output in HMMER3 format. + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + +.TP +.B \-f +The second commandline argument is a +.I keyfile +instead of a single +.IR key . +The first field on each line of the +.I keyfile +is used as a retrieval +.I key +(an HMM name or accession). +Blank lines and comment lines (that start with +a # character) are ignored. + +.TP +.BI \-o " <f>" +Output HMM(s) to file +.I <f> +instead of to standard output. + +.TP +.B \-O +Output HMM(s) to individual file(s) named +.I key +instead of standard output. + +.TP +.B \-\-index +Instead of retrieving one or more profiles from +.IR hmmfile , +index the +.I hmmfile +for future retrievals. +This creates a +.IR hmmfile .ssi +binary index file. 
+ + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmlogo.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmlogo.1 new file mode 100644 index 0000000..511bd68 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmlogo.1 @@ -0,0 +1,99 @@ +.TH "hmmlogo" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmlogo \- produce a conservation logo graphic from a profile + + +.SH SYNOPSIS +.B hmmlogo +[\fIoptions\fR] +.I hmmfile + + +.SH DESCRIPTION + +.PP +.B hmmlogo +computes letter height and indel parameters that can be used to +produce a profile HMM logo. This tool is essentially a +command-line interface for much of the data underlying the Skylign +logo server (skylign.org). + +By default, +.B hmmlogo +prints out a table of per-position letter heights (dependent on the +requested height method), then prints a table of per-position gap +probabilities. + +In a typical logo, the total height of a stack of letters for one +position depends on the information content of the position, and +that stack height is subdivided according to the emission +probabilities of the letters of the alphabet. + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. 
+ + +.TP +.B \-\-height_relent_all +Total height = relative entropy (aka information content); all letters +are given a positive height. (default) + +.TP +.B \-\-height_relent_abovebg +Total height = relative entropy (aka information content); only letters +with above-background probability are given positive height. + +.TP +.B \-\-height_score +Total height = sums of scores of positive-scoring letters; letter +height depends on the score of that letter at that position. Only +letters with above-background probability (positive score) are +given positive height. (Note that only letter height is meaningful - +stack height has no inherent meaning). + +.TP +.B \-\-no_indel +Don't print out the indel probability table. + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmpgmd.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmpgmd.1 new file mode 100644 index 0000000..021759e --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmpgmd.1 @@ -0,0 +1,219 @@ +.TH "hmmpgmd" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmpgmd \- daemon for database search web services + + +.SH SYNOPSIS +.B hmmpgmd +[\fIoptions\fR] + + +.SH DESCRIPTION + +.PP +The +.B hmmpgmd +program is the daemon that we use internally for the hmmer.org web server. 
+It essentially stands in front of the search programs +.BR phmmer , +.BR hmmsearch , +and +.BR hmmscan . + +.PP +To use +.BR hmmpgmd , +first an instance must be started up as a +master +server, and provided with at least one +sequence database +(using the +.B \-\-seqdb +flag) +and/or an +HMM database +(using the +.B \-\-hmmdb +flag). +A sequence database must be in hmmpgmd format, which may be +produced using +.BR esl-reformat . +An HMM database is of the form produced by +.BR hmmbuild . +The input database(s) will be loaded into memory by the +master. When the master has finished loading the database(s), it +prints the line: +"Data loaded into memory. Master is ready." + + +.PP +After the master is ready, one or more instances of hmmpgmd may +be started as workers. These workers may be (and typically are) on +different machines from the master, but must have access to the +same database file(s) provided to the master, with the same path. As +with the master, each worker loads the database(s) into memory, and +indicates completion by printing: "Data loaded into memory. Worker is ready." + + +.PP +The master process and workers are expected to remain running. +One or more clients then connect to the master and submit possibly +many queries. The master distributes the work of a query among the +workers, collects results, and merges them before responding to the +client. Two example client programs are included in the HMMER src +directory - the C program +.B hmmc2 +and the perl script +.BR hmmpgmd_client_example.pl . +These are intended as examples only, and should be extended as +necessary to meet your needs. + +.PP +A query is submitted to the master from the client as a character +string. Queries may be the sort that would normally be handled +by +.B phmmer +(protein sequence vs protein database), +.B hmmsearch +(protein HMM query vs protein database), or +.B hmmscan +(protein query vs protein HMM database). 
+ + +The general form of a client query is to start with a single line +of the form +.BR "@[options]" , +followed by multiple lines of text representing either the query HMM +or fasta-formatted sequence. The final line of each query is the separator +.BR "//" . + + +.PP +For example, to perform a +.B phmmer +type search of a sequence against a sequence database +file, the first line is of the form +.BR "@\-\-seqdb 1" , +then the fasta-formatted query sequence starting with the header line +.BR >sequence-name , +followed by one or more lines of sequence, and finally the closing +.BR "//" . + +.PP +To perform an +.B hmmsearch +type search, the query sequence is replaced by the full +text of a HMMER-format query HMM. + +.PP +To perform an +.B hmmscan +type search, the text matches that of the +.B phmmer +type search, except that the first line changes to +.BR "@\-\-hmmdb 1" . + +.PP +In the hmmpgmd-formatted sequence database file, each sequence +can be associated with one or more sub-databases. The +.B \-\-seqdb +flag indicates which of these sub-databases will be queried. +The HMM database format does not support sub-databases. + + + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + +.TP +.BI \-\-master +Run as the master server. + +.TP +.BI \-\-worker " " +Run as a worker, connecting to the master server that is running on IP +address +.IR . + +.TP +.BI \-\-cport " " +Port to use for communication between clients and the master server. +The default is 51371. + +.TP +.BI \-\-wport " " +Port to use for communication between workers and the master server. +The default is 51372. + +.TP +.BI \-\-ccncts " " +Maximum number of client connections to accept. The default is 16. + +.TP +.BI \-\-wcncts " " +Maximum number of worker connections to accept. The default is 32. + +.TP +.BI \-\-pid " " +Name of file into which the process id will be written. 
+ +.TP +.BI \-\-seqdb " " +Name of the file (in +.B hmmpgmd +format) containing protein sequences. +The contents of this file will be cached for searches. + +.TP +.BI \-\-hmmdb " " +Name of the file containing protein HMMs. The contents of this file +will be cached for searches. + +.TP +.BI \-\-cpu " " +Number of parallel threads to use (for +.B \-\-worker +). + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmpgmd_shard.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmpgmd_shard.1 new file mode 100644 index 0000000..26c515c --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmpgmd_shard.1 @@ -0,0 +1,162 @@ +.TH "hmmpgmd_shard" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmpgmd_shard \- sharded daemon for database search web services + + +.SH SYNOPSIS +.B hmmpgmd_shard +[\fIoptions\fR] + + +.SH DESCRIPTION + +.PP +The +.B hmmpgmd_shard +program provides a sharded version of the +.B hmmpgmd +program that we use internally to implement high-performance HMMER services that can be accessed via the internet. See the +.B hmmpgmd +man page for a discussion of how the base +.B hmmpgmd +program is used. This man page discusses differences between +.B hmmpgmd_shard +and +.B hmmpgmd. 
+The base +.B hmmpgmd +program loads the entirety of its database file into RAM on every worker node, in spite of the fact that each worker node searches a predictable fraction of the database(s) contained in that file when performing searches. This wastes RAM, particularly when many worker nodes are used to accelerate searches of large databases. + +.PP +.B Hmmpgmd_shard +addresses this by dividing protein sequence database files into shards. Each worker node loads only 1/Nth of the database file, where N is the number of worker nodes attached to the master. HMM database files are not sharded, meaning that every worker node will load the entire database file into RAM. Current HMM databases are much smaller than current protein sequence databases, and easily fit into the RAM of modern servers even without sharding. + +.PP +.B Hmmpgmd_shard +is used in the same manner as +.B hmmpgmd +, except that it takes one additional argument: +.BI \-\-num_shards " " +, which specifies the number of shards that protein databases will be divided into, and defaults to 1 if unspecified. This argument is only valid for the master node of a +.B hmmpgmd +system (i.e., when +.BI \-\-master +is passed to the +.B hmmpgmd +program), and must be equal to the number of worker nodes that will connect to the master node. +.B Hmmpgmd_shard +will signal an error if more than +.BI num_shards +worker nodes attempt to connect to the master node or if a search is started when fewer than +.BI num_shards +workers are connected to the master. + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + +.TP +.BI \-\-master +Run as the master server. + +.TP +.BI \-\-worker " " +Run as a worker, connecting to the master server that is running on IP +address +.IR . + +.TP +.BI \-\-cport " " +Port to use for communication between clients and the master server. +The default is 51371. 
+ +.TP +.BI \-\-wport " " +Port to use for communication between workers and the master server. +The default is 51372. + +.TP +.BI \-\-ccncts " " +Maximum number of client connections to accept. The default is 16. + +.TP +.BI \-\-wcncts " " +Maximum number of worker connections to accept. The default is 32. + +.TP +.BI \-\-pid " " +Name of file into which the process id will be written. + +.TP +.BI \-\-seqdb " " +Name of the file (in +.B hmmpgmd +format) containing protein sequences. +The contents of this file will be cached for searches. + +.TP +.BI \-\-hmmdb " " +Name of the file containing protein HMMs. The contents of this file +will be cached for searches. + +.TP +.BI \-\-cpu " " +Number of parallel threads to use (for +.B \-\-worker +). + +.TP +.BI \-\-num_shards " " +Number of shards to divide cached sequence database(s) into. HMM databases are not sharded, due to their small size. +This option is only valid when the +.B \-\-master +option is present, and defaults to 1 if not specified. +.B Hmmpgmd_shard +requires that the number of shards be equal to the number of worker nodes, and will give errors if more than +.BI num_shards +workers attempt to connect to the master node or if a search is started with fewer than +.BI num_shards +workers connected to the master. + +.SH SEE ALSO + +See +.BR hmmpgmd (1) +for a description of the base hmmpgmd command and how the daemon should be used. + +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license.
+.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmpress.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmpress.1 new file mode 100644 index 0000000..1d9db01 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmpress.1 @@ -0,0 +1,106 @@ +.TH "hmmpress" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmpress \- prepare a profile database for hmmscan + +.SH SYNOPSIS + +.B hmmpress +[\fIoptions\fR] +.I hmmfile + + +.SH DESCRIPTION + +.PP +Constructs binary compressed datafiles for +.BR hmmscan , +starting from a profile database +.I hmmfile +in standard HMMER3 format. +The +.B hmmpress +step is required for +.B hmmscan +to work. + +.PP +Four files are created: +.IB hmmfile .h3m, +.IB hmmfile .h3i, +.IB hmmfile .h3f, +and +.IB hmmfile .h3p. +The +.IB hmmfile .h3m +file contains the profile HMMs and their annotation in a binary +format. +The +.IB hmmfile .h3i +file is an SSI index for the +.IB hmmfile .h3m +file. +The +.IB hmmfile .h3f +file contains precomputed data structures +for the fast heuristic filter (the MSV filter). +The +.IB hmmfile .h3p +file contains precomputed data structures +for the rest of each profile. + +.PP +.I hmmfile +may not be '\-' (dash); running +.B hmmpress +on a standard input stream rather than a file +is not allowed. + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + +.TP +.B \-f +Force; overwrites any previous hmmpress'ed datafiles. The default is +to bitch about any existing files and ask you to delete them first. + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. 
+ +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmscan.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmscan.1 new file mode 100644 index 0000000..e0074b6 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmscan.1 @@ -0,0 +1,466 @@ +.TH "hmmscan" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmscan \- search sequence(s) against a profile database + + +.SH SYNOPSIS +.B hmmscan +[\fIoptions\fR] +.I hmmdb +.I seqfile + + + +.SH DESCRIPTION + +.PP +.B hmmscan +is used to search protein sequences against collections +of protein profiles. For each sequence in +.IR seqfile , +use that query sequence to search the target database of +profiles in +.IR hmmdb , +and output ranked lists of the profiles with the +most significant matches to the sequence. + +.PP +The +.I seqfile +may contain more than one query sequence. Each will be searched +in turn against +.I hmmdb. + +.PP +The +.I hmmdb +needs to be press'ed using +.B hmmpress +before it can be searched with +.BR hmmscan . +This creates four binary files, +suffixed +.BR .h3{fimp} . + +.PP +The query +.I seqfile +may be '\-' (a dash character), in which case +the query sequences are read from a +stdin +pipe instead of from a file. +The +.I hmmdb +cannot be read from a +stdin +stream, because it needs to have +those four auxiliary binary files generated by +.BR hmmpress . 
+ +.PP +The output format is designed to be human-readable, but is often so +voluminous that reading it is impractical, and parsing it is a pain. The +.B \-\-tblout +and +.B \-\-domtblout +options save output in simple tabular formats that are concise and +easier to parse. +The +.B \-o +option allows redirecting the main output, including throwing it away +in /dev/null. + + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + + + +.SH OPTIONS FOR CONTROLLING OUTPUT + +.TP +.BI \-o " " +Direct the main human-readable output to a file +.I +instead of the default stdout. + +.TP +.BI \-\-tblout " " +Save a simple tabular (space-delimited) file summarizing the +per-target output, with one data line per homologous target model +found. + +.TP +.BI \-\-domtblout " " +Save a simple tabular (space-delimited) file summarizing the +per-domain output, with one data line per homologous domain +detected in a query sequence for each homologous model. + +.TP +.BI \-\-pfamtblout " " +Save an especially succinct tabular (space-delimited) file +summarizing the per-target output, with one data line per +homologous target model found. + + +.TP +.B \-\-acc +Use accessions instead of names in the main output, where available +for profiles and/or sequences. + +.TP +.B \-\-noali +Omit the alignment section from the main output. This can greatly +reduce the output volume. + +.TP +.B \-\-notextw +Unlimit the length of each line in the main output. The default +is a limit of 120 characters per line, which helps in displaying +the output cleanly on terminals and in editors, but can truncate +target profile description lines. + +.TP +.BI \-\-textw " " +Set the main output's line length limit to +.I +characters per line. The default is 120. + + + +.SH OPTIONS FOR REPORTING THRESHOLDS + +Reporting thresholds control which hits are reported in output files +(the main output, +.BR \-\-tblout , +and +.BR \-\-domtblout ). 
+ +.TP +.BI \-E " " +In the per-target output, report target profiles with an E-value of <= +.IR . +The default is 10.0, meaning that on average, about 10 false positives +will be reported per query, so you can see the top of the noise +and decide for yourself if it's really noise. + +.TP +.BI \-T " " +Instead of thresholding per-profile output on E-value, instead +report target profiles with a bit score of >= +.IR . + +.TP +.BI \-\-domE " " +In the per-domain output, for target profiles that have already satisfied +the per-profile reporting threshold, report individual domains +with a conditional E-value of <= +.IR . +The default is 10.0. +A conditional E-value means the expected number of additional false +positive domains in the smaller search space of those comparisons that +already satisfied the per-profile reporting threshold (and thus must +have at least one homologous domain already). + + +.TP +.BI \-\-domT " " +Instead of thresholding per-domain output on E-value, instead +report domains with a bit score of >= +.IR . + + + + +.SH OPTIONS FOR INCLUSION THRESHOLDS + +Inclusion thresholds are stricter than reporting thresholds. +Inclusion thresholds control which hits are considered to be +reliable enough +to be included in an output alignment or a subsequent search round. +In +.BR hmmscan , +which does not have any alignment output (like +.B hmmsearch +or +.BR phmmer ) +nor any iterative search steps (like +.BR jackhmmer ), +inclusion thresholds have little effect. They only affect what domains +get marked as significant (!) or questionable (?) in domain +output. + +.TP +.BI \-\-incE " " +Use an E-value of <= +.I +as the per-target inclusion threshold. +The default is 0.01, meaning that on average, about 1 false positive +would be expected in every 100 searches with different query +sequences. + +.TP +.BI \-\-incT " " +Instead of using E-values for setting the inclusion threshold, instead +use a bit score of >= +.I +as the per-target inclusion threshold. 
+It would be unusual to use bit score thresholds with +.IR hmmscan , +because you don't expect a single score threshold to work for +different profiles; different profiles have slightly different +expected score distributions. + +.TP +.BI \-\-incdomE " " +Use a conditional E-value of <= +.I +as the per-domain inclusion threshold, in targets that have already +satisfied the overall per-target inclusion threshold. +The default is 0.01. + +.TP +.BI \-\-incdomT " " +Instead of using E-values, +instead use a bit score of >= +.I +as the per-domain inclusion threshold. +As with +.B \-\-incT +above, +it would be unusual to use a single bit score threshold in +.BR hmmscan . + + + +.SH OPTIONS FOR MODEL-SPECIFIC SCORE THRESHOLDING + +Curated profile databases may define specific bit score thresholds for +each profile, superseding any thresholding based on statistical +significance alone. + +To use these options, the profile must contain the appropriate (GA, +TC, and/or NC) optional score threshold annotation; this is picked up +by +.B hmmbuild +from Stockholm format alignment files. Each thresholding option has +two scores: the per-sequence threshold +.I +and the per-domain threshold +.IR . +These act as if +.BI \-T " " +.BI \-\-incT " " +.BI \-\-domT " " +.BI \-\-incdomT " " +has been applied specifically using each model's curated thresholds. + +.TP +.B \-\-cut_ga +Use the GA (gathering) bit scores in the model to set +per-sequence (GA1) and per-domain (GA2) reporting and inclusion +thresholds. GA thresholds are generally considered to be the +reliable curated thresholds defining family membership; for example, +in Pfam, these thresholds define what gets included in Pfam Full +alignments based on searches with Pfam Seed models. + +.TP +.B \-\-cut_nc +Use the NC (noise cutoff) bit score thresholds in the model to set +per-sequence (NC1) and per-domain (NC2) reporting and inclusion +thresholds. 
NC thresholds are generally considered to be the score of +the highest-scoring known false positive. + +.TP +.B \-\-cut_tc +Use the TC (trusted cutoff) bit score thresholds in the model to set +per-sequence (TC1) and per-domain (TC2) reporting and inclusion +thresholds. TC thresholds are generally considered to be the score of +the lowest-scoring known true positive that is above all known false +positives. + + + + +.SH CONTROL OF THE ACCELERATION PIPELINE + +HMMER3 searches are accelerated in a three-step filter pipeline: the +MSV filter, the Viterbi filter, and the Forward filter. The first +filter is the fastest and most approximate; the last is the full +Forward scoring algorithm. There is also a bias filter step between +MSV and Viterbi. Targets that pass all the steps in the acceleration +pipeline are then subjected to postprocessing -- domain +identification and scoring using the Forward/Backward algorithm. + +Changing filter thresholds only removes or includes targets from +consideration; changing filter thresholds does not alter bit scores, +E-values, or alignments, all of which are determined solely in +postprocessing. + +.TP +.B \-\-max +Turn off all filters, including the bias filter, and run full +Forward/Backward postprocessing on every target. This increases +sensitivity somewhat, at a large cost in speed. + +.TP +.BI \-\-F1 " " +Set the P-value threshold for the MSV filter step. The default is +0.02, meaning that roughly 2% of the highest scoring nonhomologous +targets are expected to pass the filter. + +.TP +.BI \-\-F2 " " +Set the P-value threshold for the Viterbi filter step. +The default is 0.001. + +.TP +.BI \-\-F3 " " +Set the P-value threshold for the Forward filter step. +The default is 1e-5. + +.TP +.B \-\-nobias +Turn off the bias filter.
This increases sensitivity somewhat, but can +come at a high cost in speed, especially if the query has biased +residue composition (such as a repetitive sequence region, or if it is +a membrane protein with large regions of hydrophobicity). Without the +bias filter, too many sequences may pass the filter with biased +queries, leading to slower than expected performance as the +computationally intensive Forward/Backward algorithms shoulder an +abnormally heavy load. + + + +.SH OTHER OPTIONS + +.TP +.B \-\-nonull2 +Turn off the null2 score corrections for biased composition. + +.TP +.BI \-Z " " +Assert that the total number of targets in your searches is +.IR , +for the purposes of per-sequence E-value calculations, +rather than the actual number of targets seen. + +.TP +.BI \-\-domZ " " +Assert that the total number of targets in your searches is +.IR , +for the purposes of per-domain conditional E-value calculations, +rather than the number of targets that passed the reporting thresholds. + +.TP +.BI \-\-seed " " +Set the random number seed to +.IR . +Some steps in postprocessing require Monte Carlo simulation. The +default is to use a fixed seed (42), so that results are exactly +reproducible. Any other positive integer will give different (but also +reproducible) results. A choice of 0 uses an arbitrarily chosen seed. + +.TP +.BI \-\-qformat " " +Assert that input +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + + + +.TP +.BI \-\-cpu " " +Set the number of parallel worker threads to +.IR . +On multicore machines, the default is 2. 
+You can also control this number by setting an environment variable, +.IR HMMER_NCPU . +There is also a master thread, so the actual number of threads that +HMMER spawns is +.IR +1. + +This option is not available if HMMER was compiled with POSIX threads +support turned off. + + +.TP +.BI \-\-stall +For debugging the MPI master/worker version: pause after start, to +enable the developer to attach debuggers to the running master and +worker(s) processes. Send SIGCONT signal to release the pause. +(Under gdb: +.BR "(gdb) signal SIGCONT" ) + +(Only available if optional MPI support was enabled at compile-time.) + +.TP +.B \-\-mpi +Run under MPI control with master/worker parallelization (using +.BR mpirun , +for example, or equivalent). Only available if optional MPI support +was enabled at compile-time. + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). 
+ + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmsearch.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmsearch.1 new file mode 100644 index 0000000..32216d3 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmsearch.1 @@ -0,0 +1,444 @@ +.TH "hmmsearch" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmsearch \- search profile(s) against a sequence database + + +.SH SYNOPSIS +.B hmmsearch +[\fIoptions\fR] +.I hmmfile +.I seqdb + + +.SH DESCRIPTION + +.PP +.B hmmsearch +is used to search one or more profiles against a sequence database. +For each profile in +.IR hmmfile , +use that query profile to search the target database of sequences in +.IR seqdb , +and output ranked lists of the sequences with the most significant +matches to the profile. +To build profiles from multiple alignments, see +.BR hmmbuild . + +.PP +Either the query +.I hmmfile +or the target +.I seqdb +may be '\-' (a dash character), in which case +the query profile or target database input will be read from a +stdin +pipe instead of from a +file. Only one input source can come through +stdin, +not both. +An exception is that if the +.I hmmfile +contains more than one profile query, then +.I seqdb +cannot come from stdin, because we can't rewind the +streaming target database to search it with another profile. + +.PP +The output format is designed to be human-readable, but is often so +voluminous that reading it is impractical, and parsing it is a pain. The +.B \-\-tblout +and +.B \-\-domtblout +options save output in simple tabular formats that are concise and +easier to parse. +The +.B \-o +option allows redirecting the main output, including throwing it away +in /dev/null. + + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. 
+ + + +.SH OPTIONS FOR CONTROLLING OUTPUT + +.TP +.BI \-o " " +Direct the main human-readable output to a file +.I +instead of the default stdout. + +.TP +.BI \-A " " +Save a multiple alignment of all significant hits (those satisfying +.IR "inclusion thresholds" ) +to the file +.IR . + +.TP +.BI \-\-tblout " " +Save a simple tabular (space-delimited) file summarizing the +per-target output, with one data line per homologous target sequence +found. + +.TP +.BI \-\-domtblout " " +Save a simple tabular (space-delimited) file summarizing the +per-domain output, with one data line per homologous domain +detected in a query sequence for each homologous model. + +.TP +.B \-\-acc +Use accessions instead of names in the main output, where available +for profiles and/or sequences. + +.TP +.B \-\-noali +Omit the alignment section from the main output. This can greatly +reduce the output volume. + +.TP +.B \-\-notextw +Unlimit the length of each line in the main output. The default +is a limit of 120 characters per line, which helps in displaying +the output cleanly on terminals and in editors, but can truncate +target profile description lines. + +.TP +.BI \-\-textw " " +Set the main output's line length limit to +.I +characters per line. The default is 120. + + + +.SH OPTIONS CONTROLLING REPORTING THRESHOLDS + +Reporting thresholds control which hits are reported in output files +(the main output, +.BR \-\-tblout , +and +.BR \-\-domtblout ). +Sequence hits and domain hits are ranked by statistical significance +(E-value) and output is generated in two sections called per-target +and per-domain output. In per-target output, by default, all +sequence hits with an E-value <= 10 are reported. In the per-domain +output, for each target that has passed per-target reporting +thresholds, all domains satisfying per-domain reporting thresholds are +reported. By default, these are domains with conditional E-values of +<= 10. 
The following options allow you to change the default +E-value reporting thresholds, or to use bit score thresholds instead. + + +.TP +.BI \-E " " +In the per-target output, report target sequences with an E-value of <= +.IR . +The default is 10.0, meaning that on average, about 10 false positives +will be reported per query, so you can see the top of the noise +and decide for yourself if it's really noise. + +.TP +.BI \-T " " +Instead of thresholding per-profile output on E-value, instead +report target sequences with a bit score of >= +.IR . + +.TP +.BI \-\-domE " " +In the per-domain output, for target sequences that have already satisfied +the per-profile reporting threshold, report individual domains +with a conditional E-value of <= +.IR . +The default is 10.0. +A conditional E-value means the expected number of additional false +positive domains in the smaller search space of those comparisons that +already satisfied the per-target reporting threshold (and thus must +have at least one homologous domain already). + + +.TP +.BI \-\-domT " " +Instead of thresholding per-domain output on E-value, instead +report domains with a bit score of >= +.IR . + + + + +.SH OPTIONS FOR INCLUSION THRESHOLDS + +Inclusion thresholds are stricter than reporting thresholds. +Inclusion thresholds control which hits are considered to be reliable +enough to be included in an output alignment or a subsequent search +round, or marked as significant ("!") as opposed to questionable ("?") +in domain output. + +.TP +.BI \-\-incE " " +Use an E-value of <= +.I +as the per-target inclusion threshold. +The default is 0.01, meaning that on average, about 1 false positive +would be expected in every 100 searches with different query +sequences. + +.TP +.BI \-\-incT " " +Instead of using E-values for setting the inclusion threshold, instead +use a bit score of >= +.I +as the per-target inclusion threshold. +By default this option is unset. 
+ +.TP +.BI \-\-incdomE " " +Use a conditional E-value of <= +.I +as the per-domain inclusion threshold, in targets that have already +satisfied the overall per-target inclusion threshold. +The default is 0.01. + +.TP +.BI \-\-incdomT " " +Instead of using E-values, +use a bit score of >= +.I +as the per-domain inclusion threshold. + + + +.SH OPTIONS FOR MODEL-SPECIFIC SCORE THRESHOLDING + +Curated profile databases may define specific bit score thresholds for +each profile, superseding any thresholding based on statistical +significance alone. + +To use these options, the profile must contain the appropriate (GA, +TC, and/or NC) optional score threshold annotation; this is picked up +by +.B hmmbuild +from Stockholm format alignment files. Each thresholding option has +two scores: the per-sequence threshold and the per-domain +threshold. +These act as if +.BI \-T " " +.BI \-\-incT " " +.BI \-\-domT " " +.BI \-\-incdomT " " +has been applied specifically using each model's curated thresholds. + +.TP +.B \-\-cut_ga +Use the GA (gathering) bit scores in the model to set +per-sequence (GA1) and per-domain (GA2) reporting and inclusion +thresholds. GA thresholds are generally considered to be the +reliable curated thresholds defining family membership; for example, +in Pfam, these thresholds define what gets included in Pfam Full +alignments based on searches with Pfam Seed models. + +.TP +.B \-\-cut_nc +Use the NC (noise cutoff) bit score thresholds in the model to set +per-sequence (NC1) and per-domain (NC2) reporting and inclusion +thresholds. NC thresholds are generally considered to be the score of +the highest-scoring known false positive. + +.TP +.B \-\-cut_tc +Use the TC (trusted cutoff) bit score thresholds in the model to set +per-sequence (TC1) and per-domain (TC2) reporting and inclusion +thresholds. TC thresholds are generally considered to be the score of +the lowest-scoring known true positive that is above all known false +positives.
+ + + + +.SH OPTIONS CONTROLLING THE ACCELERATION PIPELINE + +HMMER3 searches are accelerated in a three-step filter pipeline: the +MSV filter, the Viterbi filter, and the Forward filter. The first +filter is the fastest and most approximate; the last is the full +Forward scoring algorithm. There is also a bias filter step between +MSV and Viterbi. Targets that pass all the steps in the acceleration +pipeline are then subjected to postprocessing -- domain +identification and scoring using the Forward/Backward algorithm. + +Changing filter thresholds only removes or includes targets from +consideration; changing filter thresholds does not alter bit scores, +E-values, or alignments, all of which are determined solely in +postprocessing. + +.TP +.B \-\-max +Turn off all filters, including the bias filter, and run full +Forward/Backward postprocessing on every target. This increases +sensitivity somewhat, at a large cost in speed. + +.TP +.BI \-\-F1 " " +Set the P-value threshold for the MSV filter step. The default is +0.02, meaning that roughly 2% of the highest scoring nonhomologous +targets are expected to pass the filter. + +.TP +.BI \-\-F2 " " +Set the P-value threshold for the Viterbi filter step. +The default is 0.001. + +.TP +.BI \-\-F3 " " +Set the P-value threshold for the Forward filter step. +The default is 1e-5. + +.TP +.B \-\-nobias +Turn off the bias filter. This increases sensitivity somewhat, but can +come at a high cost in speed, especially if the query has biased +residue composition (such as a repetitive sequence region, or if it is +a membrane protein with large regions of hydrophobicity). Without the +bias filter, too many sequences may pass the filter with biased +queries, leading to slower than expected performance as the +computationally intensive Forward/Backward algorithms shoulder an +abnormally heavy load. + + + +.SH OTHER OPTIONS + +.TP +.B \-\-nonull2 +Turn off the null2 score corrections for biased composition. 
+ +.TP +.BI \-Z " " +Assert that the total number of targets in your searches is +.IR , +for the purposes of per-sequence E-value calculations, +rather than the actual number of targets seen. + +.TP +.BI \-\-domZ " " +Assert that the total number of targets in your searches is +.IR , +for the purposes of per-domain conditional E-value calculations, +rather than the number of targets that passed the reporting thresholds. + +.TP +.BI \-\-seed " " +Set the random number seed to +.IR . +Some steps in postprocessing require Monte Carlo simulation. The +default is to use a fixed seed (42), so that results are exactly +reproducible. Any other positive integer will give different (but also +reproducible) results. A choice of 0 uses a randomly chosen seed. + +.TP +.BI \-\-tformat " " +Assert that target sequence file +.I seqdb +is in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank . +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + +.TP +.BI \-\-cpu " " +Set the number of parallel worker threads to +.IR . +On multicore machines, the default is 2. +You can also control this number by setting an environment variable, +.IR HMMER_NCPU . +There is also a master thread, so the actual number of threads that +HMMER spawns is +.IR +1. + +This option is not available if HMMER was compiled with POSIX threads +support turned off. + + +.TP +.BI \-\-stall +For debugging the MPI master/worker version: pause after start, to +enable the developer to attach debuggers to the running master and +worker(s) processes. Send SIGCONT signal to release the pause.
+(Under gdb: +.BR "(gdb) signal SIGCONT" ) +(Only available if optional MPI support was enabled at compile-time.) + + + +.TP +.B \-\-mpi +Run under MPI control with master/worker parallelization (using +.BR mpirun , +for example, or equivalent). Only available if optional MPI support +was enabled at compile-time. + + + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmsim.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmsim.1 new file mode 100644 index 0000000..b095f6b --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmsim.1 @@ -0,0 +1,515 @@ +.TH "hmmsim" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmsim \- collect profile score distributions on random sequences + +.SH SYNOPSIS +.B hmmsim +[\fIoptions\fR] +.I hmmfile + + +.SH DESCRIPTION + +.PP +The +.B hmmsim +program generates random sequences, scores them with the model(s) in +.IR hmmfile , +and outputs various sorts of histograms, plots, and fitted +distributions for the resulting scores. + +.PP +.B hmmsim +is not a mainstream part of the HMMER package and most users would have +no reason to use it. It is used to develop and test the statistical +methods used to determine P-values and E-values in HMMER3. 
For +example, it was used to generate most of the results in a 2008 paper +on H3's local alignment statistics (PLoS Comp Bio 4:e1000069, 2008; +http://www.ploscompbiol.org/doi/pcbi.1000069). + +.PP +Because it is a research testbed, you should not expect it to be as +robust as other programs in the package. For example, options may +interact in weird ways; we haven't tested nor tried to anticipate all +different possible combinations. + +.PP +The main task is to fit a maximum likelihood Gumbel distribution to +Viterbi scores or a maximum likelihood exponential tail to +high-scoring Forward scores, and to test that these fitted +distributions obey the conjecture that lambda ~ log_2 for both the +Viterbi Gumbel and the Forward exponential tail. + +.PP +The output is a table of numbers, one row for each model. Four +different parametric fits to the score data are tested: (1) maximum +likelihood fits to both location (mu/tau) and slope (lambda) +parameters; (2) assuming lambda=log_2, maximum likelihood fit to the +location parameter only; (3) same but assuming an edge-corrected +lambda, using current procedures in H3 [Eddy, 2008]; and (4) using +both parameters determined by H3's current procedures. The standard +simple, quick and dirty statistic for goodness-of-fit is 'E@10', the +calculated E-value of the 10th ranked top hit, which we expect to be +about 10. + +.PP +In detail, the columns of the output are: + +.TP +.B name +Name of the model. + +.TP +.B tailp +Fraction of the highest scores used to fit the distribution. For +Viterbi, MSV, and Hybrid scores, this defaults to 1.0 (a Gumbel +distribution is fitted to all the data). For Forward scores, this +defaults to 0.02 (an exponential tail is fitted to the highest 2% +scores). + +.TP +.B mu/tau +Location parameter for the maximum likelihood fit to the data. + +.TP +.B lambda +Slope parameter for the maximum likelihood fit to the data.
+ +.TP +.B E@10 +The E-value calculated for the 10th ranked high score ('E@10') using the ML +mu/tau and lambda. By definition, this is expected to be about 10, if +E-value estimation were accurate. + +.TP +.B mufix +Location parameter, for a maximum likelihood fit with a known (fixed) +slope parameter lambda of log_2 (0.693). + +.TP +.B E@10fix +The E-value calculated for the 10th ranked score using mufix and the +expected lambda = log_2 = 0.693. + + +.TP +.B mufix2 +Location parameter, for a maximum likelihood fit with an +edge-effect-corrected lambda. + +.TP +.B E@10fix2 +The E-value calculated for the 10th ranked score using mufix2 and the +edge-effect-corrected lambda. + +.TP +.B pmu +Location parameter as determined by H3's estimation procedures. + +.TP +.B plambda +Slope parameter as determined by H3's estimation procedures. + +.TP +.B pE@10 +The E-value calculated for the 10th ranked score using pmu, plambda. + + +.PP +At the end of this table, one more line is printed, starting with # +and summarizing the overall CPU time used by the simulations. + +.PP +Some of the optional output files are in xmgrace xy format. xmgrace is +powerful and freely available graph-plotting software. + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + +.TP +.B \-a +Collect expected Viterbi alignment length statistics from each +simulated sequence. This only works with Viterbi scores (the default; +see +.BR \-\-vit ). +Two additional fields are printed in the output table for +each model: the mean length of Viterbi alignments, and the standard +deviation. + +.TP +.B \-v +(Verbose). Print the scores too, one score per line. + +.TP +.BI \-L " " +Set the length of the randomly sampled (nonhomologous) sequences to +.IR . +The default is 100. + + +.TP +.BI \-N " " +Set the number of randomly sampled sequences to +.IR . +The default is 1000.
+ +.TP +.B \-\-mpi +Run under MPI control with master/worker parallelization (using +.BR mpirun , +for example, or equivalent). Only available if optional MPI support +was enabled at compile-time. + +It is parallelized at the level of sending one profile at a time to an +MPI worker process, so parallelization only helps if you have more +than one profile in the +.IR hmmfile , +and you want to have at least as many profiles as MPI worker +processes. + + + + +.SH OPTIONS CONTROLLING OUTPUT + +.TP +.BI \-o " " +Save the main output table to a file +.I +rather than sending it to stdout. + +.TP +.BI \-\-afile " " +When collecting Viterbi alignment statistics (the +.B \-a +option), for each sampled sequence, output two fields per +line to a file +.IR : +the length of the optimal alignment, and the Viterbi bit score. +Requires that the +.B \-a +option is also used. + +.TP +.BI \-\-efile " " +Output a rank vs. E-value plot in XMGRACE xy format to file +.IR . +The x-axis is the rank of this sequence, from highest score to lowest; +the y-axis is the E-value calculated for this sequence. E-values are +calculated using H3's default procedures (i.e. the pmu, plambda +parameters in the output table). You expect a rough match between rank +and E-value if E-values are accurately estimated. + + +.TP +.BI \-\-ffile " " +Output a "filter power" file to +.IR : +for each model, a line with three fields: +model name, number of sequences passing the P-value threshold, +and fraction of sequences passing the P-value threshold. See +.B \-\-pthresh +for setting the P-value threshold, which defaults to 0.02 (the default +MSV filter threshold in H3). The P-values are as determined by H3's +default procedures (the pmu,plambda parameters in the output table). +If all is well, you expect to see filter power equal to the predicted +P-value setting of the threshold. + +.TP +.BI \-\-pfile " " +Output cumulative survival plots (P(S>x)) to file +.I +in XMGRACE xy format. 
There are three plots: +(1) the observed score distribution; +(2) the maximum likelihood fitted distribution; +(3) a maximum likelihood fit to the location parameter (mu/tau) while + assuming lambda=log_2. + +.TP +.BI \-\-xfile " " +Output the bit scores as a binary array of double-precision floats (8 +bytes per score) to file +.IR . +Programs like Easel's +.B esl-histplot +can read such binary files. This is useful when generating extremely +large sample sizes. + + +.SH OPTIONS CONTROLLING MODEL CONFIGURATION (MODE) + +H3 only uses multihit local alignment ( +.B \-\-fs +mode), and this is where we believe the statistical fits. +Unihit local alignment scores (Smith/Waterman; +.B \-\-sw +mode) also obey our statistical conjectures. +Glocal alignment statistics (either multihit or unihit) are +still not adequately understood nor adequately fitted. + +.TP +.B \-\-fs +Collect multihit local alignment scores. This is the default. +"fs" comes from HMMER2's historical terminology for multihit local +alignment as 'fragment search mode'. + +.TP +.B \-\-sw +Collect unihit local alignment scores. The H3 J state is disabled. +"sw" comes from HMMER2's historical terminology for unihit local +alignment as 'Smith/Waterman search mode'. + +.TP +.B \-\-ls +Collect multihit glocal alignment scores. In glocal (global/local) +alignment, the entire model must align, to a subsequence of the +target. The H3 local entry/exit transition probabilities are +disabled. 'ls' comes from HMMER2's historical terminology for multihit local +alignment as 'local search mode'. + +.TP +.B \-\-s +Collect unihit glocal alignment scores. Both the H3 J state and local +entry/exit transition probabilities are disabled. 's' comes from +HMMER2's historical terminology for unihit glocal alignment. + + + +.SH OPTIONS CONTROLLING SCORING ALGORITHM + +.TP +.B \-\-vit +Collect Viterbi maximum likelihood alignment scores. This is the default. 
+ +.TP +.B \-\-fwd +Collect Forward log-odds likelihood scores, summed over alignment ensemble. + +.TP +.B \-\-hyb +Collect 'Hybrid' scores, as described in papers by Yu and Hwa (for +instance, Bioinformatics 18:864, 2002). These involve calculating a +Forward matrix and taking the maximum cell value. The number itself is +statistically somewhat unmotivated, but the distribution is expected to +be a well-behaved extreme value distribution (Gumbel). + +.TP +.B \-\-msv +Collect MSV (multiple ungapped segment Viterbi) scores, using H3's +main acceleration heuristic. + +.TP +.B \-\-fast +For any of the above options, use H3's optimized production +implementation (using SIMD vectorization). The default is to use the +"generic" implementation (slow and non-vectorized). The optimized +implementations sacrifice a small amount of numerical precision. This +can introduce confounding noise into statistical simulations and fits, +so when one gets super-concerned about exact details, it's better to +be able to factor that source of noise out. + +.SH OPTIONS CONTROLLING FITTED TAIL MASSES FOR FORWARD + +In some experiments, it was useful to fit Forward scores to a range of +different tail masses, rather than just one. These options provide a +mechanism for fitting an evenly-spaced range of different tail masses. +For each different tail mass, a line is generated in the output. + +.TP +.BI \-\-tmin " " +Set the lower bound on the tail mass distribution. (The default is +0.02 for the default single tail mass.) + +.TP +.BI \-\-tmax " " +Set the upper bound on the tail mass distribution. (The default is +0.02 for the default single tail mass.) + +.TP +.BI \-\-tpoints " " +Set the number of tail masses to sample, starting from +.B \-\-tmin +and ending at +.BR \-\-tmax . +(The default is 1, for the default 0.02 single tail mass.) + +.TP +.B \-\-tlinear +Sample a range of tail masses with uniform linear spacing. The default +is to use uniform logarithmic spacing.
+ + + +.SH OPTIONS CONTROLLING H3 PARAMETER ESTIMATION METHODS + +H3 uses three short random sequence simulations to estimate the +location parameters for the expected score distributions for MSV +scores, Viterbi scores, and Forward scores. These options allow these +simulations to be modified. + +.TP +.BI \-\-EmL " " +Sets the sequence length in simulation that estimates the location +parameter mu for MSV E-values. Default is 200. + +.TP +.BI \-\-EmN " " +Sets the number of sequences in simulation that estimates the location +parameter mu for MSV E-values. Default is 200. + +.TP +.BI \-\-EvL " " +Sets the sequence length in simulation that estimates the location +parameter mu for Viterbi E-values. Default is 200. + +.TP +.BI \-\-EvN " " +Sets the number of sequences in simulation that estimates the location +parameter mu for Viterbi E-values. Default is 200. + +.TP +.BI \-\-EfL " " +Sets the sequence length in simulation that estimates the location +parameter tau for Forward E-values. Default is 100. + +.TP +.BI \-\-EfN " " +Sets the number of sequences in simulation that estimates the location +parameter tau for Forward E-values. Default is 200. + +.TP +.BI \-\-Eft " " +Sets the tail mass fraction to fit in the simulation that estimates +the location parameter tau for Forward E-values. Default is 0.04. + + +.SH DEBUGGING OPTIONS + +.TP +.B \-\-stall +For debugging the MPI master/worker version: pause after start, to +enable the developer to attach debuggers to the running master and +worker(s) processes. Send SIGCONT signal to release the pause. +(Under gdb: +.IR "(gdb) signal SIGCONT" ) +(Only available if optional MPI support was enabled at compile-time.) + +.TP +.BI \-\-seed " " +Set the random number seed to +.IR . +The default is 0, which makes the random number generator use +an arbitrary seed, so that different runs of +.B hmmsim +will almost certainly generate a different statistical sample.
+For debugging, it is useful to force reproducible results, by +fixing a random number seed. + + + +.SH EXPERIMENTAL OPTIONS + +These options were used in a small variety of different exploratory +experiments. + +.TP +.B \-\-bgflat +Set the background residue distribution to a uniform distribution, +both for purposes of the null model used in calculating scores, and +for generating the random sequences. The default is to use a standard +amino acid background frequency distribution. + +.TP +.B \-\-bgcomp +Set the background residue distribution to the mean composition of the +profile. This was used in exploring some of the effects of biased +composition. + +.TP +.B \-\-x\-no\-lengthmodel +Turn the H3 target sequence length model off. Set the self-transitions +for N,C,J and the null model to 350/351 instead; this emulates HMMER2. +Not a good idea in general. This was used to demonstrate one of the +main H2 vs. H3 differences. + +.TP +.BI \-\-nu " " +Set the nu parameter for the MSV algorithm -- the expected number of +ungapped local alignments per target sequence. The default is 2.0, +corresponding to an E->J transition probability of 0.5. This was used +to test whether varying nu has significant effect on result (it +doesn't seem to, within reason). +This option +only works if +.B \-\-msv +is selected (it only affects MSV), +and it will not work with +.B \-\-fast +(because the optimized implementations are hardwired to assume nu=2.0). + +.TP +.BI \-\-pthresh " " +Set the filter P-value threshold to use in generating filter power +files with +.BR \-\-ffile . +The default is 0.02 (which would be appropriate for testing MSV +scores, since this is the default MSV filter threshold in H3's +acceleration pipeline.) Other appropriate choices (matching defaults +in the acceleration pipeline) would be 0.001 for +Viterbi, and 1e-5 for Forward.
+ + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + + + + + + + + + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/hmmstat.1 b/bioinformaticsProject/hmmer/share/man/man1/hmmstat.1 new file mode 100644 index 0000000..fe708e2 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/hmmstat.1 @@ -0,0 +1,133 @@ +.TH "hmmstat" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +hmmstat \- summary statistics for a profile file + + +.SH SYNOPSIS +.B hmmstat +[\fIoptions\fR] +.I hmmfile + + +.SH DESCRIPTION + +The +.B hmmstat +utility prints out a tabular file of summary statistics for each +profile in +.IR hmmfile . + + +.PP +.I hmmfile +may be '\-' (a dash character), in which case profiles +are read from a +stdin +pipe instead of from a file. + +.PP +The columns are: + +.TP +.B idx +The index of this profile, numbering each profile in the file starting from 1. + +.TP +.B name +The name of the profile. + +.TP +.B accession +The optional accession of the profile, or "\-" if there is none. + +.TP +.B nseq +The number of sequences that the profile was estimated from. + +.TP +.B eff_nseq +The effective number of sequences that the profile was estimated +from, after HMMER applied an effective sequence number calculation +such as the default entropy weighting. + +.TP +.B M +The length of the model in consensus residues (match states). 
+ +.TP +.B relent +Mean relative entropy per match state, in bits. This is the expected +(mean) score per consensus position. This is what the default +entropy-weighting method for effective sequence number estimation +focuses on, so for default HMMER3 models, you expect this value to +reflect the default target for entropy-weighting. + +.TP +.B info +Mean information content per match state, in bits. +Probably not useful. Information content is a slightly +different calculation than relative entropy. + +.TP +.B "p relE" +Mean positional relative entropy, in bits. +This is a fancier version of the per-match-state relative entropy, +taking into account the transition (insertion/deletion) probabilities; +it may be a more accurate estimation of the average score contributed +per model consensus position. + +.TP +.B compKL +Kullback-Leibler divergence from +the default background frequency distribution to +the average composition of the profile's consensus match states, in bits. +The higher this number, the more biased the residue composition of the +profile is. Highly biased profiles can slow the HMMER3 acceleration +pipeline, by causing too many nonhomologous sequences to pass the filters. + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). 
+ + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/jackhmmer.1 b/bioinformaticsProject/hmmer/share/man/man1/jackhmmer.1 new file mode 100644 index 0000000..bff4c29 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/jackhmmer.1 @@ -0,0 +1,804 @@ +.TH "jackhmmer" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +jackhmmer \- iteratively search sequence(s) against a sequence database + +.SH SYNOPSIS +.B jackhmmer +[\fIoptions\fR] +.I seqfile +.I seqdb + +.SH DESCRIPTION + +.PP +.B jackhmmer +iteratively searches each query sequence in +.I seqfile +against the target sequence(s) in +.IR seqdb . +The first iteration is identical to a +.B phmmer +search. +For the next iteration, +a multiple alignment of the query together with all target sequences +satisfying +inclusion thresholds +is assembled, a profile is constructed from this alignment +(identical to using +.B hmmbuild +on the alignment), and profile search of the +.I seqdb +is done (identical to an +.B hmmsearch +with the profile). + + +.PP +The query +.I seqfile +may be '\-' (a dash character), in which case +the query sequences are read from a stdin pipe instead of from a +file. +The +.I seqdb +cannot be read from a stdin stream, because +.B jackhmmer +needs to do multiple passes over the database. + + +.PP +The output format is designed to be human-readable, but is often so +voluminous that reading it is impractical, and parsing it is a pain. The +.B \-\-tblout +and +.B \-\-domtblout +options save output in simple tabular formats that are concise and +easier to parse. +The +.B \-o +option allows redirecting the main output, including throwing it away +in /dev/null. + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + +.TP +.BI \-N " " +Set the maximum number of iterations to +.IR . +The default is 5. If N=1, the result is equivalent to a +.B phmmer +search. 
+ + + + +.SH OPTIONS CONTROLLING OUTPUT + +By default, output for each iteration appears on stdout in a somewhat +human readable, somewhat parseable format. These options allow +redirecting that output or saving additional kinds of output to files, +including checkpoint files for each iteration. + +.TP +.BI \-o " " +Direct the human-readable output to a file +.IR . + +.TP +.BI \-A " " +After the final iteration, save an annotated multiple alignment of all +hits satisfying inclusion thresholds (also including the original query) to +.I +in Stockholm format. + +.TP +.BI \-\-tblout " " +After the final iteration, save a tabular summary of top sequence hits +to +.I +in a readily parseable, columnar, whitespace-delimited format. + +.TP +.BI \-\-domtblout " " +After the final iteration, save a tabular summary of top domain hits +to +.I +in a readily parseable, columnar, whitespace-delimited format. + +.TP +.BI \-\-chkhmm " prefix" +At the start of each iteration, checkpoint the query HMM, saving it +to a file named +\fIprefix\fR\fB-\fR\fIn\fR\fB.hmm\fR +where +.I n +is the iteration number (from 1..N). + +.TP +.BI \-\-chkali " prefix" +At the end of each iteration, checkpoint an alignment of all +domains satisfying inclusion thresholds (e.g. what will become the +query HMM for the next iteration), +saving it +to a file named +\fIprefix\fR\fB-\fR\fIn\fR\fB.sto\fR +in Stockholm format, +where +.I n +is the iteration number (from 1..N). + +.TP +.B \-\-acc +Use accessions instead of names in the main output, where available +for profiles and/or sequences. + +.TP +.B \-\-noali +Omit the alignment section from the main output. This can greatly +reduce the output volume. + +.TP +.B \-\-notextw +Unlimit the length of each line in the main output. The default +is a limit of 120 characters per line, which helps in displaying +the output cleanly on terminals and in editors, but can truncate +target profile description lines. 
+ +.TP +.BI \-\-textw " " +Set the main output's line length limit to +.I +characters per line. The default is 120. + + + + + + +.SH OPTIONS CONTROLLING SINGLE SEQUENCE SCORING (FIRST ITERATION) + +By default, the first iteration uses a search model constructed from a +single query sequence. This model is constructed using a standard +20x20 substitution matrix for residue probabilities, and two +additional parameters for position-independent gap open and gap extend +probabilities. These options allow the default single-sequence scoring +parameters to be changed. + +.TP +.BI \-\-popen " " +Set the gap open probability for a single sequence query model to +.IR . +The default is 0.02. +.I +must be >= 0 and < 0.5. + +.TP +.BI \-\-pextend " " +Set the gap extend probability for a single sequence query model to +.IR . +The default is 0.4. +.I +must be >= 0 and < 1.0. + +.TP +.BI \-\-mx " " +Obtain residue alignment probabilities from the built-in +substitution matrix named +.IR . +Several standard matrices are built-in, and do not need to be +read from files. +The matrix name +.I +can be +PAM30, PAM70, PAM120, PAM240, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, +or BLOSUM90. +Only one of the +.B \-\-mx +and +.B \-\-mxfile +options may be used. + +.TP +.BI \-\-mxfile " mxfile" +Obtain residue alignment probabilities from the substitution matrix +in file +.IR mxfile . +The default score matrix is BLOSUM62 (this matrix is internal to +HMMER and does not have to be available as a file). +The format of a substitution matrix +.I mxfile +is the standard format accepted by BLAST, FASTA, and other sequence +analysis software. +See +.B ftp.ncbi.nlm.nih.gov/blast/matrices/ +for example files. (The only +exception: we require matrices to be square, so for DNA, use files +like NCBI's NUC.4.4, not NUC.4.2.) 
+ + +.SH OPTIONS CONTROLLING REPORTING THRESHOLDS + +Reporting thresholds control which hits are reported in output files +(the main output, +.BR \-\-tblout , +and +.BR \-\-domtblout ). +In each iteration, sequence hits and domain hits are ranked by +statistical significance (E-value) and output is generated in two +sections called per-target and per-domain output. In per-target +output, by default, all sequence hits with an E-value <= 10 are +reported. In the per-domain output, for each target that has passed +per-target reporting thresholds, all domains satisfying per-domain +reporting thresholds are reported. By default, these are domains with +conditional E-values of <= 10. The following options allow you to +change the default E-value reporting thresholds, or to use bit score +thresholds instead. + + +.TP +.BI \-E " " +Report sequences with E-values <= +.I +in per-sequence output. The default is 10.0. + +.TP +.BI \-T " " +Use a bit score threshold for per-sequence output instead of an +E-value threshold (any setting of +.B \-E +is ignored). Report sequences with a bit score of >= +.IR . +By default this option is unset. + +.TP +.BI \-Z " " +Declare the total size of the database to be +.I +sequences, for purposes of E-value calculation. +Normally E-values are calculated relative to the size of the database +you actually searched (e.g. the number of sequences in +.IR target_seqdb ). +In some cases (for instance, if you've split your +target sequence database into multiple files for parallelization of +your search), you may know better what the actual size of your search +space is. + +.TP +.BI \-\-domE " " +Report domains with conditional E-values <= +.I +in per-domain output, in addition to the top-scoring +domain per significant sequence hit. The default is 10.0. + +.TP +.BI \-\-domT " " +Use a bit score threshold for per-domain output instead of an +E-value threshold (any setting of +.B \-\-domE +is ignored).
Report domains with a bit score of >= +.I +in per-domain output, in addition to the top-scoring domain per +significant sequence hit. By default this option is unset. + +.TP +.BI \-\-domZ " " +Declare the number of significant sequences to be +.I +sequences, for purposes of conditional E-value calculation for +additional domain significance. +Normally conditional E-values are calculated relative to the number of +sequences passing per-sequence reporting threshold. + + +.SH OPTIONS CONTROLLING INCLUSION THRESHOLDS + +Inclusion thresholds control which hits are included in the multiple +alignment and profile constructed for the next search iteration. +By default, +a sequence must have a per-sequence +E-value of <= 0.001 (see +.B \-\-incE +option) to be included, and any additional domains in it besides the +top-scoring one must have a conditional E-value of <= 0.001 (see +.B \-\-incdomE +option). The difference between reporting thresholds and inclusion +thresholds is that inclusion thresholds control which hits actually +get used in the next iteration (or the final output multiple alignment +if the +.B \-A +option is used), whereas reporting thresholds control what you see in +output. Reporting thresholds are generally more loose so you can see +borderline hits in the top of the noise that might be of interest. + +.TP +.BI \-\-incE " " +Include sequences with E-values <= +.I +in subsequent iteration or final +alignment output by +.BR \-A . +The default is 0.001. + +.TP +.BI \-\-incT " " +Use a bit score threshold for per-sequence inclusion instead of an +E-value threshold (any setting of +.B \-\-incE +is ignored). Include sequences with a bit score of >= +.IR . +By default this option is unset. + +.TP +.BI \-\-incdomE " " +Include domains with conditional E-values <= +.I +in subsequent iteration or final alignment output by +.BR \-A , +in addition to the top-scoring +domain per significant sequence hit. +The default is 0.001.
+ +.TP +.BI \-\-incdomT " " +Use a bit score threshold for per-domain inclusion instead of an +E-value threshold (any setting of +.B \-\-incdomE +is ignored). Include domains with a bit score of >= +.IR . +By default this option is unset. + + + +.SH OPTIONS CONTROLLING ACCELERATION HEURISTICS + +HMMER3 searches are accelerated in a three-step filter pipeline: the +MSV filter, the Viterbi filter, and the Forward filter. The first +filter is the fastest and most approximate; the last is the full +Forward scoring algorithm, slowest but most accurate. There is also a +bias filter step between MSV and Viterbi. Targets that pass all the +steps in the acceleration pipeline are then subjected to +postprocessing -- domain identification and scoring using the +Forward/Backward algorithm. + +Essentially the only free parameters that control HMMER's heuristic +filters are the P-value thresholds controlling the expected fraction +of nonhomologous sequences that pass the filters. Setting the default +thresholds higher will pass a higher proportion of nonhomologous +sequence, increasing sensitivity at the expense of speed; conversely, +setting lower P-value thresholds will pass a smaller proportion, +decreasing sensitivity and increasing speed. Setting a filter's +P-value threshold to 1.0 means it will pass all sequences, and +effectively disables the filter. + +Changing filter thresholds only removes or includes targets from +consideration; changing filter thresholds does not alter bit scores, +E-values, or alignments, all of which are determined solely in +postprocessing. + +.TP +.B \-\-max +Maximum sensitivity. Turn off all filters, including the bias filter, +and run full Forward/Backward postprocessing on every target. This +increases sensitivity slightly, at a large cost in speed. + +.TP +.BI \-\-F1 " " +First filter threshold; set the P-value threshold for the MSV filter +step.
The default is 0.02, meaning that roughly 2% of the highest +scoring nonhomologous targets are expected to pass the filter. + +.TP +.BI \-\-F2 " " +Second filter threshold; set the P-value threshold for the Viterbi +filter step. The default is 0.001. + +.TP +.BI \-\-F3 " " +Third filter threshold; set the P-value threshold for the Forward +filter step. The default is 1e-5. + +.TP +.B \-\-nobias +Turn off the bias filter. This increases sensitivity somewhat, but can +come at a high cost in speed, especially if the query has biased +residue composition (such as a repetitive sequence region, or if it is +a membrane protein with large regions of hydrophobicity). Without the +bias filter, too many sequences may pass the filter with biased +queries, leading to slower than expected performance as the +computationally intensive Forward/Backward algorithms shoulder an +abnormally heavy load. + + + +.SH OPTIONS CONTROLLING PROFILE CONSTRUCTION (LATER ITERATIONS) + +.B jackhmmer +always includes your original query sequence in the alignment result +at every iteration, and consensus positions are always defined by that +query sequence. That is, a +.B jackhmmer +profile is always the same length as your original query, at every +iteration. +Therefore +.B jackhmmer +gives you less control over profile construction than +.B hmmbuild +does; it does not have the +.BR \-\-fast , +or +.BR \-\-hand , +or +.B \-\-symfrac +options. +The only profile construction option available in +.B jackhmmer +is +.BR \-\-fragthresh : + + +.TP +.BI \-\-fragthresh " " +We only want to count terminal gaps as deletions if the aligned +sequence is known to be full-length, not if it is a fragment (for +instance, because only part of it was sequenced). HMMER uses a simple +rule to infer fragments: if the sequence length L is less than +or equal to a fraction +.I +times the alignment length in columns, +then the sequence is handled as a fragment. The default is 0.5. 
+Setting +.B \-\-fragthresh 0 +will define no (nonempty) sequence as a fragment; you might want to do +this if you know you've got a carefully curated alignment of full-length +sequences. +Setting +.B \-\-fragthresh 1 +will define all sequences as fragments; you might want to do this if +you know your alignment is entirely composed of fragments, such as +translated short reads in metagenomic shotgun data. + + + +.SH OPTIONS CONTROLLING RELATIVE WEIGHTS + +Whenever a profile is built from a multiple alignment, HMMER uses an +ad hoc sequence weighting algorithm to downweight closely related +sequences and upweight distantly related ones. This has the effect of +making models less biased by uneven phylogenetic representation. For +example, two identical sequences would typically each receive half the +weight that one sequence would (and this is why +.B jackhmmer +isn't concerned about always including your original query sequence in +each iteration's alignment, even if it finds it again in the database +you're searching). These options control which algorithm gets used. + +.TP +.B \-\-wpb +Use the Henikoff position-based sequence weighting scheme [Henikoff +and Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default. + +.TP +.B \-\-wgsc +Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et +al, J. Mol. Biol. 235:1067, 1994]. + +.TP +.B \-\-wblosum +Use the same clustering scheme that was used to weight data in +calculating BLOSUM substitution matrices [Henikoff and Henikoff, +Proc. Natl. Acad. Sci 89:10915, 1992]. Sequences are single-linkage +clustered at an identity threshold (default 0.62; see +.BR \-\-wid ) +and within each cluster of c sequences, each sequence gets relative +weight 1/c. + +.TP +.B \-\-wnone +No relative weights. All sequences are assigned uniform weight. + +.TP +.BI \-\-wid " " +Sets the identity threshold used by single-linkage clustering when +using +.BR \-\-wblosum . +Invalid with any other weighting scheme. Default is 0.62. 
+ + + + + +.SH OPTIONS CONTROLLING EFFECTIVE SEQUENCE NUMBER + +After relative weights are determined, they are normalized to sum to a +total effective sequence number, +.IR eff_nseq . +This number may be the actual number of sequences in the alignment, +but it is almost always smaller than that. +The default entropy weighting method +(\fB\-\-eent\fR) +reduces the effective sequence +number to reduce the information content (relative entropy, or average +expected score on true homologs) per consensus position. The target +relative entropy is controlled by a two-parameter function, where the +two parameters are settable with +.B \-\-ere +and +.BR \-\-esigma . + +.TP +.B \-\-eent +Adjust effective sequence number to achieve a specific relative entropy +per position (see +.BR \-\-ere ). +This is the default. + +.TP +.B \-\-eclust +Set effective sequence number to the number of single-linkage clusters +at a specific identity threshold (see +.BR \-\-eid ). +This option is not recommended; it's for experiments evaluating +how much better +.B \-\-eent +is. + +.TP +.B \-\-enone +Turn off effective sequence number determination and just use the +actual number of sequences. One reason you might want to do this is +to try to maximize the relative entropy/position of your model, which +may be useful for short models. + +.TP +.BI \-\-eset " " +Explicitly set the effective sequence number for all models to +.IR . + +.TP +.BI \-\-ere " " +Set the minimum relative entropy/position target to +.IR . +Requires +.BR \-\-eent . +Default depends on the sequence alphabet; for protein +sequences, it is 0.59 bits/position. + +.TP +.BI \-\-esigma " " +Sets the minimum relative entropy contributed by an entire +model alignment, over its whole length. This has the effect +of making short models have +higher relative entropy per position than +.B \-\-ere +alone would give. The default is 45.0 bits. 
+ +.TP +.BI \-\-eid " " +Sets the fractional pairwise identity cutoff used by +single linkage clustering with the +.B \-\-eclust +option. The default is 0.62. + + + +.SH OPTIONS CONTROLLING PRIORS + +In profile construction, by default, weighted counts are converted to +mean posterior probability parameter estimates using mixture Dirichlet +priors. Default mixture Dirichlet prior parameters for protein models +and for nucleic acid (RNA and DNA) models are built in. The following +options allow you to override the default priors. + +.TP +.B \-\-pnone +Don't use any priors. Probability parameters will simply be the +observed frequencies, after relative sequence weighting. + +.TP +.B \-\-plaplace +Use a Laplace +1 prior in place of the default mixture Dirichlet +prior. + + + +.SH OPTIONS CONTROLLING E-VALUE CALIBRATION + +Estimating the location parameters for the expected score +distributions for MSV filter scores, Viterbi filter scores, and +Forward scores requires three short random sequence simulations. + +.TP +.BI \-\-EmL " " +Sets the sequence length in simulation that estimates the location +parameter mu for MSV filter E-values. Default is 200. + +.TP +.BI \-\-EmN " " +Sets the number of sequences in simulation that estimates the location +parameter mu for MSV filter E-values. Default is 200. + +.TP +.BI \-\-EvL " " +Sets the sequence length in simulation that estimates the location +parameter mu for Viterbi filter E-values. Default is 200. + +.TP +.BI \-\-EvN " " +Sets the number of sequences in simulation that estimates the location +parameter mu for Viterbi filter E-values. Default is 200. + +.TP +.BI \-\-EfL " " +Sets the sequence length in simulation that estimates the location +parameter tau for Forward E-values. Default is 100. + +.TP +.BI \-\-EfN " " +Sets the number of sequences in simulation that estimates the location +parameter tau for Forward E-values. Default is 200. 
+ +.TP +.BI \-\-Eft " " +Sets the tail mass fraction to fit in the simulation that estimates +the location parameter tau for Forward E-values. Default is 0.04. + + +.SH OTHER OPTIONS + +.TP +.B \-\-nonull2 +Turn off the null2 score corrections for biased composition. + +.TP +.BI \-Z " " +Assert that the total number of targets in your searches is +.IR , +for the purposes of per-sequence E-value calculations, +rather than the actual number of targets seen. + +.TP +.BI \-\-domZ " " +Assert that the total number of targets in your searches is +.IR , +for the purposes of per-domain conditional E-value calculations, +rather than the number of targets that passed the reporting thresholds. + +.TP +.BI \-\-seed " " +Seed the random number generator with +.IR , +an integer >= 0. +If +.I +is >0, any stochastic simulations will be reproducible; the same +command will give the same results. +If +.I +is 0, the random number generator is seeded arbitrarily, and +stochastic simulations will vary from run to run of the same command. +The default seed is 42. + + +.TP +.BI \-\-qformat " " +Assert that input query +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank . +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +.B jackhmmer +always uses a single sequence query to start its search, so when the input +.I seqfile +is an alignment, +.B jackhmmer +reads it one unaligned query sequence at a time, not as an alignment. +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + +.TP +.BI \-\-tformat " " +Assert that the input target sequence +.I seqdb +is in format +.IR . +See +.B \-\-qformat +above for accepted choices for +.IR . 
+ + + +.TP +.BI \-\-cpu " " +Set the number of parallel worker threads to +.IR . +On multicore machines, the default is 2. +You can also control this number by setting an environment variable, +.IR HMMER_NCPU . +There is also a master thread, so the actual number of threads that +HMMER spawns is +.IR +1. + +This option is not available if HMMER was compiled with POSIX threads +support turned off. + + + +.TP +.BI \-\-stall +For debugging the MPI master/worker version: pause after start, to +enable the developer to attach debuggers to the running master and +worker(s) processes. Send SIGCONT signal to release the pause. +(Under gdb: +.BR "(gdb) signal SIGCONT" ) +(Only available if optional MPI support was enabled at compile-time.) + +.TP +.BI \-\-mpi +Run under MPI control with master/worker parallelization (using +.BR mpirun , +for example, or equivalent). Only available if optional MPI support +was enabled at compile-time. + + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). 
+ + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/makehmmerdb.1 b/bioinformaticsProject/hmmer/share/man/man1/makehmmerdb.1 new file mode 100644 index 0000000..33762c1 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/makehmmerdb.1 @@ -0,0 +1,133 @@ +.TH "makehmmerdb" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +makehmmerdb \- build nhmmer database from a sequence file + + +.SH SYNOPSIS +.B makehmmerdb +[\fIoptions\fR] +.I seqfile +.I binaryfile + + +.SH DESCRIPTION + +.PP +.B makehmmerdb +is used to create a binary file from a DNA sequence file. This +binary file may be used as a target database for the DNA search tool +.BR nhmmer . +Using default settings in +.BR nhmmer , +this yields a roughly 10-fold acceleration with small loss of +sensitivity on benchmarks. + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + + + +.SH OTHER OPTIONS + +.TP +.BI \-\-informat " " +Assert that input +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + + +.TP +.BI \-\-bin_length " " +Bin length. The binary file depends on a data structure called the +FM index, which organizes a permuted copy of the sequence in bins +of length +.IR . +Longer bin length will lead to smaller files (because data is +captured about each bin) and possibly slower query time. The +default is 256. Much more than 512 may lead to notable reduction +in speed. + + +.TP +.BI \-\-sa_freq " " +Suffix array sample rate. 
The FM index structure also samples from +the underlying suffix array for the sequence database. More frequent +sampling (smaller value for +.IR ) +will yield larger file size and faster search (until file size becomes +large enough to cause I/O to be a bottleneck). The default value +is 8. Must be a power of 2. + + +.TP +.BI \-\-block_size " " +The input sequence is broken into blocks of size +.I +million letters. An FM index is built for each block, rather than +building an FM index for the entire sequence database. Default is +50. Larger blocks do not seem to yield substantial speed increase. + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/nhmmer.1 b/bioinformaticsProject/hmmer/share/man/man1/nhmmer.1 new file mode 100644 index 0000000..792ac74 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/nhmmer.1 @@ -0,0 +1,692 @@ +.TH "nhmmer" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +nhmmer \- search DNA queries against a DNA sequence database + + +.SH SYNOPSIS +.B nhmmer +[\fIoptions\fR] +.I queryfile +.I seqdb + + +.SH DESCRIPTION + +.PP +.B nhmmer +is used to search one or more nucleotide queries against a +nucleotide sequence database. 
+For each query in +.IR queryfile , +use that query to search the target database of sequences in +.IR seqdb , +and output a ranked list of the hits with the most significant +matches to the query. A query may be either a profile model +built using +.BR hmmbuild , +a sequence alignment, or a single sequence. Sequence-based +queries can be in a number of formats (see +.BR \-\-qformat ), +and can typically be autodetected. Note that only +Stockholm +format supports queries made up of more than one sequence +alignment. + + + +.PP +Either the query +.I queryfile +or the target +.I seqdb +may be '\-' (a dash character), in which case +the query file or target database input will be read from a pipe instead of from a +file. Only one input source can come through stdin, not both. +If the +.I queryfile +contains more than one query, then +.I seqdb +cannot come from stdin, because we can't rewind the +streaming target database to search it with another profile. + +.PP +If the query is sequence-based (unaligned or aligned), +a new file containing the HMM(s) built from the input(s) in +.I queryfile +may optionally be produced, with the filename set using the +.B \-\-hmmout +flag. + + +.PP +The output format is designed to be human-readable, but is often so +voluminous that reading it is impractical, and parsing it is a pain. The +.B \-\-tblout +option saves output in a simple tabular format that is concise and +easier to parse. +The +.B \-o +option allows redirecting the main output, including throwing it away +in /dev/null. + + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + + + +.SH OPTIONS FOR CONTROLLING OUTPUT + +.TP +.BI \-o " " +Direct the main human-readable output to a file +.I +instead of the default stdout. + +.TP +.BI \-A " " +Save a multiple alignment of all significant hits (those satisfying +"inclusion thresholds") to the file +.IR . 
+ +.TP +.BI \-\-tblout " " +Save a simple tabular (space-delimited) file summarizing the +per-target output, with one data line per homologous target sequence +found. + +.TP +.BI \-\-dfamtblout " " +Save a tabular (space-delimited) file summarizing the +per-hit output, similar to +.B \-\-tblout +but more succinct. + +.TP +.BI \-\-aliscoresout " " +Save to file a list of per-position scores for each hit. +This is useful, for example, in identifying regions of high +score density for use in resolving overlapping hits from +different models. + +.TP +.BI \-\-hmmout " " +If +.I queryfile +is sequence-based, write the internally-computed HMM(s) to file +.IR . + + +.TP +.B \-\-acc +Use accessions instead of names in the main output, where available +for profiles and/or sequences. + +.TP +.B \-\-noali +Omit the alignment section from the main output. This can greatly +reduce the output volume. + +.TP +.B \-\-notextw +Unlimit the length of each line in the main output. The default +is a limit of 120 characters per line, which helps in displaying +the output cleanly on terminals and in editors, but can truncate +target profile description lines. + +.TP +.BI \-\-textw " " +Set the main output's line length limit to +.I +characters per line. The default is 120. + + + +.SH OPTIONS CONTROLLING SINGLE SEQUENCE SCORING + +By default, if a query is a single sequence from a file in +fasta format, +.B nhmmer +uses a search model constructed from that sequence and a standard +20x20 substitution matrix for residue probabilities, along with two +additional parameters for position-independent gap open and gap extend +probabilities. These options allow the default single-sequence scoring +parameters to be changed, and for single-sequence scoring options to +be applied to a single sequence coming from an aligned format. 
+ +.TP +.BI \-\-singlemx +If a single sequence query comes from a multiple sequence alignment file, +such as in Stockholm format, the search model is by default constructed as is typically done +for multiple sequence alignments. This option forces +.B nhmmer +to use the single-sequence method with substitution score matrix. + +.TP +.BI \-\-mxfile "" +Set the gap open probability for a single sequence query model to +.IR . +The default is 0.02. +.I +must be >= 0 and < 0.5. + +.TP +.BI \-\-pextend " " +Set the gap extend probability for a single sequence query model to +.IR . +The default is 0.4. +.I +must be >= 0 and < 1.0. + + + +.SH OPTIONS CONTROLLING REPORTING THRESHOLDS + +Reporting thresholds control which hits are reported in output files +(the main output, +.BR \-\-tblout , +and +.BR \-\-dfamtblout ). +Hits are ranked by statistical significance (E-value). + + +.TP +.BI \-E " " +Report target sequences with an E-value of <= +.IR . +The default is 10.0, meaning that on average, about 10 false positives +will be reported per query, so you can see the top of the noise +and decide for yourself if it's really noise. + +.TP +.BI \-T " " +Instead of thresholding output on E-value, instead +report target sequences with a bit score of >= +.IR . + + + + +.SH OPTIONS FOR INCLUSION THRESHOLDS + +Inclusion thresholds are stricter than reporting thresholds. +Inclusion thresholds control which hits are considered to be reliable +enough to be included in an output alignment or a subsequent search +round, or marked as significant ("!") as opposed to questionable ("?") +in hit output. + +.TP +.BI \-\-incE " " +Use an E-value of <= +.I +as the inclusion threshold. +The default is 0.01, meaning that on average, about 1 false positive +would be expected in every 100 searches with different query +sequences. + +.TP +.BI \-\-incT " " +Instead of using E-values for setting the inclusion threshold, +use a bit score of >= +.I +as the inclusion threshold. 
+By default this option is unset. + + + +.SH OPTIONS FOR MODEL-SPECIFIC SCORE THRESHOLDING + +Curated profile databases may define specific bit score thresholds for +each profile, superseding any thresholding based on statistical +significance alone. + +To use these options, the profile must contain the appropriate (GA, +TC, and/or NC) optional score threshold annotation; this is picked up +by +.B hmmbuild +from Stockholm format alignment files. For a nucleotide model, each +thresholding option has a single per-hit threshold +This acts as if +.BI \-T " " +.BI \-\-incT " " +has been applied specifically using each model's curated thresholds. + +.TP +.B \-\-cut_ga +Use the GA (gathering) bit score threshold in the model to set +per-hit reporting and inclusion +thresholds. GA thresholds are generally considered to be the +reliable curated thresholds defining family membership; for example, +in Dfam, these thresholds are applied when annotating a genome +with a model of a family known to be found in that organism. They +may allow for minimal expected false discovery rate. + +.TP +.B \-\-cut_nc +Use the NC (noise cutoff) bit score threshold in the model to set +per-hit reporting and inclusion +thresholds. NC thresholds are less stringent than GA; in the context +of Pfam, they are generally used to store the score of the +highest-scoring known false positive. + +.TP +.B \-\-cut_tc +Use the TC (trusted cutoff) bit score threshold in the model to set +per-hit reporting and inclusion +thresholds. TC thresholds are more stringent than GA, and are +generally considered to be the score of the lowest-scoring known +true positive that is above all known false positives; for example, +in Dfam, these thresholds are applied when annotating a genome +with a model of a family not known to be found in that organism. 
+ + + + +.SH OPTIONS CONTROLLING THE ACCELERATION PIPELINE + +HMMER3 searches are accelerated in a three-step filter pipeline: the +scanning-SSV filter, the Viterbi filter, and the Forward filter. The +first filter is the fastest and most approximate; the last is the full +Forward scoring algorithm. There is also a bias filter step between +SSV and Viterbi. Targets that pass all the steps in the acceleration +pipeline are then subjected to postprocessing -- domain +identification and scoring using the Forward/Backward algorithm. + +Changing filter thresholds only removes or includes targets from +consideration; changing filter thresholds does not alter bit scores, +E-values, or alignments, all of which are determined solely in +postprocessing. + +.TP +.B \-\-max +Turn off (nearly) all filters, including the bias filter, and run full +Forward/Backward postprocessing on most of the target sequence. +In contrast to +.B phmmer +and +.BR hmmsearch , +where this flag really does turn off the filters entirely, the +.B \-\-max +flag in +.B nhmmer +sets the scanning-SSV filter threshold to 0.4, not 1.0. Use of this +flag increases sensitivity somewhat, at a large cost in speed. + +.TP +.BI \-\-F1 " " +Set the P-value threshold for the SSV filter step. The default is +0.02, meaning that roughly 2% of the highest scoring nonhomologous +targets are expected to pass the filter. + +.TP +.BI \-\-F2 " " +Set the P-value threshold for the Viterbi filter step. +The default is 0.001. + +.TP +.BI \-\-F3 " " +Set the P-value threshold for the Forward filter step. +The default is 1e-5. + +.TP +.B \-\-nobias +Turn off the bias filter. This increases sensitivity somewhat, but can +come at a high cost in speed, especially if the query has biased +residue composition (such as a repetitive sequence region, or if it is +a membrane protein with large regions of hydrophobicity). 
Without the +bias filter, too many sequences may pass the filter with biased +queries, leading to slower than expected performance as the +computationally intensive Forward/Backward algorithms shoulder an +abnormally heavy load. + + + +.SH OPTIONS FOR SPECIFYING THE ALPHABET + +.TP +.B \-\-dna +Assert that sequences in +.I msafile +are DNA, bypassing alphabet autodetection. + +.TP +.B \-\-rna +Assert that sequences in +.I msafile +are RNA, bypassing alphabet autodetection. + + + +.SH OPTIONS CONTROLLING SEED SEARCH HEURISTIC + +When searching with +.BR nhmmer , +one may optionally precompute a binary version of the target database, using +.BR makehmmerdb , +then search against that database. Using default settings, this yields a +roughly 10-fold acceleration with small loss of sensitivity on benchmarks. +This is achieved using a heuristic method that searches for seeds (ungapped +alignments) around which full processing is done. This is essentially +a replacement to the SSV stage. (This method has been extensively tested, +but should still be treated as somewhat experimental.) +The following options only impact +.B nhmmer +if the value of +.B \-\-tformat +is +.BR hmmerdb . + +Changing parameters for this seed-finding step will impact both speed and +sensitivity - typically faster search leads to lower sensitivity. + +.TP +.BI \-\-seed_max_depth " " +The seed step requires that a seed reach a specified bit score in length +no longer than +.IR . +By default, this value is 15. Longer seeds allow a greater chance of +meeting the bit score threshold, leading to diminished filtering +(greater sensitivity, slower run time). + +.TP +.BI \-\-seed_sc_thresh " " +The seed must reach score +.I +(in bits). The default is 15.0 bits. A higher threshold increases +filtering stringency, leading to faster run times and lower +sensitivity. 
+ +.TP +.BI \-\-seed_sc_density " " +Either all prefixes or all suffixes of a seed must have +bit density (bits per aligned position) of at least +.IR . +The default is 0.8 bits/position. An increase in the density +requirement leads to increased filtering stringency, thus faster +run times and lower sensitivity. + +.TP +.BI \-\-seed_drop_max_len " " +A seed may not have a run of length +.I +in which the score drops by +.B \-\-seed_drop_lim +or more. Basically, this prunes seeds that go through long +slightly-negative seed extensions. The default is 4. Increasing +the limit causes (slightly) diminished filtering efficiency, thus +slower run times and higher sensitivity. (minor tuning option) + +.TP +.BI \-\-seed_drop_lim " " +In a seed, there may be no run of length +.B \-\-seed_drop_max_len +in which the score drops by +.BR \-\-seed_drop_lim . +The default is 0.3 bits. Larger numbers mean less filtering. +(minor tuning option) + +.TP +.BI \-\-seed_req_pos " " +A seed must contain a run of at least +.I +positive-scoring matches. The default is 5. Larger values mean +increased filtering. +(minor tuning option) + +.TP +.BI \-\-seed_ssv_length " " +After finding a short seed, an ungapped alignment is extended +in both directions in an attempt to meet the +.B \-\-F1 +score threshold. The window through which this ungapped alignment +extends is length +.IR . +The default is 70. +Decreasing this value slightly reduces run time, at a small risk of +reduced sensitivity. (minor tuning option) + + +.SH OTHER OPTIONS + + +.TP +.BI \-\-qformat " " +Assert that input +.I queryfile +is a sequence file (unaligned or aligned), in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work, and will serve as the +basis for automatic creation of a profile HMM used for +searching; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . 
+For more information, and for codes for some less common formats, +see main documentation. + + +.TP +.BI \-\-qsingle_seqs +Force +.I queryfile +to be read as individual sequences, even if it is in +an msa format. For example, if the input is in aligned +.BR stockholm +format, the +.BR \-\-qsingle_seqs +flag will cause each sequence in that alignment to be used as a separate query sequence. + +.TP +.BI \-\-tformat " " +Assert that target sequence database +.I seqdb +is in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank , +.BR ncbi , +.BR fmindex . +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). +The format +.B ncbi +indicates that the database file is a binary file produced using +.BR makeblastdb . +The format +.B fmindex +indicates that the database file is a binary file produced using +.BR makehmmerdb . + + +.TP +.B \-\-nonull2 +Turn off the null2 score corrections for biased composition. + +.TP +.BI \-Z " " +For the purposes of per-hit E-value calculations, +assert that the total size of the target database is +.I +million nucleotides, +rather than the actual number of targets seen. + + +.TP +.BI \-\-seed " " +Set the random number seed to +.IR . +Some steps in postprocessing require Monte Carlo simulation. The +default is to use a fixed seed (42), so that results are exactly +reproducible. Any other positive integer will give different (but also +reproducible) results. A choice of 0 uses a randomly chosen seed. + + +.TP +.BI \-\-w_beta " " +Window length tail mass. 
+The upper bound, +.IR W , +on the length at which nhmmer expects to find an instance of the +model is set such that the fraction of all sequences generated +by the model with length >= W is less than +.IR . +The default is 1e-7. +This flag may be used to override the value of +.I W +established for the model by +.BR hmmbuild , +or when the query is sequence-based. + + + +.TP +.BI \-\-w_length " " +Override the model instance length upper bound, W, +which is otherwise controlled by +.BR \-\-w_beta . +It should be larger than the model length. The value of W +is used deep in the acceleration pipeline, and modest changes +are not expected to impact results (though larger values of W +do lead to longer run time). +This flag may be used to override the value of +W established for the model by +.BR hmmbuild , +or when the query is sequence-based. + + + +.TP +.B \-\-watson +Only search the top strand. By default both the query sequence +and its reverse-complement are searched. + +.TP +.B \-\-crick +Only search the bottom (reverse-complement) strand. By +default both the query sequence and its reverse-complement are searched. + + +.TP +.BI \-\-cpu " " +Set the number of parallel worker threads to +.IR . +On multicore machines, the default is 2. +You can also control this number by setting an environment variable, +.IR HMMER_NCPU . +There is also a master thread, so the actual number of threads that +HMMER spawns is +.IR +1. + +This option is not available if HMMER was compiled with POSIX threads +support turned off. + + + + +.TP +.BI \-\-stall +For debugging the MPI master/worker version: pause after start, to +enable the developer to attach debuggers to the running master and +worker(s) processes. Send SIGCONT signal to release the pause. +(Under gdb: +.BR "(gdb) signal SIGCONT" ) +(Only available if optional MPI support was enabled at compile-time.) 
+ +.TP +.BI \-\-mpi +Run under MPI control with master/worker parallelization (using +.BR mpirun , +for example, or equivalent). Only available if optional MPI support +was enabled at compile-time. + + + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). + + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + + + diff --git a/bioinformaticsProject/hmmer/share/man/man1/nhmmscan.1 b/bioinformaticsProject/hmmer/share/man/man1/nhmmscan.1 new file mode 100644 index 0000000..3553137 --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/nhmmscan.1 @@ -0,0 +1,468 @@ +.TH "nhmmscan" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +nhmmscan \- search DNA sequence(s) against a DNA profile database + + +.SH SYNOPSIS +.B nhmmscan +[\fIoptions\fR] +.I hmmdb +.I seqfile + + + +.SH DESCRIPTION + +.PP +.B nhmmscan +is used to search nucleotide sequences against collections +of nucleotide profiles. For each sequence in +.IR seqfile , +use that query sequence to search the target database of +profiles in +.IR hmmdb , +and output ranked lists of the profiles with the +most significant matches to the sequence. + +.PP +The +.I seqfile +may contain more than one query sequence. It can be in FASTA format, +or several other common sequence file formats (genbank, embl, and +uniprot, among others), or in alignment file formats (stockholm, +aligned fasta, and others). See the +.I \-\-qformat +option for a complete list. 
+ +.PP +The +.I hmmdb +needs to be press'ed using +.B hmmpress +before it can be searched with +.BR nhmmscan . +This creates four binary files, +suffixed +.B .h3{fimp}. + +.PP +The query +.I seqfile +may be '\-' (a dash character), in which case +the query sequences are read from a stdin pipe instead of from a +file. +The +.I hmmdb +cannot be read from a stdin stream, because it needs to have +the four auxiliary binary files generated by +.BR hmmpress . + +.PP +The output format is designed to be human-readable, but is often so +voluminous that reading it is impractical, and parsing it is a pain. The +.B \-\-tblout +option saves output in a simple tabular format that is concise and +easier to parse. +The +.B \-o +option allows redirecting the main output, including throwing it away +in /dev/null. + + + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + + + +.SH OPTIONS FOR CONTROLLING OUTPUT + +.TP +.BI \-o " " +Direct the main human-readable output to a file +.I +instead of the default stdout. + +.TP +.BI \-\-tblout " " +Save a simple tabular (space-delimited) file summarizing the +per-hit output, with one data line per homologous target model +hit found. + +.TP +.BI \-\-dfamtblout " " +Save a tabular (space-delimited) file summarizing the +per-hit output, similar to +.B \-\-tblout +but more succinct. + +.TP +.BI \-\-aliscoresout " " +Save to file a list of per-position scores for each hit. +This is useful, for example, in identifying regions of high +score density for use in resolving overlapping hits from +different models. + + +.TP +.B \-\-acc +Use accessions instead of names in the main output, where available +for profiles and/or sequences. + +.TP +.B \-\-noali +Omit the alignment section from the main output. This can greatly +reduce the output volume. + +.TP +.B \-\-notextw +Unlimit the length of each line in the main output. 
The default +is a limit of 120 characters per line, which helps in displaying +the output cleanly on terminals and in editors, but can truncate +target profile description lines. + +.TP +.BI \-\-textw " " +Set the main output's line length limit to +.I +characters per line. The default is 120. + + + +.SH OPTIONS FOR REPORTING THRESHOLDS + +Reporting thresholds control which hits are reported in output files +(the main output, +.BR \-\-tblout , +and +.BR \-\-dfamtblout ). +Hits are ranked by statistical significance (E-value). + +.TP +.BI \-E " " +Report target profiles with an E-value of <= +.IR . +The default is 10.0, meaning that on average, about 10 false positives +will be reported per query, so you can see the top of the noise +and decide for yourself if it's really noise. + +.TP +.BI \-T " " +Instead of thresholding output on E-value, instead +report target profiles with a bit score of >= +.IR . + + + + +.SH OPTIONS FOR INCLUSION THRESHOLDS + +Inclusion thresholds are stricter than reporting thresholds. +Inclusion thresholds control which hits are considered to be +reliable enough +to be included in an output alignment or a subsequent search round. +In +.BR nhmmscan , +which does not have any alignment output (like +.BR nhmmer ), +inclusion thresholds have little effect. They only affect what hits +get marked as significant (!) or questionable (?) in hit +output. + +.TP +.BI \-\-incE " " +Use an E-value of <= +.I +as the inclusion threshold. +The default is 0.01, meaning that on average, about 1 false positive +would be expected in every 100 searches with different query +sequences. + +.TP +.BI \-\-incT " " +Instead of using E-values for setting the inclusion threshold, +use a bit score of >= +.I +as the inclusion threshold. +It would be unusual to use bit score thresholds with +.BR hmmscan , +because you don't expect a single score threshold to work for +different profiles; different profiles have slightly different +expected score distributions. 
+ + + +.SH OPTIONS FOR MODEL-SPECIFIC SCORE THRESHOLDING + +Curated profile databases may define specific bit score thresholds for +each profile, superseding any thresholding based on statistical +significance alone. + +To use these options, the profile must contain the appropriate (GA, +TC, and/or NC) optional score threshold annotation; this is picked up +by +.B hmmbuild +from Stockholm format alignment files. For a nucleotide model, each +thresholding option has a single per-hit threshold. +This acts as if +.BI \-T " <x>" +.BI \-\-incT " <x>" +has been applied specifically using each model's curated thresholds. + +.TP +.B \-\-cut_ga +Use the GA (gathering) bit score threshold in the model to set +per-hit reporting and inclusion +thresholds. GA thresholds are generally considered to be the +reliable curated thresholds defining family membership; for example, +in Dfam, these thresholds are applied when annotating a genome +with a model of a family known to be found in that organism. They +may allow for minimal expected false discovery rate. + +.TP +.B \-\-cut_nc +Use the NC (noise cutoff) bit score threshold in the model to set +per-hit reporting and inclusion +thresholds. NC thresholds are less stringent than GA; in the context +of Pfam, they are generally used to store the score of the +highest-scoring known false positive. + +.TP +.B \-\-cut_tc +Use the TC (trusted cutoff) bit score threshold in the model to set +per-hit reporting and inclusion +thresholds. TC thresholds are more stringent than GA, and are +generally considered to be the score of the lowest-scoring known +true positive that is above all known false positives; for example, +in Dfam, these thresholds are applied when annotating a genome +with a model of a family not known to be found in that organism. + + + +.SH CONTROL OF THE ACCELERATION PIPELINE + +HMMER3 searches are accelerated in a three-step filter pipeline: the +scanning-SSV filter, the Viterbi filter, and the Forward filter.
The +first filter is the fastest and most approximate; the last is the full +Forward scoring algorithm. There is also a bias filter step between +SSV and Viterbi. Targets that pass all the steps in the acceleration +pipeline are then subjected to postprocessing -- domain +identification and scoring using the Forward/Backward algorithm. + +Changing filter thresholds only removes or includes targets from +consideration; changing filter thresholds does not alter bit scores, +E-values, or alignments, all of which are determined solely in +postprocessing. + +.TP +.B \-\-max +Turn off (nearly) all filters, including the bias filter, and run full +Forward/Backward postprocessing on most of the target sequence. +In contrast to +.B hmmscan, +where this flag really does turn off the filters entirely, the +.B \-\-max +flag in +.B nhmmscan +sets the scanning-SSV filter threshold to 0.4, not 1.0. Use of this +flag increases sensitivity somewhat, at a large cost in speed. + +.TP +.BI \-\-F1 " " +Set the P-value threshold for the MSV filter step. The default is +0.02, meaning that roughly 2% of the highest scoring nonhomologous +targets are expected to pass the filter. + +.TP +.BI \-\-F2 " " +Set the P-value threshold for the Viterbi filter step. +The default is 0.001. + +.TP +.BI \-\-F3 " " +Set the P-value threshold for the Forward filter step. +The default is 1e-5. + +.TP +.B \-\-nobias +Turn off the bias filter. This increases sensitivity somewhat, but can +come at a high cost in speed, especially if the query has biased +residue composition (such as a repetitive sequence region, or if it is +a membrane protein with large regions of hydrophobicity). Without the +bias filter, too many sequences may pass the filter with biased +queries, leading to slower than expected performance as the +computationally intensive Forward/Backward algorithms shoulder an +abnormally heavy load. 
+ + + +.SH OTHER OPTIONS + +.TP +.B \-\-nonull2 +Turn off the null2 score corrections for biased composition. + +.TP +.BI \-Z " " +Assert that the total number of targets in your searches is +.IR , +for the purposes of per-sequence E-value calculations, +rather than the actual number of targets seen. + +.TP +.BI \-\-seed " " +Set the random number seed to +.IR . +Some steps in postprocessing require Monte Carlo simulation. The +default is to use a fixed seed (42), so that results are exactly +reproducible. Any other positive integer will give different (but also +reproducible) results. A choice of 0 uses an arbitrarily chosen seed. + +.TP +.BI \-\-qformat " " +Assert that input query +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + + +.TP +.BI \-\-w_beta " " +Window length tail mass. +The upper bound, W, +on the length at which nhmmer expects to find an instance of the +model is set such that the fraction of all sequences generated +by the model with length >= W is less than +.IR . +The default is 1e-7. +This flag may be used to override the value of W +established for the model by +.BR hmmbuild . + + + +.TP +.BI \-\-w_length " " +Override the model instance length upper bound, W, +which is otherwise controlled by +.BR \-\-w_beta . +It should be larger than the model length. The value of W +is used deep in the acceleration pipeline, and modest changes +are not expected to impact results (though larger values of W +do lead to longer run time). +This flag may be used to override the value of W +established for the model by +.BR hmmbuild . 
+ + +.TP +.B \-\-watson +Only search the top strand. By default both the query sequence +and its reverse-complement are searched. + +.TP +.B \-\-crick +Only search the bottom (reverse-complement) strand. By +default both the query sequence and its reverse-complement are searched. + + +.TP +.BI \-\-cpu " " +Set the number of parallel worker threads to +.IR . +On multicore machines, the default is 2. +You can also control this number by setting an environment variable, +.IR HMMER_NCPU . +There is also a master thread, so the actual number of threads that +HMMER spawns is +.IR +1. + +This option is not available if HMMER was compiled with POSIX threads +support turned off. + + + + +.TP +.BI \-\-stall +For debugging the MPI master/worker version: pause after start, to +enable the developer to attach debuggers to the running master and +worker(s) processes. Send SIGCONT signal to release the pause. +(Under gdb: +.BR "(gdb) signal SIGCONT" ) + +(Only available if optional MPI support was enabled at compile-time.) + +.TP +.BI \-\-mpi +Run under MPI control with master/worker parallelization (using +.BR mpirun , +for example, or equivalent). Only available if optional MPI support +was enabled at compile-time. + + + + + + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). 
+ + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + diff --git a/bioinformaticsProject/hmmer/share/man/man1/phmmer.1 b/bioinformaticsProject/hmmer/share/man/man1/phmmer.1 new file mode 100644 index 0000000..4b70f1a --- /dev/null +++ b/bioinformaticsProject/hmmer/share/man/man1/phmmer.1 @@ -0,0 +1,523 @@ +.TH "phmmer" 1 "Nov 2020" "HMMER 3.3.2" "HMMER Manual" + +.SH NAME +phmmer \- search protein sequence(s) against a protein sequence database + + +.SH SYNOPSIS +.B phmmer +[\fIoptions\fR] +.I seqfile +.I seqdb + + +.SH DESCRIPTION + +.PP +.B phmmer +is used to search one or more query protein sequences against a protein sequence database. +For each query sequence in +.IR seqfile , +use that sequence to search the target database of sequences in +.IR seqdb , +and output ranked lists of the sequences with the most significant +matches to the query. + +.PP +Either the query +.I seqfile +or the target +.I seqdb +may be '\-' (a dash character), in which case +the query sequences or target database input will be read from a pipe instead of from a +file. Only one input source can come through stdin, not both. +An exception is that if the +.I seqfile +contains more than one query sequence, then +.I seqdb +cannot come from stdin, because we can't rewind the +streaming target database to search it with another query. + + +.PP +The output format is designed to be human-readable, but is often so +voluminous that reading it is impractical, and parsing it is a pain. The +.B \-\-tblout +and +.B \-\-domtblout +options save output in simple tabular formats that are concise and +easier to parse. +The +.B \-o +option allows redirecting the main output, including throwing it away +in /dev/null. + +.SH OPTIONS + +.TP +.B \-h +Help; print a brief reminder of command line usage and all available +options. + + +.SH OPTIONS FOR CONTROLLING OUTPUT + +.TP +.BI \-o " <f>" +Direct the main human-readable output to a file +.I <f> +instead of the default stdout.
+ +.TP +.BI \-A " " +Save a multiple alignment of all significant hits (those satisfying +inclusion thresholds) +to the file +.I +in Stockholm format. + +.TP +.BI \-\-tblout " " +Save a simple tabular (space-delimited) file summarizing the +per-target output, with one data line per homologous target sequence +found. + +.TP +.BI \-\-domtblout " " +Save a simple tabular (space-delimited) file summarizing the +per-domain output, with one data line per homologous domain +detected in a query sequence for each homologous model. + +.TP +.B \-\-acc +Use accessions instead of names in the main output, where available +for profiles and/or sequences. + +.TP +.B \-\-noali +Omit the alignment section from the main output. This can greatly +reduce the output volume. + +.TP +.B \-\-notextw +Unlimit the length of each line in the main output. The default +is a limit of 120 characters per line, which helps in displaying +the output cleanly on terminals and in editors, but can truncate +target profile description lines. + +.TP +.BI \-\-textw " " +Set the main output's line length limit to +.I +characters per line. The default is 120. + + + +.SH OPTIONS CONTROLLING SCORING SYSTEM + +The probability model in +.B phmmer +is constructed by inferring residue probabilities from a standard +20x20 substitution score matrix, plus two additional parameters for +position-independent gap open and gap extend probabilities. + +.TP +.BI \-\-popen " " +Set the gap open probability for a single sequence query model to +.IR . +The default is 0.02. +.I +must be >= 0 and < 0.5. + +.TP +.BI \-\-pextend " " +Set the gap extend probability for a single sequence query model to +.IR . +The default is 0.4. +.I +must be >= 0 and < 1.0. + +.TP +.BI \-\-mx " " +Obtain residue alignment probabilities from the built-in +substitution matrix named +.IR . +Several standard matrices are built-in, and do not need to be +read from files. 
+The matrix name +.I +can be +PAM30, PAM70, PAM120, PAM240, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, +or BLOSUM90. +Only one of the +.B \-\-mx +and +.B \-\-mxfile +options may be used. + +.TP +.BI \-\-mxfile " mxfile" +Obtain residue alignment probabilities from the substitution matrix +in file +.IR mxfile . +The default score matrix is BLOSUM62 (this matrix is internal to +HMMER and does not have to be available as a file). +The format of a substitution matrix +.I mxfile +is the standard format accepted by BLAST, FASTA, and other sequence +analysis software. +See ftp.ncbi.nlm.nih.gov/blast/matrices/ for example files. (The only +exception: we require matrices to be square, so for DNA, use files +like NCBI's NUC.4.4, not NUC.4.2.) + + + +.SH OPTIONS CONTROLLING REPORTING THRESHOLDS + +Reporting thresholds control which hits are reported in output files +(the main output, +.BR \-\-tblout , +and +.BR \-\-domtblout ). +Sequence hits and domain hits are ranked by statistical significance +(E-value) and output is generated in two sections called per-target +and per-domain output. In per-target output, by default, all +sequence hits with an E-value <= 10 are reported. In the per-domain +output, for each target that has passed per-target reporting +thresholds, all domains satisfying per-domain reporting thresholds are +reported. By default, these are domains with conditional E-values of +<= 10. The following options allow you to change the default +E-value reporting thresholds, or to use bit score thresholds instead. + + +.TP +.BI \-E " " +In the per-target output, report target sequences with an E-value of <= +.IR . +The default is 10.0, meaning that on average, about 10 false positives +will be reported per query, so you can see the top of the noise +and decide for yourself if it's really noise. + +.TP +.BI \-T " " +Instead of thresholding per-profile output on E-value, instead +report target sequences with a bit score of >= +.IR . 
+ +.TP +.BI \-\-domE " " +In the per-domain output, for target sequences that have already satisfied +the per-profile reporting threshold, report individual domains +with a conditional E-value of <= +.IR . +The default is 10.0. +A conditional E-value means the expected number of additional false +positive domains in the smaller search space of those comparisons that +already satisfied the per-target reporting threshold (and thus must +have at least one homologous domain already). + +.TP +.BI \-\-domT " " +Instead of thresholding per-domain output on E-value, instead +report domains with a bit score of >= +.IR . + +.SH OPTIONS CONTROLLING INCLUSION THRESHOLDS + +Inclusion thresholds are stricter than reporting thresholds. They +control which hits are included in any output multiple alignment (the +.B \-A +option) and which domains are marked as significant ("!") as opposed +to questionable ("?") in domain output. + +.TP +.BI \-\-incE " " +Use an E-value of <= +.I +as the per-target inclusion threshold. +The default is 0.01, meaning that on average, about 1 false positive +would be expected in every 100 searches with different query +sequences. + +.TP +.BI \-\-incT " " +Instead of using E-values for setting the inclusion threshold, instead +use a bit score of >= +.I +as the per-target inclusion threshold. +By default this option is unset. + +.TP +.BI \-\-incdomE " " +Use a conditional E-value of <= +.I +as the per-domain inclusion threshold, in targets that have already +satisfied the overall per-target inclusion threshold. +The default is 0.01. + +.TP +.BI \-\-incdomT " " +Instead of using E-values, +use a bit score of >= +.I +as the per-domain inclusion threshold. +By default this option is unset. + + + + +.SH OPTIONS CONTROLLING THE ACCELERATION PIPELINE + +HMMER3 searches are accelerated in a three-step filter pipeline: the +MSV filter, the Viterbi filter, and the Forward filter. 
The first +filter is the fastest and most approximate; the last is the full +Forward scoring algorithm, slowest but most accurate. There is also a +bias filter step between MSV and Viterbi. Targets that pass all the +steps in the acceleration pipeline are then subjected to +postprocessing -- domain identification and scoring using the +Forward/Backward algorithm. + +Essentially the only free parameters that control HMMER's heuristic +filters are the P-value thresholds controlling the expected fraction +of nonhomologous sequences that pass the filters. Setting the default +thresholds higher will pass a higher proportion of nonhomologous +sequence, increasing sensitivity at the expense of speed; conversely, +setting lower P-value thresholds will pass a smaller proportion, +decreasing sensitivity and increasing speed. Setting a filter's +P-value threshold to 1.0 means it will pass all sequences, and +effectively disables the filter. + +Changing filter thresholds only removes or includes targets from +consideration; changing filter thresholds does not alter bit scores, +E-values, or alignments, all of which are determined solely in +postprocessing. + +.TP +.B \-\-max +Maximum sensitivity. Turn off all filters, including the bias filter, +and run full Forward/Backward postprocessing on every target. This +increases sensitivity slightly, at a large cost in speed. + +.TP +.BI \-\-F1 " <x>" +First filter threshold; set the P-value threshold for the MSV filter +step. The default is 0.02, meaning that roughly 2% of the highest +scoring nonhomologous targets are expected to pass the filter. + +.TP +.BI \-\-F2 " <x>" +Second filter threshold; set the P-value threshold for the Viterbi +filter step. The default is 0.001. + +.TP +.BI \-\-F3 " <x>" +Third filter threshold; set the P-value threshold for the Forward +filter step. The default is 1e-5. + +.TP +.B \-\-nobias +Turn off the bias filter.
This increases sensitivity somewhat, but can +come at a high cost in speed, especially if the query has biased +residue composition (such as a repetitive sequence region, or if it is +a membrane protein with large regions of hydrophobicity). Without the +bias filter, too many sequences may pass the filter with biased +queries, leading to slower than expected performance as the +computationally intensive Forward/Backward algorithms shoulder an +abnormally heavy load. + + + + +.SH OPTIONS CONTROLLING E-VALUE CALIBRATION + +Estimating the location parameters for the expected score +distributions for MSV filter scores, Viterbi filter scores, and +Forward scores requires three short random sequence simulations. + +.TP +.BI \-\-EmL " <n>" +Sets the sequence length in simulation that estimates the location +parameter mu for MSV filter E-values. Default is 200. + +.TP +.BI \-\-EmN " <n>" +Sets the number of sequences in simulation that estimates the location +parameter mu for MSV filter E-values. Default is 200. + +.TP +.BI \-\-EvL " <n>" +Sets the sequence length in simulation that estimates the location +parameter mu for Viterbi filter E-values. Default is 200. + +.TP +.BI \-\-EvN " <n>" +Sets the number of sequences in simulation that estimates the location +parameter mu for Viterbi filter E-values. Default is 200. + +.TP +.BI \-\-EfL " <n>" +Sets the sequence length in simulation that estimates the location +parameter tau for Forward E-values. Default is 100. + +.TP +.BI \-\-EfN " <n>" +Sets the number of sequences in simulation that estimates the location +parameter tau for Forward E-values. Default is 200. + +.TP +.BI \-\-Eft " <x>" +Sets the tail mass fraction to fit in the simulation that estimates +the location parameter tau for Forward E-values. Default is 0.04. + + + + +.SH OTHER OPTIONS + +.TP +.B \-\-nonull2 +Turn off the null2 score corrections for biased composition.
+ +.TP +.BI \-Z " <x>" +Assert that the total number of targets in your searches is +.IR <x> , +for the purposes of per-sequence E-value calculations, +rather than the actual number of targets seen. + +.TP +.BI \-\-domZ " <x>" +Assert that the total number of targets in your searches is +.IR <x> , +for the purposes of per-domain conditional E-value calculations, +rather than the number of targets that passed the reporting thresholds. + +.TP +.BI \-\-seed " <n>" +Seed the random number generator with +.IR <n> , +an integer >= 0. +If +.I <n> +is >0, any stochastic simulations will be reproducible; the same +command will give the same results. +If +.I <n> +is 0, the random number generator is seeded arbitrarily, and +stochastic simulations will vary from run to run of the same command. +The default seed is 42. + +.TP +.BI \-\-qformat " <s>" +Assert that input +.I seqfile +is in format +.IR <s> , +bypassing format autodetection. +Common choices for +.I <s> +include: +.BR fasta , +.BR embl , +.BR genbank . +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +.B phmmer +always uses a single sequence query to start its search, so when the input +.I seqfile +is an alignment, +.B phmmer +reads it one unaligned query sequence at a time, not as an alignment. +For more information, and for codes for some less common formats, +see main documentation. +The string +.I <s> +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + +.BI \-\-tformat " <s>" +Assert that target sequence database +.I seqdb +is in format +.IR <s> , +bypassing format autodetection. +See +.B \-\-qformat +above for list of accepted format codes for +.IR <s> . + + +.TP +.BI \-\-cpu " <n>" +Set the number of parallel worker threads to +.IR <n> . +On multicore machines, the default is 2. +You can also control this number by setting an environment variable, +.IR HMMER_NCPU . +There is also a master thread, so the actual number of threads that +HMMER spawns is +.IR <n> +1.
+ +This option is not available if HMMER was compiled with POSIX threads +support turned off. + + + +.TP +.BI \-\-stall +For debugging the MPI master/worker version: pause after start, to +enable the developer to attach debuggers to the running master and +worker(s) processes. Send SIGCONT signal to release the pause. +(Under gdb: +.BR "(gdb) signal SIGCONT" ) +(Only available if optional MPI support was enabled at compile-time.) + +.TP +.BI \-\-mpi +Run under MPI control with master/worker parallelization (using +.BR mpirun , +for example, or equivalent). Only available if optional MPI support +was enabled at compile-time. + + + + + +.SH SEE ALSO + +See +.BR hmmer (1) +for a master man page with a list of all the individual man pages +for programs in the HMMER package. + +.PP +For complete documentation, see the user guide that came with your +HMMER distribution (Userguide.pdf); or see the HMMER web page +(http://hmmer.org/). + + + +.SH COPYRIGHT + +.nf +Copyright (C) 2020 Howard Hughes Medical Institute. +Freely distributed under the BSD open source license. +.fi + +For additional information on copyright and licensing, see the file +called COPYRIGHT in your HMMER source distribution, or see the HMMER +web page +(http://hmmer.org/). 
+ + +.SH AUTHOR + +.nf +http://eddylab.org +.fi + diff --git a/bioinformaticsProject/hsp70search.fasta b/bioinformaticsProject/hsp70search.fasta new file mode 100644 index 0000000..9ba5e5f --- /dev/null +++ b/bioinformaticsProject/hsp70search.fasta @@ -0,0 +1,16 @@ +# --- full sequence ---- --- best 1 domain ---- --- domain number estimation ---- +# target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target +#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ ----- --- --- --- --- --- --- --- --- --------------------- +WP_048118742.1 - hsp70Alignment - 0 1101.3 23.8 0 1101.1 23.8 1.0 1 0 0 1 1 1 1 MULTISPECIES: molecular chaperone DnaK +WP_048116754.1 - hsp70Alignment - 6.5e-110 365.8 8.9 2.2e-100 334.3 3.6 2.0 2 0 0 2 2 2 2 MULTISPECIES: hypothetical protein +WP_048116752.1 - hsp70Alignment - 6.8e-107 355.8 13.6 8.6e-100 332.4 6.5 2.0 2 0 0 2 2 2 2 MULTISPECIES: Hsp70 family protein +# +# Program: hmmsearch +# Version: 3.3.2 (Nov 2020) +# Pipeline mode: SEARCH +# Query file: ./ref_sequences/hsp70build.fasta +# Target file: ./proteomes/proteome_50.fasta +# Option settings: ./hmmer/bin/hmmsearch --tblout hsp70search.fasta ./ref_sequences/hsp70build.fasta ./proteomes/proteome_50.fasta +# Current dir: /afs/crc.nd.edu/user/n/nross3/Private/Biocomp_Project/bioinformaticsProject +# Date: Fri Oct 15 10:02:56 2021 +# [ok] diff --git a/bioinformaticsProject/mcrAsearch.fasta b/bioinformaticsProject/mcrAsearch.fasta new file mode 100644 index 0000000..fef9962 --- /dev/null +++ b/bioinformaticsProject/mcrAsearch.fasta @@ -0,0 +1,14 @@ +# --- full sequence ---- --- best 1 domain ---- --- domain number estimation ---- +# target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target +#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- 
------ ----- --- --- --- --- --- --- --- --- --------------------- +WP_048120720.1 - mcrAlignment - 0 1125.7 2.5 0 1125.5 2.5 1.0 1 0 0 1 1 1 1 MULTISPECIES: coenzyme-B sulfoethylthiotransferase subunit alpha +# +# Program: hmmsearch +# Version: 3.3.2 (Nov 2020) +# Pipeline mode: SEARCH +# Query file: ./ref_sequences/mcrAbuild.fasta +# Target file: ./proteomes/proteome_50.fasta +# Option settings: ./hmmer/bin/hmmsearch --tblout mcrAsearch.fasta ./ref_sequences/mcrAbuild.fasta ./proteomes/proteome_50.fasta +# Current dir: /afs/crc.nd.edu/user/n/nross3/Private/Biocomp_Project/bioinformaticsProject +# Date: Fri Oct 15 10:02:55 2021 +# [ok] diff --git a/bioinformaticsProject/muscle b/bioinformaticsProject/muscle new file mode 100755 index 0000000..569be49 Binary files /dev/null and b/bioinformaticsProject/muscle differ