From 8ffb42f027aa493c721f5b3d13c987a1d854dbac Mon Sep 17 00:00:00 2001 From: Casey Novak Date: Wed, 13 Oct 2021 11:48:20 -0400 Subject: [PATCH 01/13] hmm profile built for mcrA and hsp70 genes --- bioinformaticsProject/BioProject.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 bioinformaticsProject/BioProject.sh diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh new file mode 100644 index 0000000..8df04bf --- /dev/null +++ b/bioinformaticsProject/BioProject.sh @@ -0,0 +1,22 @@ +#combine reference sequence files for mcrAgene and hsp70gene + +cat mcrA_combined_file.fasta +cat hsp70gene_combined_file.fasta + +# Align mcrA reference sequences using muscle and build an hmm profile using hmm build + +../../../muscle3.8.31_i86linux64 -in mcrAgene_combined_file.fasta -out mcrA_muscle_results +~/Private/bin/bin/hmmbuild mcrA_hmmbuild_results mcrA_muscle_results + +# Search proteomes using hmm profile for mcrA gene + +for proteome in proteomes; do ~/Private/bin/bin/hmmsearch --tblout mcrA_search_results ./ref_sequences/ mcrA_hmmbuild_results ./proteomes/proteome_*.fasta >> proteome_mcrA_results; done + + + + + +# ALigns hsp70 reference sequences using muscle and build an hmm profile using hmm build + +../../../muscle3.8.31_i86linux64 -in hsp70gene_combined_file.fasta -out hsp70_muscle_results +~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results hsp70_muscle_results From 4e415345999eb12a9e592a8ca7d44278afe2d478 Mon Sep 17 00:00:00 2001 From: Casey Novak Date: Wed, 13 Oct 2021 11:51:59 -0400 Subject: [PATCH 02/13] Edited cat commands --- bioinformaticsProject/BioProject.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh index 8df04bf..73ed165 100644 --- a/bioinformaticsProject/BioProject.sh +++ b/bioinformaticsProject/BioProject.sh @@ -1,7 +1,7 @@ #combine reference sequence files for mcrAgene and hsp70gene -cat mcrA_combined_file.fasta -cat hsp70gene_combined_file.fasta +cat hsp70gene_*.fasta >> hsp70gene_combined_file.fasta +cat mcrAgene_*.fasta >> mcrAgene_combined_file.fasta # Align mcrA reference sequences using muscle and build an hmm profile using hmm build From dfe13fb5a22471bfc844b4573a376f92a916cf71 Mon Sep 17 00:00:00 2001 From: Casey Novak Date: Wed, 13 Oct 2021 12:25:44 -0400 Subject: [PATCH 03/13] Was able to get results for one proteome, still need to make loop to search through all 50 --- bioinformaticsProject/BioProject.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh index 73ed165..587d875 100644 --- a/bioinformaticsProject/BioProject.sh +++ b/bioinformaticsProject/BioProject.sh @@ -12,7 +12,9 @@ cat mcrAgene_*.fasta >> mcrAgene_combined_file.fasta for proteome in proteomes; do ~/Private/bin/bin/hmmsearch --tblout mcrA_search_results ./ref_sequences/ mcrA_hmmbuild_results ./proteomes/proteome_*.fasta >> proteome_mcrA_results; done +#Worked for one file, need to turn into loop +~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome01 ./ref_sequences/mcrA_hmmbuild_results ./proteomes/proteome_01.fasta From fa269e03e7455dbf7e8d77cd2b7c93b01df176ef Mon Sep 17 00:00:00 2001 From: Erin Maron Date: Wed, 13 Oct 2021 15:04:23 -0400 Subject: [PATCH 04/13] try this --- bioinformaticsProject/BioProject.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh index 587d875..253911d 100644 --- a/bioinformaticsProject/BioProject.sh +++ b/bioinformaticsProject/BioProject.sh @@ -10,7 +10,10 @@ cat mcrAgene_*.fasta >> mcrAgene_combined_file.fasta # Search proteomes using hmm profile for mcrA gene -for proteome in proteomes; do ~/Private/bin/bin/hmmsearch --tblout mcrA_search_results ./ref_sequences/ mcrA_hmmbuild_results ./proteomes/proteome_*.fasta >> proteome_mcrA_results; done +for proteome in proteomes +do +~/Private/bin/bin/hmmsearch --tblout mcrA_search_results ./ref_sequences/mcrA_hmmbuild_results ./proteomes/*.fasta >> proteome_mcrA_results +done #Worked for one file, need to turn into loop From b3b11698f1332aa574d37d9f21c488843f9e0632 Mon Sep 17 00:00:00 2001 From: Casey Novak Date: Wed, 13 Oct 2021 18:04:25 -0400 Subject: [PATCH 05/13] For loops are working now and code runs from bashproject directory --- bioinformaticsProject/BioProject.sh | 32 ++++++++++++++++------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh index 253911d..b73e1b8 100644 --- a/bioinformaticsProject/BioProject.sh +++ b/bioinformaticsProject/BioProject.sh @@ -1,27 +1,31 @@ -#combine reference sequence files for mcrAgene and hsp70gene +# Usage: bash BioProject.sh +# must be used in bioinformatics folder for relative paths to work, should we change this so that it can be used from anywhere? +# Path to bioinformatics folder: /afs/crc.nd.edu/user/c/cnovak/Private/bashproject/Biocomputing_Project/bioinformaticsProject -cat hsp70gene_*.fasta >> hsp70gene_combined_file.fasta -cat mcrAgene_*.fasta >> mcrAgene_combined_file.fasta +# Combine reference sequence files for mcrAgene + +#cat hsp70gene_*.fasta >> hsp70gene_combined_file.fasta +cat ./ref_sequences/mcrAgene_*.fasta >> mcrAgene_combined_file1.fasta # Align mcrA reference sequences using muscle and build an hmm profile using hmm build -../../../muscle3.8.31_i86linux64 -in mcrAgene_combined_file.fasta -out mcrA_muscle_results -~/Private/bin/bin/hmmbuild mcrA_hmmbuild_results mcrA_muscle_results +~/Private/bashproject/muscle3.8.31_i86linux64 -in mcrAgene_combined_file1.fasta -out mcrA_muscle_results1 +~/Private/bin/bin/hmmbuild mcrA_hmmbuild_results1 mcrA_muscle_results1 # Search proteomes using hmm profile for mcrA gene -for proteome in proteomes -do -~/Private/bin/bin/hmmsearch --tblout mcrA_search_results ./ref_sequences/mcrA_hmmbuild_results ./proteomes/*.fasta >> proteome_mcrA_results -done +for proteome in {01..50}; do ~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta; done + +# Combine reference sequence files for hsp70gene -#Worked for one file, need to turn into loop +cat ./ref_sequences/hsp70gene_*.fasta >> hsp70gene_combined_file1.fasta -~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome01 ./ref_sequences/mcrA_hmmbuild_results ./proteomes/proteome_01.fasta +# Align hsp70 reference sequences using muscle and build an hmm profile using hmm build +#~/Private/bashproject/muscle3.8.31_i86linux64 -in hsp70gene_combined_file1.fasta -out hsp70_muscle_results1 +#~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results1 hsp70_muscle_results1 +# Search proteomes using hmm profile for hsp70 gene -# ALigns hsp70 reference sequences using muscle and build an hmm profile using hmm build +for proteome in {01..50}; do ~/Private/bin/bin/hmmsearch --tblout hsp70_hmmsearch_results_proteome$proteome hsp70_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta; done -../../../muscle3.8.31_i86linux64 -in hsp70gene_combined_file.fasta -out hsp70_muscle_results -~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results hsp70_muscle_results From a7e9f5d68fd244a173b1dfafbebaddcaa64cc129 Mon Sep 17 00:00:00 2001 From: Casey Novak Date: Wed, 13 Oct 2021 18:43:51 -0400 Subject: [PATCH 06/13] Updated with loops tested and running for both genes --- bioinformaticsProject/BioProject.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh index b73e1b8..a8d9982 100644 --- a/bioinformaticsProject/BioProject.sh +++ b/bioinformaticsProject/BioProject.sh @@ -22,8 +22,8 @@ cat ./ref_sequences/hsp70gene_*.fasta >> hsp70gene_combined_file1.fasta # Align hsp70 reference sequences using muscle and build an hmm profile using hmm build -#~/Private/bashproject/muscle3.8.31_i86linux64 -in hsp70gene_combined_file1.fasta -out hsp70_muscle_results1 -#~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results1 hsp70_muscle_results1 +~/Private/bashproject/muscle3.8.31_i86linux64 -in hsp70gene_combined_file1.fasta -out hsp70_muscle_results1 +~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results1 hsp70_muscle_results1 # Search proteomes using hmm profile for hsp70 gene From 2ca0aadfc2d3358c2afa246de87e406d3bf33bb3 Mon Sep 17 00:00:00 2001 From: Casey Novak Date: Wed, 13 Oct 2021 20:03:03 -0400 Subject: [PATCH 07/13] Combined loops, created a table with results, need to add headers to table --- bioinformaticsProject/BioProject.sh | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh index a8d9982..ae45dcf 100644 --- a/bioinformaticsProject/BioProject.sh +++ b/bioinformaticsProject/BioProject.sh @@ -13,8 +13,7 @@ cat ./ref_sequences/mcrAgene_*.fasta >> mcrAgene_combined_file1.fasta ~/Private/bin/bin/hmmbuild mcrA_hmmbuild_results1 mcrA_muscle_results1 # Search proteomes using hmm profile for mcrA gene - -for proteome in {01..50}; do ~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta; done +#for proteome in {01..50}; do ~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta; done # Combine reference sequence files for hsp70gene @@ -25,7 +24,18 @@ cat ./ref_sequences/hsp70gene_*.fasta >> hsp70gene_combined_file1.fasta ~/Private/bashproject/muscle3.8.31_i86linux64 -in hsp70gene_combined_file1.fasta -out hsp70_muscle_results1 ~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results1 hsp70_muscle_results1 +# Create output file +Echo 'Proteome_number, mcrA_matches, hsp70_matches' > summary_table.csv + # Search proteomes using hmm profile for hsp70 gene -for proteome in {01..50}; do ~/Private/bin/bin/hmmsearch --tblout hsp70_hmmsearch_results_proteome$proteome hsp70_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta; done +for proteome in {01..50} +do +~/Private/bin/bin/hmmsearch --tblout hsp70_hmmsearch_results_proteome$proteome hsp70_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta +hsp70matches=$(cat hsp70_hmmsearch_results_proteome$proteome | grep -E -v "#" | wc -l) +proteome_name=proteome$proteome +~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta +mcrAmatches=$(cat mcrA_hmmsearch_results_proteome$proteome | grep -E -v "#" | wc -l) +echo $proteome_name, $mcrAmatches, $hsp70matches >> summary_table.csv +done From 34479ae921e59e6e4710da14cfce2ee0c6dceecc Mon Sep 17 00:00:00 2001 From: Erin Maron Date: Wed, 13 Oct 2021 23:16:17 -0400 Subject: [PATCH 08/13] edited --- BioProject.sh | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 BioProject.sh diff --git a/BioProject.sh b/BioProject.sh new file mode 100644 index 0000000..40b55c4 --- /dev/null +++ b/BioProject.sh @@ -0,0 +1,45 @@ +# Usage: bash BioProject.sh +# must be used in bioinformatics folder for relative paths to work, should we change this so that it can be used from anywhere? +# Path to bioinformatics folder: /afs/crc.nd.edu/user/c/cnovak/Private/bashproject/Biocomputing_Project/bioinformaticsProject + +# Combine reference sequence files for mcrAgene + +cat ./ref_sequences/mcrAgene_*.fasta >> mcrAgene_combined_file1.fasta + +# Align mcrA reference sequences using muscle and build an hmm profile using hmm build + +~/Private/bashproject/muscle3.8.31_i86linux64 -in mcrAgene_combined_file1.fasta -out mcrA_muscle_results1 +~/Private/bin/bin/hmmbuild mcrA_hmmbuild_results1 mcrA_muscle_results1 + +# Search proteomes using hmm profile for mcrA gene +#for proteome in {01..50}; do ~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta; done + +# Combine reference sequence files for hsp70gene + +cat ./ref_sequences/hsp70gene_*.fasta >> hsp70gene_combined_file1.fasta + +# Align hsp70 reference sequences using muscle and build an hmm profile using hmm build + +~/Private/bashproject/muscle3.8.31_i86linux64 -in hsp70gene_combined_file1.fasta -out hsp70_muscle_results1 +~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results1 hsp70_muscle_results1 + +# Create output file +Echo 'Proteome_number, mcrA_matches, hsp70_matches' > summary_table.csv + +# Search proteomes using hmm profile for hsp70 gene + +for proteome in {01..50} +do +~/Private/bin/bin/hmmsearch --tblout hsp70_hmmsearch_results_proteome$proteome hsp70_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta +hsp70matches=$(cat hsp70_hmmsearch_results_proteome$proteome | grep -E -v "#" | wc -l) +proteome_name=proteome$proteome +~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta +mcrAmatches=$(cat mcrA_hmmsearch_results_proteome$proteome | grep -E -v "#" | wc -l) +echo $proteome_name, $mcrAmatches, $hsp70matches >> summary_table.csv +done + +#Titles +sed -i 1i "Proteome_number mcrA_matches Hsp70_matches" summary_table.csv| cat summary_table.csv| sort -t "," -k2 + +#Text File with final proteomes +cat summary_table.csv| sort -t "," -k2| tail -n -16| cut -d "," -f1 >> Proteome_Results.txt From 68c1304ca4d1fb97ee97698955ac9949c735724f Mon Sep 17 00:00:00 2001 From: erinmaron <91560265+erinmaron@users.noreply.github.com> Date: Thu, 14 Oct 2021 13:39:52 -0400 Subject: [PATCH 09/13] Update BioProject.sh --- BioProject.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/BioProject.sh b/BioProject.sh index 40b55c4..f7a1cf0 100644 --- a/BioProject.sh +++ b/BioProject.sh @@ -39,7 +39,8 @@ echo $proteome_name, $mcrAmatches, $hsp70matches >> summary_table.csv done #Titles -sed -i 1i "Proteome_number mcrA_matches Hsp70_matches" summary_table.csv| cat summary_table.csv| sort -t "," -k2 +echo Proteome_number mcrA_matches Hsp70_matches +cat summary_table.csv| sort -t "," -k2 #Text File with final proteomes cat summary_table.csv| sort -t "," -k2| tail -n -16| cut -d "," -f1 >> Proteome_Results.txt From b0021b899178ee38a5b506dea1fc136850e40eca Mon Sep 17 00:00:00 2001 From: Casey Novak Date: Thu, 14 Oct 2021 15:51:54 -0400 Subject: [PATCH 10/13] I think the code is done but we could maybe sort the summary table? Or add better comments at the beginning --- bioinformaticsProject/BioProject.sh | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh index ae45dcf..a13d816 100644 --- a/bioinformaticsProject/BioProject.sh +++ b/bioinformaticsProject/BioProject.sh @@ -3,32 +3,23 @@ # Path to bioinformatics folder: /afs/crc.nd.edu/user/c/cnovak/Private/bashproject/Biocomputing_Project/bioinformaticsProject # Combine reference sequence files for mcrAgene - -#cat hsp70gene_*.fasta >> hsp70gene_combined_file.fasta cat ./ref_sequences/mcrAgene_*.fasta >> mcrAgene_combined_file1.fasta # Align mcrA reference sequences using muscle and build an hmm profile using hmm build - ~/Private/bashproject/muscle3.8.31_i86linux64 -in mcrAgene_combined_file1.fasta -out mcrA_muscle_results1 ~/Private/bin/bin/hmmbuild mcrA_hmmbuild_results1 mcrA_muscle_results1 -# Search proteomes using hmm profile for mcrA gene -#for proteome in {01..50}; do ~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta; done - # Combine reference sequence files for hsp70gene - cat ./ref_sequences/hsp70gene_*.fasta >> hsp70gene_combined_file1.fasta # Align hsp70 reference sequences using muscle and build an hmm profile using hmm build - ~/Private/bashproject/muscle3.8.31_i86linux64 -in hsp70gene_combined_file1.fasta -out hsp70_muscle_results1 ~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results1 hsp70_muscle_results1 -# Create output file -Echo 'Proteome_number, mcrA_matches, hsp70_matches' > summary_table.csv - -# Search proteomes using hmm profile for hsp70 gene +# Create output file with titles for each column +echo "Proteome_number, mcrA_matches, hsp70_matches" > summary_table.csv +# Search proteomes using hmm profile for hsp70 gene and mcrA gene, isolate match results for each gene and send to summary table for proteome in {01..50} do ~/Private/bin/bin/hmmsearch --tblout hsp70_hmmsearch_results_proteome$proteome hsp70_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta @@ -39,3 +30,13 @@ mcrAmatches=$(cat mcrA_hmmsearch_results_proteome$proteome | grep -E -v "#" | wc echo $proteome_name, $mcrAmatches, $hsp70matches >> summary_table.csv done +# Sorted summary table with results from all searches +cat summary_table.csv + +# Text file with candidate pH-resistant methanogens +echo "These are the proteomes that we have identified to have the best possible chance of working. We selected these four because they all have the mcrA gene present, indicating that they are methanogenic organisms, and because these four have the greatest amount of matches for the hsp70 gene, meaning that they are the most pH resistant proteomes of the 50 that we were given. The next best proteomes only had two matches of the hsp70 gene instead of 3, so we used this as our decision point." > Proteome_Results.txt +cat summary_table.csv | grep -E -v P | sort -t "," -k2 -n | tail -n -16 | sort -t "," -k3 -n | tail -n -4 | cut -d "," -f1 >> Proteome_Results.txt +cat Proteome_Results.txt + + + From 0c96cc8a9a111b115cfb1a08d7bf7d921def4369 Mon Sep 17 00:00:00 2001 From: Casey Novak Date: Thu, 14 Oct 2021 18:46:53 -0400 Subject: [PATCH 11/13] Final Code --- bioinformaticsProject/BioProject.sh | 45 +++++++++++++++-------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh index a13d816..f0c21dd 100644 --- a/bioinformaticsProject/BioProject.sh +++ b/bioinformaticsProject/BioProject.sh @@ -1,42 +1,43 @@ +# Script searches genomes for genes of interest and produces a summary table with the results of the searches. +# Determines which of the 50 isolated proteomes are the best candidates for continued growth experiments by searching for matches to the mcrA gene and the hsp70 gene. # Usage: bash BioProject.sh -# must be used in bioinformatics folder for relative paths to work, should we change this so that it can be used from anywhere? -# Path to bioinformatics folder: /afs/crc.nd.edu/user/c/cnovak/Private/bashproject/Biocomputing_Project/bioinformaticsProject +# Script was written to be used from the bioinformaticeProject directory based on rlative paths but could be used from anywhere with adjusted paths to necessary files. +# Path to bioinformatics folder: ~/Private/bashproject/Biocomputing_Project/bioinformaticsProject # Combine reference sequence files for mcrAgene -cat ./ref_sequences/mcrAgene_*.fasta >> mcrAgene_combined_file1.fasta +cat ./ref_sequences/mcrAgene_*.fasta >> mcrAgene_combined_file.fasta -# Align mcrA reference sequences using muscle and build an hmm profile using hmm build -~/Private/bashproject/muscle3.8.31_i86linux64 -in mcrAgene_combined_file1.fasta -out mcrA_muscle_results1 -~/Private/bin/bin/hmmbuild mcrA_hmmbuild_results1 mcrA_muscle_results1 +# Align mcrA reference sequences using muscle and build an hmm profile using hmmbuild +~/Private/bashproject/muscle3.8.31_i86linux64 -in mcrAgene_combined_file.fasta -out mcrA_muscle_results +~/Private/bin/bin/hmmbuild mcrA_hmmbuild_results mcrA_muscle_results # Combine reference sequence files for hsp70gene -cat ./ref_sequences/hsp70gene_*.fasta >> hsp70gene_combined_file1.fasta +cat ./ref_sequences/hsp70gene_*.fasta >> hsp70gene_combined_file.fasta -# Align hsp70 reference sequences using muscle and build an hmm profile using hmm build -~/Private/bashproject/muscle3.8.31_i86linux64 -in hsp70gene_combined_file1.fasta -out hsp70_muscle_results1 -~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results1 hsp70_muscle_results1 +# Align hsp70 reference sequences using muscle and build an hmm profile using hmmbuild +~/Private/bashproject/muscle3.8.31_i86linux64 -in hsp70gene_combined_file.fasta -out hsp70_muscle_results +~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results hsp70_muscle_results -# Create output file with titles for each column +# Create output table with titles for each column echo "Proteome_number, mcrA_matches, hsp70_matches" > summary_table.csv -# Search proteomes using hmm profile for hsp70 gene and mcrA gene, isolate match results for each gene and send to summary table -for proteome in {01..50} +# Search proteomes using hmm profile for mcrA gene and hsp70 gene, isolate match results for each gene and append to summary table +for number in {01..50} do -~/Private/bin/bin/hmmsearch --tblout hsp70_hmmsearch_results_proteome$proteome hsp70_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta -hsp70matches=$(cat hsp70_hmmsearch_results_proteome$proteome | grep -E -v "#" | wc -l) -proteome_name=proteome$proteome -~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta -mcrAmatches=$(cat mcrA_hmmsearch_results_proteome$proteome | grep -E -v "#" | wc -l) +~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$number mcrA_hmmbuild_results ./proteomes/proteome_$number.fasta +mcrAmatches=$(cat mcrA_hmmsearch_results_proteome$number | grep -E -v "#" | wc -l) +~/Private/bin/bin/hmmsearch --tblout hsp70_hmmsearch_results_proteome$number hsp70_hmmbuild_results ./proteomes/proteome_$number.fasta +hsp70matches=$(cat hsp70_hmmsearch_results_proteome$number | grep -E -v "#" | wc -l) +proteome_name=proteome$number echo $proteome_name, $mcrAmatches, $hsp70matches >> summary_table.csv done -# Sorted summary table with results from all searches -cat summary_table.csv +# Summary table with results from all searches +cat summary_table.csv # Text file with candidate pH-resistant methanogens -echo "These are the proteomes that we have identified to have the best possible chance of working. We selected these four because they all have the mcrA gene present, indicating that they are methanogenic organisms, and because these four have the greatest amount of matches for the hsp70 gene, meaning that they are the most pH resistant proteomes of the 50 that we were given. The next best proteomes only had two matches of the hsp70 gene instead of 3, so we used this as our decision point." > Proteome_Results.txt +echo "These are the proteomes that we have identified as being the best candidates for continued growth experiments. We selected these proteomes because they all have the mcrA gene present, indicating that they are methanogens, and because these four proteomes all have three matches for the hsp70 gene, which is the most off all the methanogens present. This means that they are the most pH resistant proteomes of the methanogens, and will have the best chance at growth. The next best proteome candidates had only two hsp70 gene matches, so we used this as our decision point." > Proteome_Results.txt cat summary_table.csv | grep -E -v P | sort -t "," -k2 -n | tail -n -16 | sort -t "," -k3 -n | tail -n -4 | cut -d "," -f1 >> Proteome_Results.txt cat Proteome_Results.txt - From d5937a5f3780dab13103e5bed785daf7094cad5e Mon Sep 17 00:00:00 2001 From: Casey Novak Date: Fri, 15 Oct 2021 01:06:00 -0400 Subject: [PATCH 12/13] Final Answers for BioProject --- bioinformaticsProject/BioProject.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bioinformaticsProject/BioProject.sh b/bioinformaticsProject/BioProject.sh index f0c21dd..77c174b 100644 --- a/bioinformaticsProject/BioProject.sh +++ b/bioinformaticsProject/BioProject.sh @@ -1,7 +1,7 @@ # Script searches genomes for genes of interest and produces a summary table with the results of the searches. -# Determines which of the 50 isolated proteomes are the best candidates for continued growth experiments by searching for matches to the mcrA gene and the hsp70 gene. +# Determines which of the 50 isolated proteomes are the best candidates for continued growth experiments by searching for matches to the mcrA gene and the hsp70 gene. # Usage: bash BioProject.sh -# Script was written to be used from the bioinformaticeProject directory based on rlative paths but could be used from anywhere with adjusted paths to necessary files. +# Script was written to be used from the bioinformaticsProject directory based on relative paths but could be used from anywhere with adjusted paths to necessary files. # Path to bioinformatics folder: ~/Private/bashproject/Biocomputing_Project/bioinformaticsProject # Combine reference sequence files for mcrAgene @@ -36,7 +36,7 @@ done cat summary_table.csv # Text file with candidate pH-resistant methanogens -echo "These are the proteomes that we have identified as being the best candidates for continued growth experiments. We selected these proteomes because they all have the mcrA gene present, indicating that they are methanogens, and because these four proteomes all have three matches for the hsp70 gene, which is the most off all the methanogens present. This means that they are the most pH resistant proteomes of the methanogens, and will have the best chance at growth. The next best proteome candidates had only two hsp70 gene matches, so we used this as our decision point." > Proteome_Results.txt +echo "These are the proteomes that we have identified as being the best candidates for continued growth experiments. We selected these proteomes because they all have the mcrA gene present, indicating that they are methanogens, and because these four proteomes all have three matches for the hsp70 gene, which is the most of all the methanogens present. This means that they are the most pH resistant proteomes of the methanogens, and will have the best chance at growth. The next best proteome candidates had only two hsp70 gene matches, so we used this as our decision point." > Proteome_Results.txt cat summary_table.csv | grep -E -v P | sort -t "," -k2 -n | tail -n -16 | sort -t "," -k3 -n | tail -n -4 | cut -d "," -f1 >> Proteome_Results.txt cat Proteome_Results.txt From fd443b6542fbf7fdf605107de73ea6c14507a27b Mon Sep 17 00:00:00 2001 From: caseyenovak <89488861+caseyenovak@users.noreply.github.com> Date: Fri, 15 Oct 2021 01:09:56 -0400 Subject: [PATCH 13/13] Delete BioProject.sh --- BioProject.sh | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) delete mode 100644 BioProject.sh diff --git a/BioProject.sh b/BioProject.sh deleted file mode 100644 index f7a1cf0..0000000 --- a/BioProject.sh +++ /dev/null @@ -1,46 +0,0 @@ -# Usage: bash BioProject.sh -# must be used in bioinformatics folder for relative paths to work, should we change this so that it can be used from anywhere? -# Path to bioinformatics folder: /afs/crc.nd.edu/user/c/cnovak/Private/bashproject/Biocomputing_Project/bioinformaticsProject - -# Combine reference sequence files for mcrAgene - -cat ./ref_sequences/mcrAgene_*.fasta >> mcrAgene_combined_file1.fasta - -# Align mcrA reference sequences using muscle and build an hmm profile using hmm build - -~/Private/bashproject/muscle3.8.31_i86linux64 -in mcrAgene_combined_file1.fasta -out mcrA_muscle_results1 -~/Private/bin/bin/hmmbuild mcrA_hmmbuild_results1 mcrA_muscle_results1 - -# Search proteomes using hmm profile for mcrA gene -#for proteome in {01..50}; do ~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta; done - -# Combine reference sequence files for hsp70gene - -cat ./ref_sequences/hsp70gene_*.fasta >> hsp70gene_combined_file1.fasta - -# Align hsp70 reference sequences using muscle and build an hmm profile using hmm build - -~/Private/bashproject/muscle3.8.31_i86linux64 -in hsp70gene_combined_file1.fasta -out hsp70_muscle_results1 -~/Private/bin/bin/hmmbuild hsp70_hmmbuild_results1 hsp70_muscle_results1 - -# Create output file -Echo 'Proteome_number, mcrA_matches, hsp70_matches' > summary_table.csv - -# Search proteomes using hmm profile for hsp70 gene - -for proteome in {01..50} -do -~/Private/bin/bin/hmmsearch --tblout hsp70_hmmsearch_results_proteome$proteome hsp70_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta -hsp70matches=$(cat hsp70_hmmsearch_results_proteome$proteome | grep -E -v "#" | wc -l) -proteome_name=proteome$proteome -~/Private/bin/bin/hmmsearch --tblout mcrA_hmmsearch_results_proteome$proteome mcrA_hmmbuild_results1 ./proteomes/proteome_$proteome.fasta -mcrAmatches=$(cat mcrA_hmmsearch_results_proteome$proteome | grep -E -v "#" | wc -l) -echo $proteome_name, $mcrAmatches, $hsp70matches >> summary_table.csv -done - -#Titles -echo Proteome_number mcrA_matches Hsp70_matches -cat summary_table.csv| sort -t "," -k2 - -#Text File with final proteomes -cat summary_table.csv| sort -t "," -k2| tail -n -16| cut -d "," -f1 >> Proteome_Results.txt