timaburton · johannaolesk · Oct 14, 2021 · Oct 14, 2021 · Oct 14, 2021 · Oct 15, 2021
diff --git a/bioinformaticsProject/Project1.sh b/bioinformaticsProject/Project1.sh
@@ -0,0 +1,54 @@
+## Identifying candidate pH-resistant methanogenic Archaea
+# Project by Johanna Olesk and Nihat Aliyev
+
+# The aim of this project is to identify which of the 50 isolated microbe proteomes belong to pH-resistant methanogens. This is done by checking for the presence of the methyl-coenzyme M reductase (mcrA) gene, which catalyzes the last step of methanogenesis, and by counting the copies of the HSP70 gene, which is involved in protein biogenesis and refolding for stress resistance.
+
+# The inputs of the code are the reference sequences together with the proteome sequences.
+# The outputs of the code are a table showing a match count for mcrA gene and hsp70 gene for each proteome, and a text file with the names of the candidate pH-resistant methanogens based on the results.
+
+# USAGE: bash Project1.sh reference_sequence_path proteome_sequence_path
+# NB! be sure that after entering the reference_sequence_path and proteome_sequence_path there is no / (slash) automatically added at the end, otherwise it will result in double // (slash) in between the directory and the file, resulting in an error
+
+mkdir -p Results # output directory for the table and the candidate list; -p keeps a re-run from erroring when it already exists
+
+# 1. concatenating the reference sequences:
+cat $1/mcrAgene_*.fasta > mcrAgene.fasta
+cat $1/hsp70gene_*.fasta > hsp70gene.fasta
+
+# 2-3. For each gene: align the concatenated reference file with muscle, then build an HMM profile from that alignment.
+for gene in mcrA hsp70
+do
+  # Both tools are invoked through fixed paths relative to the current working directory.
+  ../muscle3.8.31_i86linux64 -in "${gene}gene.fasta" -out "${gene}gene_aligned.fasta"
+  ../hmmer/bin/hmmbuild "${gene}_profile.hmm" "${gene}gene_aligned.fasta"
+done
+
+printf '%s\n' 'proteome_ID,mcrA_gene,hsp70_gene' > Results/match_count_table.csv # start the table with its header row only (truncates any previous table)
+
+# 4. Search each proteome ($2 = proteome directory) with both HMM profiles and append one row (proteome ID, mcrA hit count, hsp70 hit count) to the table.
+for proteome in "${2:?usage: bash Project1.sh reference_sequence_path proteome_sequence_path}"/proteome_*.fasta
+do
+  [ -e "$proteome" ] || continue # the glob matched nothing and was left as a literal pattern: skip it
+  ../hmmer/bin/hmmsearch --tblout mcrA_search.txt mcrA_profile.hmm "$proteome"
+  ../hmmer/bin/hmmsearch --tblout hsp70_search.txt hsp70_profile.hmm "$proteome"
+  mcrA_match=$(grep -vc '^#' mcrA_search.txt) # count hit rows: only lines STARTING with '#' are comments; a '#' inside a hit row must not hide the hit
+  hsp70_match=$(grep -vc '^#' hsp70_search.txt)
+  proteomeID=${proteome##*/}; proteomeID=${proteomeID%.*} # strip the directory part, then the file extension, leaving only the proteome ID
+  echo "$proteomeID,$mcrA_match,$hsp70_match" >> Results/match_count_table.csv
+done
+
+printf '%s\n' "These are the pH-resistant methanogens chosen according to the presence of the mcrA gene and the number of copies of hsp70 gene (we chose the copy number 2 or more)." > Results/pHresistant_methanogens.txt # (re)create the result file containing only the introductory sentence
+
+# 5. choose the pH-resistant methanogens
+cat Results/match_count_table.csv | awk -F , '$3>"1"' | awk -F , '$2=="1"' > chosen_methanogens.txt # chooses the proteomes that have mcrA gene and 2 or more copies of hsp70 gene
+cut -d , -f 1 chosen_methanogens.txt >> Results/pHresistant_methanogens.txt
+
+# Remove the intermediate files written to the working directory, one rm call per file.
+for leftover in mcrAgene.fasta hsp70gene.fasta \
+  mcrAgene_aligned.fasta hsp70gene_aligned.fasta \
+  mcrA_profile.hmm hsp70_profile.hmm \
+  mcrA_search.txt hsp70_search.txt \
+  chosen_methanogens.txt
+do
+  rm "$leftover"
+done
diff --git a/bioinformaticsProject/Results/match_count_table.csv b/bioinformaticsProject/Results/match_count_table.csv
@@ -0,0 +1,51 @@
+proteome_ID,mcrA_gene,hsp70_gene
+proteome_01,0,4
+proteome_02,0,2
+proteome_03,1,3
+proteome_04,0,4
+proteome_05,1,2
+proteome_06,0,0
+proteome_07,1,2
+proteome_08,0,5
+proteome_09,0,1
+proteome_10,0,3
+proteome_11,0,6
+proteome_12,0,6
+proteome_13,0,3
+proteome_14,0,2
+proteome_15,1,1
+proteome_16,1,1
+proteome_17,0,4
+proteome_18,0,8
+proteome_19,2,1
+proteome_20,0,3
+proteome_21,0,5
+proteome_22,0,9
+proteome_23,2,2
+proteome_24,1,2
+proteome_25,0,5
+proteome_26,0,1
+proteome_27,0,1
+proteome_28,0,1
+proteome_29,1,0
+proteome_30,0,1
+proteome_31,0,7
+proteome_32,0,4
+proteome_33,0,0
+proteome_34,0,2
+proteome_35,0,1
+proteome_36,0,3
+proteome_37,0,1
+proteome_38,1,1
+proteome_39,1,1
+proteome_40,0,2
+proteome_41,0,1
+proteome_42,1,3
+proteome_43,0,3
+proteome_44,1,1
+proteome_45,1,3
+proteome_46,0,2
+proteome_47,0,1
+proteome_48,1,1
+proteome_49,0,3
+proteome_50,1,3
diff --git a/bioinformaticsProject/Results/pHresistant_methanogens.txt b/bioinformaticsProject/Results/pHresistant_methanogens.txt
@@ -0,0 +1,8 @@
+These are the pH-resistant methanogens chosen according to the presence of the mcrA gene and the number of copies of hsp70 gene (we chose the copy number 2 or more).
+proteome_05
+proteome_07
+proteome_24
+proteome_03
+proteome_42
+proteome_45
+proteome_50