diff --git a/bioinformaticsProject/Project1.sh b/bioinformaticsProject/Project1.sh
new file mode 100644
index 0000000..431f9eb
--- /dev/null
+++ b/bioinformaticsProject/Project1.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+## Identifying candidate pH-resistant methanogenic Archaea
+# Project by Johanna Olesk and Nihat Aliyev
+#
+# The aim of this project is to identify which of the 50 isolated microbe
+# proteomes belong to pH-resistant methanogens. This is done by looking at the
+# presence of the methyl-coenzyme M reductase (mcrA) gene, which catalyzes the
+# last step of methanogenesis, and at the number of copies of the HSP70 gene,
+# which is involved in protein biogenesis and refolding for stress resistance.
+#
+# Inputs:  the reference sequences (mcrAgene_*.fasta and hsp70gene_*.fasta in
+#          reference_sequence_path) and the proteome sequences
+#          (proteome_*.fasta in proteome_sequence_path).
+# Outputs: Results/match_count_table.csv, a table with the mcrA and hsp70
+#          match counts of each proteome, and
+#          Results/pHresistant_methanogens.txt, a text file with the names of
+#          the candidate pH-resistant methanogens.
+#
+# USAGE: bash Project1.sh reference_sequence_path proteome_sequence_path
+# A single trailing / (slash) on either path is accepted and stripped.
+#
+# muscle and HMMER are invoked through paths relative to the current working
+# directory (../muscle3.8.31_i86linux64 and ../hmmer/bin/), and Results/ is
+# created in the current working directory.
+
+set -euo pipefail
+
+readonly MUSCLE=../muscle3.8.31_i86linux64
+readonly HMMBUILD=../hmmer/bin/hmmbuild
+readonly HMMSEARCH=../hmmer/bin/hmmsearch
+readonly TABLE=Results/match_count_table.csv
+readonly CANDIDATES=Results/pHresistant_methanogens.txt
+
+# Print a message to stderr and abort.
+die() {
+  printf '%s\n' "$*" >&2
+  exit 1
+}
+
+# Print the number of hits listed in the hmmsearch --tblout file $1.
+# Only lines that start with '#' are treated as comments; every other line
+# counts as one hit.
+count_hits() {
+  awk '!/^#/ { hits++ } END { print hits + 0 }' "$1"
+}
+
+(( $# == 2 )) \
+  || die "USAGE: bash Project1.sh reference_sequence_path proteome_sequence_path"
+[[ -d "$1" ]] || die "reference sequence directory not found: $1"
+[[ -d "$2" ]] || die "proteome sequence directory not found: $2"
+ref_dir=${1%/}
+proteome_dir=${2%/}
+
+# Intermediate files go into a private temporary directory that is removed on
+# every exit path, so a failed run leaves nothing behind.
+workdir=$(mktemp -d) || die "could not create a temporary directory"
+trap 'rm -rf -- "$workdir"' EXIT
+
+mkdir -p Results # -p: re-running with an existing Results/ is not an error
+
+# 1.-3. For each gene: concatenate the reference sequences, align them with
+# muscle, and build an HMM profile from the alignment.
+for gene in mcrA hsp70; do
+  cat "$ref_dir/${gene}gene_"*.fasta > "$workdir/${gene}gene.fasta"
+  "$MUSCLE" -in "$workdir/${gene}gene.fasta" \
+    -out "$workdir/${gene}gene_aligned.fasta"
+  "$HMMBUILD" "$workdir/${gene}_profile.hmm" \
+    "$workdir/${gene}gene_aligned.fasta"
+done
+
+# Start the table with its column names.
+printf '%s\n' "proteome_ID,mcrA_gene,hsp70_gene" > "$TABLE"
+
+# 4. Search for both genes in each proteome and record the match counts.
+for proteome in "$proteome_dir"/proteome_*.fasta; do
+  [[ -e "$proteome" ]] || die "no proteome_*.fasta files found in $2"
+  "$HMMSEARCH" --tblout "$workdir/mcrA_search.txt" \
+    "$workdir/mcrA_profile.hmm" "$proteome"
+  "$HMMSEARCH" --tblout "$workdir/hsp70_search.txt" \
+    "$workdir/hsp70_profile.hmm" "$proteome"
+  mcrA_match=$(count_hits "$workdir/mcrA_search.txt")
+  hsp70_match=$(count_hits "$workdir/hsp70_search.txt")
+  # The proteome ID is the file name without its directory and extension.
+  proteome_id=${proteome##*/}
+  proteome_id=${proteome_id%.*}
+  printf '%s,%s,%s\n' "$proteome_id" "$mcrA_match" "$hsp70_match" >> "$TABLE"
+done
+
+# 5. Choose the pH-resistant methanogens: proteomes that have the mcrA gene
+# (1 or more matches) and 2 or more copies of the hsp70 gene. The comparisons
+# are numeric, and NR > 1 skips the row of column names.
+{
+  printf '%s\n' "These are the pH-resistant methanogens chosen according to the presence of the mcrA gene and the number of copies of hsp70 gene (we chose the copy number 2 or more)."
+  awk -F, 'NR > 1 && $2 >= 1 && $3 >= 2 { print $1 }' "$TABLE"
+} > "$CANDIDATES"
diff --git a/bioinformaticsProject/Results/match_count_table.csv b/bioinformaticsProject/Results/match_count_table.csv
new file mode 100644
index 0000000..2cd51ca
--- /dev/null
+++ b/bioinformaticsProject/Results/match_count_table.csv
@@ -0,0 +1,51 @@
+proteome_ID,mcrA_gene,hsp70_gene
+proteome_01,0,4
+proteome_02,0,2
+proteome_03,1,3
+proteome_04,0,4
+proteome_05,1,2
+proteome_06,0,0
+proteome_07,1,2
+proteome_08,0,5
+proteome_09,0,1
+proteome_10,0,3
+proteome_11,0,6
+proteome_12,0,6
+proteome_13,0,3
+proteome_14,0,2
+proteome_15,1,1
+proteome_16,1,1
+proteome_17,0,4
+proteome_18,0,8
+proteome_19,2,1
+proteome_20,0,3
+proteome_21,0,5
+proteome_22,0,9
+proteome_23,2,2
+proteome_24,1,2
+proteome_25,0,5
+proteome_26,0,1
+proteome_27,0,1
+proteome_28,0,1
+proteome_29,1,0
+proteome_30,0,1
+proteome_31,0,7
+proteome_32,0,4
+proteome_33,0,0
+proteome_34,0,2
+proteome_35,0,1
+proteome_36,0,3
+proteome_37,0,1
+proteome_38,1,1
+proteome_39,1,1
+proteome_40,0,2
+proteome_41,0,1
+proteome_42,1,3
+proteome_43,0,3
+proteome_44,1,1
+proteome_45,1,3
+proteome_46,0,2
+proteome_47,0,1
+proteome_48,1,1
+proteome_49,0,3
+proteome_50,1,3
diff --git a/bioinformaticsProject/Results/pHresistant_methanogens.txt b/bioinformaticsProject/Results/pHresistant_methanogens.txt
new file mode 100644
index 0000000..b155ce1
--- /dev/null
+++ b/bioinformaticsProject/Results/pHresistant_methanogens.txt
@@ -0,0 +1,8 @@
+These are the pH-resistant methanogens chosen according to the presence of the mcrA gene and the number of copies of hsp70 gene (we chose the copy number 2 or more).
+proteome_05
+proteome_07
+proteome_24
+proteome_03
+proteome_42
+proteome_45
+proteome_50